Adding range-based detection (improved the results for Nest Thermostat and Arlo Camera).
[pingpong.git] / Code / Projects / PacketLevelSignatureExtractor / src / main / java / edu / uci / iotproject / detection / layer3 / Layer3ClusterMatcher.java
1 package edu.uci.iotproject.detection.layer3;
2
3 import edu.uci.iotproject.detection.AbstractClusterMatcher;
4 import edu.uci.iotproject.detection.ClusterMatcherObserver;
5 import edu.uci.iotproject.trafficreassembly.layer3.Conversation;
6 import edu.uci.iotproject.trafficreassembly.layer3.TcpReassembler;
7 import edu.uci.iotproject.analysis.TcpConversationUtils;
8 import edu.uci.iotproject.io.PcapHandleReader;
9 import edu.uci.iotproject.util.PrintUtils;
10 import org.pcap4j.core.*;
11
12 import java.time.ZoneId;
13 import java.util.*;
14 import java.util.stream.Collectors;
15
16 import static edu.uci.iotproject.util.PcapPacketUtils.*;
17
18 /**
 * Searches a traffic trace for sequences of packets that "belong to" a given cluster (in other words, attempts to
 * classify traffic as pertaining to a given cluster).
21  *
22  * @author Janus Varmarken {@literal <jvarmark@uci.edu>}
23  * @author Rahmadi Trimananda {@literal <rtrimana@uci.edu>}
24  */
25 public class Layer3ClusterMatcher extends AbstractClusterMatcher implements PacketListener {
26
    /**
     * Test client. The entire body is currently commented out, so invoking this method does nothing.
     *
     * NOTE(review): the disabled code constructs the matcher with three arguments (signature, router WAN IP,
     * observer), whereas the constructor declared in this class additionally takes {@code isRangeBased} and
     * {@code eps}; the snippet must be updated before it is re-enabled.
     *
     * @param args Command line arguments (unused).
     */
    public static void main(String[] args) throws PcapNativeException, NotOpenException {

//        String path = "/scratch/July-2018"; // Rahmadi
//        String path = "/Users/varmarken/temp/UCI IoT Project/experiments"; // Janus
//        final String inputPcapFile = path + "/2018-07/dlink/dlink.wlan1.local.pcap";
//        final String signatureFile = path + "/2018-07/dlink/offSignature1.sig";
//
//        List<List<PcapPacket>> signature = PrintUtils.deserializeClustersFromFile(signatureFile);
//        Layer3ClusterMatcher clusterMatcher = new Layer3ClusterMatcher(signature, null,
//                (sig, match) -> System.out.println(
//                        String.format("[ !!! SIGNATURE DETECTED AT %s !!! ]",
//                                match.get(0).getTimestamp().atZone(ZoneId.of("America/Los_Angeles")))
//                )
//        );
//
//        PcapHandle handle;
//        try {
//            handle = Pcaps.openOffline(inputPcapFile, PcapHandle.TimestampPrecision.NANO);
//        } catch (PcapNativeException pne) {
//            handle = Pcaps.openOffline(inputPcapFile);
//        }
//        PcapHandleReader reader = new PcapHandleReader(handle, p -> true, clusterMatcher);
//        reader.readFromHandle();
//        clusterMatcher.performDetection();
    }
53
    /**
     * The ordered directions of packets in the sequences that make up {@link #mCluster}. Computed once in the
     * constructor from the first member of the cluster passed in, assuming that cluster was captured within the
     * local network.
     */
    private final Conversation.Direction[] mClusterMemberDirections;

    /**
     * For reassembling the observed traffic into TCP connections. Every packet handed to
     * {@link #gotPacket(PcapPacket)} is forwarded to it, and the detection methods iterate over the conversations it
     * has assembled.
     */
    private final TcpReassembler mTcpReassembler = new TcpReassembler();

    /**
     * IP of the router's WAN port (if analyzed traffic is captured at the ISP's point of view). Used when packet
     * directions of the examined traffic have to be computed.
     */
    private final String mRouterWanIp;

    /**
     * Range-based vs. strict matching. When {@code false}, the constructor additionally verifies that all cluster
     * members exhibit the same packet direction pattern.
     */
    private final boolean mRangeBased;

    /**
     * Epsilon value used by the DBSCAN algorithm; it is used again for range-based matching here, where it widens the
     * accepted packet length interval on both the lower and the upper side.
     */
    private final double mEps;
78
79     /**
80      * Create a {@link Layer3ClusterMatcher}.
81      * @param cluster The cluster that traffic is matched against.
82      * @param routerWanIp The router's WAN IP if examining traffic captured at the ISP's point of view (used for
83      *                    determining the direction of packets).
84      * @param isRangeBased The boolean that decides if it is range-based vs. strict matching.
85      * @param detectionObservers Client code that wants to get notified whenever the {@link Layer3ClusterMatcher} detects that
86      *                          (a subset of) the examined traffic is similar to the traffic that makes up
87      *                          {@code cluster}, i.e., when the examined traffic is classified as pertaining to
88      *                          {@code cluster}.
89      */
90     public Layer3ClusterMatcher(List<List<PcapPacket>> cluster, String routerWanIp, boolean isRangeBased, double eps,
91                                 ClusterMatcherObserver... detectionObservers) {
92         super(cluster, isRangeBased);
93         Objects.requireNonNull(detectionObservers, "detectionObservers cannot be null");
94         for (ClusterMatcherObserver obs : detectionObservers) {
95             addObserver(obs);
96         }
97         // Build the cluster members' direction sequence.
98         // Note: assumes that the provided cluster was captured within the local network (routerWanIp is set to null).
99         mClusterMemberDirections = getPacketDirections(cluster.get(0), null);
100         /*
101          * Enforce restriction on cluster members: all representatives must exhibit the same direction pattern and
102          * contain the same number of packets. Note that this is a somewhat heavy operation, so it may be disabled later
103          * on in favor of performance. However, it is only run once (at instantiation), so the overhead may be warranted
104          * in order to ensure correctness, especially during the development/debugging phase.
105          */
106         mRangeBased = isRangeBased;
107         if (!mRangeBased) {    // Only when it is not range-based
108             if (mCluster.stream().
109                     anyMatch(inner -> !Arrays.equals(mClusterMemberDirections, getPacketDirections(inner, null)))) {
110                 throw new IllegalArgumentException(
111                         "cluster members must contain the same number of packets and exhibit the same packet direction " +
112                                 "pattern"
113                 );
114             }
115         }
116         mEps = eps;
117         mRouterWanIp = routerWanIp;
118     }
119
    /**
     * Receives a packet and forwards it to the TCP reassembler. No matching is performed here; detection runs
     * separately over the conversations held by the reassembler.
     *
     * @param packet The packet to add to the reassembled traffic.
     */
    @Override
    public void gotPacket(PcapPacket packet) {
        // Present packet to TCP reassembler so that it can be mapped to a connection (if it is a TCP packet).
        mTcpReassembler.gotPacket(packet);
    }
125
    /**
     * Get the cluster that describes the packet sequence that this {@link Layer3ClusterMatcher} is searching for.
     * The {@code mCluster} reference is returned as-is (no copy is made).
     *
     * @return the cluster that describes the packet sequence that this {@link Layer3ClusterMatcher} is searching for.
     */
    public List<List<PcapPacket>> getCluster() {
        return mCluster;
    }
133
134     public void performDetectionRangeBased() {
135         /*
136          * Let's start out simple by building a version that only works for signatures that do not span across multiple
137          * TCP conversations...
138          */
139         for (Conversation c : mTcpReassembler.getTcpConversations()) {
140             if (c.isTls() && c.getTlsApplicationDataPackets().isEmpty() || !c.isTls() && c.getPackets().isEmpty()) {
141                 // Skip empty conversations.
142                 continue;
143             }
144             List<PcapPacket> lowerBound = mCluster.get(0);
145             List<PcapPacket> upperBound = mCluster.get(1);
146             if (isTlsSequence(lowerBound) != c.isTls() || isTlsSequence(upperBound) != c.isTls()) {
147                 // We consider it a mismatch if one is a TLS application data sequence and the other is not.
148                 continue;
149             }
150             // Fetch set of packets to examine based on TLS or not.
151             List<PcapPacket> cPkts = c.isTls() ? c.getTlsApplicationDataPackets() : c.getPackets();
152             Optional<List<PcapPacket>> match;
153             while ((match = findSubsequenceInSequence(lowerBound, upperBound, cPkts, mClusterMemberDirections, null)).
154                     isPresent()) {
155                 List<PcapPacket> matchSeq = match.get();
156                 // Notify observers about the match.
157                 mObservers.forEach(o -> o.onMatch(Layer3ClusterMatcher.this, matchSeq));
158                 /*
159                  * Get the index in cPkts of the last packet in the sequence of packets that matches the searched
160                  * signature sequence.
161                  */
162                 int matchSeqEndIdx = cPkts.indexOf(matchSeq.get(matchSeq.size() - 1));
163                 // We restart the search for the signature sequence immediately after that index, so truncate cPkts.
164                 cPkts = cPkts.stream().skip(matchSeqEndIdx + 1).collect(Collectors.toList());
165             }
166         }
167     }
168
169     public void performDetectionConservative() {
170         /*
171          * Let's start out simple by building a version that only works for signatures that do not span across multiple
172          * TCP conversations...
173          */
174         for (Conversation c : mTcpReassembler.getTcpConversations()) {
175             if (c.isTls() && c.getTlsApplicationDataPackets().isEmpty() || !c.isTls() && c.getPackets().isEmpty()) {
176                 // Skip empty conversations.
177                 continue;
178             }
179             for (List<PcapPacket> signatureSequence : mCluster) {
180                 if (isTlsSequence(signatureSequence) != c.isTls()) {
181                     // We consider it a mismatch if one is a TLS application data sequence and the other is not.
182                     continue;
183                 }
184                 // Fetch set of packets to examine based on TLS or not.
185                 List<PcapPacket> cPkts = c.isTls() ? c.getTlsApplicationDataPackets() : c.getPackets();
186                 /*
187                  * Note: we embed the attempt to detect the signature sequence in a loop in order to capture those cases
188                  * where the same signature sequence appears multiple times in one Conversation.
189                  *
190                  * Note: since we expect all sequences that together make up the signature to exhibit the same direction
191                  * pattern, we can simply pass the precomputed direction array for the signature sequence so that it
192                  * won't have to be recomputed internally in each call to findSubsequenceInSequence().
193                  */
194                 Optional<List<PcapPacket>> match;
195                 while ((match = findSubsequenceInSequence(signatureSequence, cPkts, mClusterMemberDirections, null)).
196                         isPresent()) {
197                     List<PcapPacket> matchSeq = match.get();
198                     // Notify observers about the match.
199                     mObservers.forEach(o -> o.onMatch(Layer3ClusterMatcher.this, matchSeq));
200                     /*
201                      * Get the index in cPkts of the last packet in the sequence of packets that matches the searched
202                      * signature sequence.
203                      */
204                     int matchSeqEndIdx = cPkts.indexOf(matchSeq.get(matchSeq.size() - 1));
205                     // We restart the search for the signature sequence immediately after that index, so truncate cPkts.
206                     cPkts = cPkts.stream().skip(matchSeqEndIdx + 1).collect(Collectors.toList());
207                 }
208             }
209
210             /*
211              * TODO:
212              * if no item in cluster matches, also perform a distance-based matching to cover those cases where we did
213              * not manage to capture every single mutation of the sequence during training.
214              *
215              * Need to compute average/centroid of cluster to do so...? Compute within-cluster variance, then check if
216              * distance between input conversation and cluster average/centroid is smaller than or equal to the computed
217              * variance?
218              */
219         }
220     }
221
222     /**
223      * Checks if {@code sequence} is a sequence of TLS packets. Note: the current implementation relies on inspection
224      * of the port numbers when deciding between TLS vs. non-TLS. Therefore, only the first packet of {@code sequence}
225      * is examined as it is assumed that all packets in {@code sequence} pertain to the same {@link Conversation} and
226      * hence share the same set of two src/dst port numbers (albeit possibly alternating between which one is the src
227      * and which one is the dst, as packets in {@code sequence} may be in alternating directions).
228      * @param sequence The sequence of packets for which it is to be determined if it is a sequence of TLS packets or
229      *                 non-TLS packets.
230      * @return {@code true} if {@code sequence} is a sequence of TLS packets, {@code false} otherwise.
231      */
232     private boolean isTlsSequence(List<PcapPacket> sequence) {
233         // NOTE: Assumes ALL packets in sequence pertain to the same TCP connection!
234         PcapPacket firstPkt = sequence.get(0);
235         int srcPort = getSourcePort(firstPkt);
236         int dstPort = getDestinationPort(firstPkt);
237         return TcpConversationUtils.isTlsPort(srcPort) || TcpConversationUtils.isTlsPort(dstPort);
238     }
239
240     /**
241      * Examine if a given sequence of packets ({@code sequence}) contains a given shorter sequence of packets
242      * ({@code subsequence}). Note: the current implementation actually searches for a substring as it does not allow
243      * for interleaving packets in {@code sequence} that are not in {@code subsequence}; for example, if
244      * {@code subsequence} consists of packet lengths [2, 3, 5] and {@code sequence} consists of  packet lengths
245      * [2, 3, 4, 5], the result will be that there is no match (because of the interleaving 4). If we are to allow
246      * interleaving packets, we need a modified version of
247      * <a href="https://stackoverflow.com/a/20545604/1214974">this</a>.
248      *
249      * @param subsequence The sequence to search for.
250      * @param sequence The sequence to search.
251      * @param subsequenceDirections The directions of packets in {@code subsequence} such that for all {@code i},
252      *                              {@code subsequenceDirections[i]} is the direction of the packet returned by
253      *                              {@code subsequence.get(i)}. May be set to {@code null}, in which this call will
254      *                              internally compute the packet directions.
255      * @param sequenceDirections The directions of packets in {@code sequence} such that for all {@code i},
256      *                           {@code sequenceDirections[i]} is the direction of the packet returned by
257      *                           {@code sequence.get(i)}. May be set to {@code null}, in which this call will internally
258      *                           compute the packet directions.
259      *
260      * @return An {@link Optional} containing the part of {@code sequence} that matches {@code subsequence}, or an empty
261      *         {@link Optional} if no part of {@code sequence} matches {@code subsequence}.
262      */
263     private Optional<List<PcapPacket>> findSubsequenceInSequence(List<PcapPacket> subsequence,
264                                                                  List<PcapPacket> sequence,
265                                                                  Conversation.Direction[] subsequenceDirections,
266                                                                  Conversation.Direction[] sequenceDirections) {
267         if (sequence.size() < subsequence.size()) {
268             // If subsequence is longer, it cannot be contained in sequence.
269             return Optional.empty();
270         }
271         if (isTlsSequence(subsequence) != isTlsSequence(sequence)) {
272             // We consider it a mismatch if one is a TLS application data sequence and the other is not.
273             return Optional.empty();
274         }
275         // If packet directions have not been precomputed by calling code, we need to construct them.
276         if (subsequenceDirections == null) {
277             subsequenceDirections = getPacketDirections(subsequence, mRouterWanIp);
278         }
279         if (sequenceDirections == null) {
280             sequenceDirections = getPacketDirections(sequence, mRouterWanIp);
281         }
282         int subseqIdx = 0;
283         int seqIdx = 0;
284         while (seqIdx < sequence.size()) {
285             PcapPacket subseqPkt = subsequence.get(subseqIdx);
286             PcapPacket seqPkt = sequence.get(seqIdx);
287             // We only have a match if packet lengths and directions match.
288             if (subseqPkt.getOriginalLength() == seqPkt.getOriginalLength() &&
289                     subsequenceDirections[subseqIdx] == sequenceDirections[seqIdx]) {
290                 // A match; advance both indices to consider next packet in subsequence vs. next packet in sequence.
291                 subseqIdx++;
292                 seqIdx++;
293                 if (subseqIdx == subsequence.size()) {
294                     // We managed to match the entire subsequence in sequence.
295                     // Return the sublist of sequence that matches subsequence.
296                     /*
297                      * TODO:
298                      * ASSUMES THE BACKING LIST (i.e., 'sequence') IS _NOT_ STRUCTURALLY MODIFIED, hence may not work
299                      * for live traces!
300                      */
301                     return Optional.of(sequence.subList(seqIdx - subsequence.size(), seqIdx));
302                 }
303             } else {
304                 // Mismatch.
305                 if (subseqIdx > 0) {
306                     /*
307                      * If we managed to match parts of subsequence, we restart the search for subsequence in sequence at
308                      * the index of sequence where the current mismatch occurred. I.e., we must reset subseqIdx, but
309                      * leave seqIdx untouched.
310                      */
311                     subseqIdx = 0;
312                 } else {
313                     /*
314                      * First packet of subsequence didn't match packet at seqIdx of sequence, so we move forward in
315                      * sequence, i.e., we continue the search for subsequence in sequence starting at index seqIdx+1 of
316                      * sequence.
317                      */
318                     seqIdx++;
319                 }
320             }
321         }
322         return Optional.empty();
323     }
324
325     /**
326      * Overloading the method {@code findSubsequenceInSequence} for range-based matching. Instead of a sequence,
327      * we have sequences of lower and upper bounds.
328      *
329      * @param lowerBound The lower bound of the sequence we search for.
330      * @param upperBound The upper bound of the sequence we search for.
331      * @param subsequenceDirections The directions of packets in {@code subsequence} such that for all {@code i},
332      *                              {@code subsequenceDirections[i]} is the direction of the packet returned by
333      *                              {@code subsequence.get(i)}. May be set to {@code null}, in which this call will
334      *                              internally compute the packet directions.
335      * @param sequenceDirections The directions of packets in {@code sequence} such that for all {@code i},
336      *                           {@code sequenceDirections[i]} is the direction of the packet returned by
337      *                           {@code sequence.get(i)}. May be set to {@code null}, in which this call will internally
338      *                           compute the packet directions.
339      *
340      * @return An {@link Optional} containing the part of {@code sequence} that matches {@code subsequence}, or an empty
341      *         {@link Optional} if no part of {@code sequence} matches {@code subsequence}.
342      */
343     private Optional<List<PcapPacket>> findSubsequenceInSequence(List<PcapPacket> lowerBound,
344                                                                  List<PcapPacket> upperBound,
345                                                                  List<PcapPacket> sequence,
346                                                                  Conversation.Direction[] subsequenceDirections,
347                                                                  Conversation.Direction[] sequenceDirections) {
348         // Just do the checks for either lower or upper bound!
349         // TODO: For now we use just the lower bound
350         if (sequence.size() < lowerBound.size()) {
351             // If subsequence is longer, it cannot be contained in sequence.
352             return Optional.empty();
353         }
354         if (isTlsSequence(lowerBound) != isTlsSequence(sequence)) {
355             // We consider it a mismatch if one is a TLS application data sequence and the other is not.
356             return Optional.empty();
357         }
358         // If packet directions have not been precomputed by calling code, we need to construct them.
359         if (subsequenceDirections == null) {
360             subsequenceDirections = getPacketDirections(lowerBound, mRouterWanIp);
361         }
362         if (sequenceDirections == null) {
363             sequenceDirections = getPacketDirections(sequence, mRouterWanIp);
364         }
365         int subseqIdx = 0;
366         int seqIdx = 0;
367         while (seqIdx < sequence.size()) {
368             PcapPacket lowBndPkt = lowerBound.get(subseqIdx);
369             PcapPacket upBndPkt = upperBound.get(subseqIdx);
370             PcapPacket seqPkt = sequence.get(seqIdx);
371             // We only have a match if packet lengths and directions match.
372             // The packet lengths have to be in the range of [lowerBound - eps, upperBound+eps]
373             // TODO: Maybe we could do better here for the double to integer conversion?
374             int epsLowerBound = lowBndPkt.length() - (int) mEps;
375             int epsUpperBound = upBndPkt.length() + (int) mEps;
376             if (epsLowerBound <= seqPkt.getOriginalLength() &&
377                     seqPkt.getOriginalLength() <= epsUpperBound &&
378                     subsequenceDirections[subseqIdx] == sequenceDirections[seqIdx]) {
379                 // A match; advance both indices to consider next packet in subsequence vs. next packet in sequence.
380                 subseqIdx++;
381                 seqIdx++;
382                 if (subseqIdx == lowerBound.size()) {
383                     // We managed to match the entire subsequence in sequence.
384                     // Return the sublist of sequence that matches subsequence.
385                     /*
386                      * TODO:
387                      * ASSUMES THE BACKING LIST (i.e., 'sequence') IS _NOT_ STRUCTURALLY MODIFIED, hence may not work
388                      * for live traces!
389                      */
390                     return Optional.of(sequence.subList(seqIdx - lowerBound.size(), seqIdx));
391                 }
392             } else {
393                 // Mismatch.
394                 if (subseqIdx > 0) {
395                     /*
396                      * If we managed to match parts of subsequence, we restart the search for subsequence in sequence at
397                      * the index of sequence where the current mismatch occurred. I.e., we must reset subseqIdx, but
398                      * leave seqIdx untouched.
399                      */
400                     subseqIdx = 0;
401                 } else {
402                     /*
403                      * First packet of subsequence didn't match packet at seqIdx of sequence, so we move forward in
404                      * sequence, i.e., we continue the search for subsequence in sequence starting at index seqIdx+1 of
405                      * sequence.
406                      */
407                     seqIdx++;
408                 }
409             }
410         }
411         return Optional.empty();
412     }
413
414     /**
415      * Given a cluster, produces a pruned version of that cluster. In the pruned version, there are no duplicate cluster
416      * members. Two cluster members are considered identical if their packets lengths and packet directions are
417      * identical. The resulting pruned cluster is unmodifiable (this applies to both the outermost list as well as the
418      * nested lists) in order to preserve its integrity when exposed to external code (e.g., through
419      * {@link #getCluster()}).
420      *
421      * @param cluster A cluster to prune.
422      * @return The resulting pruned cluster.
423      */
424     @Override
425     protected List<List<PcapPacket>> pruneCluster(List<List<PcapPacket>> cluster) {
426         List<List<PcapPacket>> prunedCluster = new ArrayList<>();
427         for (List<PcapPacket> originalClusterSeq : cluster) {
428             boolean alreadyPresent = false;
429             for (List<PcapPacket> prunedClusterSeq : prunedCluster) {
430                 Optional<List<PcapPacket>> duplicate = findSubsequenceInSequence(originalClusterSeq, prunedClusterSeq,
431                         mClusterMemberDirections, mClusterMemberDirections);
432                 if (duplicate.isPresent()) {
433                     alreadyPresent = true;
434                     break;
435                 }
436             }
437             if (!alreadyPresent) {
438                 prunedCluster.add(Collections.unmodifiableList(originalClusterSeq));
439             }
440         }
441         return Collections.unmodifiableList(prunedCluster);
442     }
443
444     /**
445      * Given a {@code List<PcapPacket>}, generate a {@code Conversation.Direction[]} such that each entry in the
446      * resulting {@code Conversation.Direction[]} specifies the direction of the {@link PcapPacket} at the corresponding
447      * index in the input list.
448      * @param packets The list of packets for which to construct a corresponding array of packet directions.
449      * @param routerWanIp The IP of the router's WAN port. This is used for determining the direction of packets when
450      *                    the traffic is captured just outside the local network (at the ISP side of the router). Set to
451      *                    {@code null} if {@code packets} stem from traffic captured within the local network.
452      * @return A {@code Conversation.Direction[]} specifying the direction of the {@link PcapPacket} at the
453      *         corresponding index in {@code packets}.
454      */
455     private static Conversation.Direction[] getPacketDirections(List<PcapPacket> packets, String routerWanIp) {
456         Conversation.Direction[] directions = new Conversation.Direction[packets.size()];
457         for (int i = 0; i < packets.size(); i++) {
458             PcapPacket pkt = packets.get(i);
459             if (getSourceIp(pkt).equals(getDestinationIp(pkt))) {
460                 // Sanity check: we shouldn't be processing loopback traffic
461                 throw new AssertionError("loopback traffic detected");
462             }
463             if (isSrcIpLocal(pkt) || getSourceIp(pkt).equals(routerWanIp)) {
464                 directions[i] = Conversation.Direction.CLIENT_TO_SERVER;
465             } else if (isDstIpLocal(pkt) || getDestinationIp(pkt).equals(routerWanIp)) {
466                 directions[i] = Conversation.Direction.SERVER_TO_CLIENT;
467             } else {
468                 //throw new IllegalArgumentException("no local IP or router WAN port IP found, can't detect direction");
469             }
470         }
471         return directions;
472     }
473
474 }