4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
24 import parser.parse_dns
# File names of the CSV lists that are handed to create_device_list().
DEVICE_MAC_LIST = "devicelist.dat"
EXCLUSION_MAC_LIST = "exclusion.dat"
# Column names supplied to csv.DictReader when those lists are read.
COLUMN_MAC = "MAC_address"
COLUMN_DEVICE_NAME = "device_name"

# Keys used to drill down into each packet object of the JSON input.
JSON_KEY_SOURCE = "_source"
JSON_KEY_LAYERS = "layers"
JSON_KEY_FRAME = "frame"
JSON_KEY_FRAME_PROTOCOLS = "frame.protocols"
JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"
JSON_KEY_FRAME_LENGTH = "frame.len"

JSON_KEY_ETH_SRC = "eth.src"
JSON_KEY_ETH_DST = "eth.dst"

JSON_KEY_IP_SRC = "ip.src"
JSON_KEY_IP_DST = "ip.dst"
# NOTE(review): JSON_KEY_ETH, JSON_KEY_IP, JSON_KEY_UDP and JSON_KEY_TCP are
# referenced further down (place_in_graph, parse_json) but their definitions
# are not visible in this part of the file - confirm they are defined.
47 # List of checked protocols
# parse_json iterates over this list for every packet
# (`for prot in listchkprot`).
# NOTE(review): only the first entry of the list literal is visible here; the
# remaining entries and the closing bracket are missing from this extract.
48 listchkprot = [ "arp",
56 # Switch to generate graph that only shows local communication
# Read by place_in_graph: when True, a packet whose source and destination
# IPs do not both match the local-address pattern reaches the "skip device
# to cloud communication" check there.
57 ONLY_INCLUDE_LOCAL_COMMUNICATION = True
def create_device_list(dev_list_file):
    """Create a MAC-address -> device-name dictionary from a CSV file.

    The column names are supplied to csv.DictReader explicitly, so the
    first row of the file is read as data, not as a header.

    Args:
        dev_list_file: CSV file path that contains list of device MAC
            addresses (first column) and device names (second column)

    Returns:
        dict that maps each MAC address to its device name; empty when the
        file has no rows.
    """
    dev_list = dict()
    # Open the device MAC list file
    with open(dev_list_file) as csvfile:
        mac_list = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        # Fill the key-value dictionary directly while reading the rows
        for item in mac_list:
            dev_list[item[COLUMN_MAC]] = item[COLUMN_DEVICE_NAME]
    # NOTE(review): the return statement is not visible in the reviewed
    # source; returning the dictionary is what the callers rely on
    # (`dev_list[eth_src]`, `eth_src in exc_list`).
    return dev_list
def traverse_and_merge_nodes(G, dev_list_file):
    """Merge leaf nodes that have similar properties, e.g. same protocols.

    Only leaves (outer nodes) are merged, never nodes that are in the
    middle/have many neighbors. A node is a merge candidate when it
      (1) has exactly one neighbor, and
      (2) is not a smarthome device.
    Candidates are grouped by "<neighbor>-<protocols of the connecting edge>".
    The very first node seen for a group is kept; every later node of the
    same group is appended to the kept node's comma-separated 'Merged'
    attribute and collapsed into it.

    Args:
        G: a complete networkx graph
        dev_list_file: CSV file path that contains list of device MAC addresses

    Returns:
        G, modified in place.
    """
    # Create list of smarthome devices from the file the caller passed in
    # (previously the argument was ignored in favour of DEVICE_MAC_LIST).
    dev_list = create_device_list(dev_list_file)
    # Maps "<neighbor>-<protocols>" to the first leaf seen with that key
    node_to_merge = dict()
    # Traverse a snapshot of the nodes so that removing nodes below cannot
    # disturb the iteration.
    for node in list(G.nodes()):
        # Skip if the node is a smarthome device
        if node in dev_list:
            continue
        neighbors = G[node]
        # Skip if the node has many neighbors (non-leaf) or no neighbor at all.
        # Compare by value: `is not 1` tests object identity.
        if len(neighbors) != 1:
            continue
        # The single neighbor; next(iter(...)) works for dicts and views alike
        neighbor = next(iter(neighbors))
        protocols = G[node][neighbor]['Protocol']
        # Store neighbor-protocol as key in dictionary
        neigh_proto = neighbor + "-" + protocols
        if neigh_proto not in node_to_merge:
            node_to_merge[neigh_proto] = node
            continue
        # Merge this node if there is already an entry
        node_to_merge_with = node_to_merge[neigh_proto]
        merged_nodes = G.node[node_to_merge_with].get('Merged', '')
        if merged_nodes == '':
            # This is the first node merged into the kept one
            merged_nodes = node
        else:
            # Put comma if there is already one or more nodes
            merged_nodes += ", " + node
        # Then attach as attribute
        G.node[node_to_merge_with]['Merged'] = merged_nodes
        # NOTE(review): the statements that drop the collapsed node and
        # return the graph are not visible in the reviewed source; they are
        # inferred from the docstring ("we collapse that node") and from the
        # commented-out caller `G = traverse_and_merge_nodes(...)` - confirm.
        G.remove_node(node)
    return G
# Matches only addresses that *start* with 192.168. (dots escaped, anchored),
# so a public address such as 54.192.168.10 is no longer classified as local.
# Compiled once instead of once per packet.
_LOCAL_IP_RE = re.compile(r'^192\.168\.[0-9.]+')

# Integer values used for tagging nodes, indicating to Gephi if they are local
# IoT devices or web servers.
# NOTE(review): the assignments of these two tags are not visible in the
# reviewed source; 0 (remote) / 1 (local) are assumed - confirm.
_REMOTE_NODE = 0
_LOCAL_NODE = 1


def _add_endpoint_node(G, mac, ip, is_local, peer_mac, timestamp,
                       device_dns_mappings, dev_list):
    """Add the node for one end of a packet to G and return its node id."""
    if is_local:
        # Local devices are identified by MAC and labeled with their name
        G.add_node(mac, Name=dev_list[mac], islocal=_LOCAL_NODE)
        return mac
    hostname = None
    # If this end is not local, the other end (peer_mac) is the IoT device,
    # so that device's DNS mappings are the ones to consult.
    if peer_mac in device_dns_mappings:
        hostname = device_dns_mappings[peer_mac].hostname_for_ip_at_time(ip, timestamp)
    # Use IP if no hostname mapping.
    # NOTE(review): the exact "no mapping" test is not visible in the
    # reviewed source; any falsy lookup result is treated as missing.
    if not hostname:
        hostname = ip
    # Non-smarthome devices can be merged later
    G.add_node(hostname, Merged='', islocal=_REMOTE_NODE)
    return hostname


def place_in_graph(G, eth_src, eth_dst, device_dns_mappings, dev_list, layers,
                   edge_to_prot, edge_to_vol):
    """Place the nodes and the edge for one packet on the graph.

    The per-edge protocol set and traffic volume are updated first; the
    packet is then skipped when ONLY_INCLUDE_LOCAL_COMMUNICATION is set and
    either end is not a local (192.168.x.x) address.

    Args:
        G: the complete graph (modified in place)
        eth_src: MAC address of source
        eth_dst: MAC address of destination
        device_dns_mappings: device to DNS mappings (data structure)
        dev_list: list of existing smarthome devices
        layers: layers of JSON file structure
        edge_to_prot: edge to protocols mappings (modified in place)
        edge_to_vol: edge to traffic volume mappings (modified in place)
    """
    frame = layers[JSON_KEY_FRAME]
    # Get timestamp of packet (router's timestamp)
    timestamp = Decimal(frame[JSON_KEY_FRAME_TIME_EPOCH])
    packet_len = Decimal(frame[JSON_KEY_FRAME_LENGTH])
    # frame.protocols is a ':'-separated chain. Short chains are reduced to
    # their last entry, longer ones to their 4th and 5th entries.
    split_protocol = frame[JSON_KEY_FRAME_PROTOCOLS].split(':')
    if len(split_protocol) < 5:
        protocol = split_protocol[-1]
    else:
        protocol = split_protocol[3] + ":" + split_protocol[4]
    # Key to search in the dictionary is <src-mac-address>-<dst-mac_address>
    dict_key = eth_src + "-" + eth_dst
    protocols = edge_to_prot.setdefault(dict_key, set())
    protocols.add(protocol)
    # Sorted so that the same protocol set always yields the same string
    protocols_str = ', '.join(sorted(protocols))
    # Check packet length and accumulate to get traffic volume
    edge_to_vol[dict_key] = edge_to_vol.get(dict_key, 0) + packet_len
    volume = str(edge_to_vol[dict_key])
    # And source and destination IPs
    ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC]
    ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST]
    # Categorize source and destination IP addresses: local vs. non-local
    src_is_local = _LOCAL_IP_RE.match(ip_src) is not None
    dst_is_local = _LOCAL_IP_RE.match(ip_dst) is not None

    # Skip device to cloud communication if we are interested in the local graph.
    # TODO should this go before the protocol dict is changed?
    if ONLY_INCLUDE_LOCAL_COMMUNICATION and not (src_is_local and dst_is_local):
        return

    # Place nodes and edges
    src_node = _add_endpoint_node(G, eth_src, ip_src, src_is_local, eth_dst,
                                  timestamp, device_dns_mappings, dev_list)
    dst_node = _add_endpoint_node(G, eth_dst, ip_dst, dst_is_local, eth_src,
                                  timestamp, device_dns_mappings, dev_list)
    G.add_edge(src_node, dst_node, Protocol=protocols_str, Volume=volume)
# parse_json reads the JSON file at file_path twice: once through
# parser.parse_dns.parse_json_dns() to collect per-device DNS mappings, and
# once more to hand each packet's layers to place_in_graph(). Per the inline
# comments, traffic that is neither UDP nor TCP is skipped and packets whose
# source or destination MAC is in the exclusion list are left out (with a
# printed warning). The __main__ block passes the result to nx.write_gexf(),
# so the function is expected to return the graph.
# NOTE(review): the embedded original line numbers skip (246->249, 251->253,
# 255->257, 260->263->269, 290->292, ...), so the loop headers, the
# `continue`/`return` statements, the body of the `listchkprot` loop and the
# creation of G and edge_to_vol are not visible here; the code is therefore
# left untouched. The print statements are Python 2 syntax.
236 def parse_json(file_path):
237 """ Parse JSON file and create graph
239 file_path: path to the JSON file
241 # Create a smart home device list
242 dev_list = create_device_list(DEVICE_MAC_LIST)
243 # Create an exclusion list
244 exc_list = create_device_list(EXCLUSION_MAC_LIST)
245 # First parse the file once, constructing a map that contains information about individual devices' DNS resolutions.
246 device_dns_mappings = parser.parse_dns.parse_json_dns(file_path) # "./json/eth1.dump.json"
249 # Mapping from edge to a set of protocols
250 edge_to_prot = dict()
251 # Mapping from edge to traffic volume
253 # Parse file again, this time constructing a graph of device<->server and device<->device communication.
254 with open(file_path) as jf:
255 # Read JSON; data becomes reference to root JSON object (or in our case json array)
257 # Loop through json objects (packets) in data
259 # p is a JSON object, not an index - drill down to object containing data from the different layers
260 layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
263 for prot in listchkprot:
269 # Skip any non udp/non tcp traffic
270 if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
273 # Fetch source and destination MACs
274 eth = layers.get(JSON_KEY_ETH, None)
276 print "[ WARNING: eth data not found ]"
278 eth_src = eth.get(JSON_KEY_ETH_SRC, None)
279 eth_dst = eth.get(JSON_KEY_ETH_DST, None)
280 # Exclude devices in the exclusion list
281 if eth_src in exc_list:
282 print "[ WARNING: Source ", eth_src, " is excluded from graph! ]"
284 if eth_dst in exc_list:
285 print "[ WARNING: Destination ", eth_dst, " is excluded from graph! ]"
288 # Place nodes and edges in graph
289 place_in_graph(G, eth_src, eth_dst, device_dns_mappings, dev_list, layers,
290 edge_to_prot, edge_to_vol)
292 # Print DNS mapping for reference
293 #for mac in device_dns_mappings:
294 # ddm = device_dns_mappings[mac]
295 # ddm.print_mappings()
# ------------------------------------------------------
# Not currently used.
# Might be useful later on if we wish to resolve IPs.
def get_domain(host):
    """Return the domain and suffix of *host* joined as "<domain>.<suffix>".

    Args:
        host: host name; converted with str() before it is handed to
            tldextract.extract()
    """
    ext_result = tldextract.extract(str(host))
    # Be consistent with ReCon and keep suffix
    domain = ext_result.domain + "." + ext_result.suffix
    # NOTE(review): no return statement is visible in the reviewed source;
    # returning the computed value is inferred from the function name.
    return domain
311 socket.inet_aton(addr)
315 # ------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: build the graph from the JSON input and write it
    # out in Graph Exchange XML format.
    if len(sys.argv) < 3:
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        # Stop here: continuing would index sys.argv out of range.
        # NOTE(review): the exit statement is not visible in the reviewed
        # source; a non-zero status for a usage error is assumed.
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Contract nodes that have the same properties, i.e. same protocols
    #G = traverse_and_merge_nodes(G, DEVICE_MAC_LIST)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)