"""
Script that constructs a graph in which hosts are nodes.
An edge between two hosts indicates that the hosts communicate.
Hosts are labeled and identified by their IPs.
The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.

Update per February 2, 2018:
Extension of base_gefx_generator.py.
This script constructs a bipartite graph with IoT devices on one side and Internet hosts on the other side.
As a result, this graph does NOT show inter IoT device communication.

The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.

This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
It serves as a baseline for future scripts that want to include more information in the graph.
"""
27 import parser.parse_dns
29 from networkx.algorithms import bipartite
# CSV inputs: the smart home device list and the exclusion list.
# Both are read by create_device_list() as (MAC address, device name) rows.
DEVICE_MAC_LIST = "devicelist.dat"
EXCLUSION_MAC_LIST = "exclusion.dat"
COLUMN_MAC = "MAC_address"
COLUMN_DEVICE_NAME = "device_name"

# Keys used to drill into each packet object of the tshark JSON input.
JSON_KEY_SOURCE = "_source"
JSON_KEY_LAYERS = "layers"
JSON_KEY_FRAME = "frame"
JSON_KEY_FRAME_PROTOCOLS = "frame.protocols"
JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"
JSON_KEY_FRAME_LENGTH = "frame.len"
# NOTE(review): JSON_KEY_ETH, JSON_KEY_IP, JSON_KEY_UDP and JSON_KEY_TCP are
# referenced by parse_json()/place_in_graph() but their definitions were not
# visible when this block was reviewed. The values below follow the naming of
# the sibling keys ("eth.src", "ip.src", "ipv6") - confirm against the input.
JSON_KEY_ETH = "eth"
JSON_KEY_ETH_SRC = "eth.src"
JSON_KEY_ETH_DST = "eth.dst"
JSON_KEY_IPV6 = "ipv6"
JSON_KEY_IP = "ip"
JSON_KEY_IP_SRC = "ip.src"
JSON_KEY_IP_DST = "ip.dst"
JSON_KEY_UDP = "udp"
JSON_KEY_TCP = "tcp"
53 # List of checked protocols
54 listchkprot = [ "arp",
# Switch to generate a graph that only shows local communication.
# When True, place_in_graph() skips every packet whose source and destination
# IPs are not both classified as local.
ONLY_INCLUDE_LOCAL_COMMUNICATION = False
def create_device_list(dev_list_file):
    """ Create a MAC-address-to-name lookup for smart home devices from a CSV file

        The file is read without a header row: every row is interpreted as
        (COLUMN_MAC, COLUMN_DEVICE_NAME). If a MAC address appears in more
        than one row, the last row wins.

        Args:
            dev_list_file: CSV file path that contains list of device MAC addresses

        Returns:
            dict that maps each MAC address to its device name
    """
    # Open the device MAC list file; the file is closed as soon as it is read
    with open(dev_list_file) as csvfile:
        mac_list = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        # Create key-value dictionary directly from the rows
        return {row[COLUMN_MAC]: row[COLUMN_DEVICE_NAME] for row in mac_list}
def traverse_and_merge_nodes(G, dev_list_file):
    """ Merge nodes that have similar properties, e.g. same protocols
        But, we only do this for leaves (outer nodes), and not for
        nodes that are in the middle/have many neighbors.
        The pre-condition is that the node:
        (1) only has one neighbor, and
        (2) not a smarthome device.
        then we compare the edges, whether they use the same protocols
        or not. If yes, then we collapse that node and we attach
        it to the very first node that uses that set of protocols.
        The collapsed node's name is appended to the 'Merged' attribute
        (a comma-separated string) of the node it is merged into.

        Args:
            G: a complete networkx graph
            dev_list_file: CSV file path that contains list of device MAC addresses

        Returns:
            the same graph object G, modified in place
    """
    # Create list of smarthome devices (from the file the caller passed in)
    dev_list = create_device_list(dev_list_file)
    # Maps "<neighbor>-<protocols>" to the first leaf seen with that combination
    node_to_merge = dict()
    # Traverse a snapshot of the nodes, since nodes are removed along the way
    for node in list(G.nodes()):
        # Skip if the node is a smarthome device
        if node in dev_list:
            continue
        neighbors = G[node]
        # Skip if the node has many neighbors (non-leaf) or no neighbor at all
        if len(neighbors) != 1:
            continue
        # next(iter(...)) works on both dicts and networkx adjacency views
        neighbor = next(iter(neighbors))
        protocols = G[node][neighbor]['Protocol']
        # Store neighbor-protocol as key in dictionary
        neigh_proto = neighbor + "-" + protocols
        if neigh_proto not in node_to_merge:
            # First leaf with this neighbor/protocol combination: keep it
            node_to_merge[neigh_proto] = node
            continue
        # Merge this node since there is already an entry.
        # NOTE(review): the removal step was not visible when this block was
        # reviewed; it is reconstructed from "we collapse that node" above.
        G.remove_node(node)
        node_to_merge_with = node_to_merge[neigh_proto]
        merged_nodes = G.nodes[node_to_merge_with]['Merged']
        if merged_nodes == '':
            # This is the first node merged into the kept node
            merged_nodes = node
        else:
            # Put comma if there is already one or more nodes
            merged_nodes += ", " + node
        # Then attach as attribute
        G.nodes[node_to_merge_with]['Merged'] = merged_nodes

    return G
# Matches IPs in the local 192.168.1.x range. The dots are escaped so that
# e.g. 192.168.100.7 is not mistaken for a local address.
_LOCAL_IP_RE = re.compile(r'\b192\.168\.1\.[0-9.]+')


def _add_endpoint_node(G, is_local, own_mac, peer_mac, ip, timestamp,
                       device_dns_mappings, dev_list):
    """ Add the node for one endpoint of a packet and return its node key

        A local endpoint is keyed by its MAC address. A non-local endpoint is
        keyed by the hostname that the peer device resolved for the IP, or by
        the IP itself if there is no such mapping.
    """
    # Integer values used for tagging nodes, indicating to Gephi if they are local IoT devices or web servers.
    # NOTE(review): the two 'islocal' values were not visible when this block
    # was reviewed; 0/1 is assumed - confirm.
    remote_node = 0
    local_node = 1
    # Values for the 'bipartite' attribute of a node when constructing the bipartite graph
    # NOTE(review): bipartite_iot was not visible either; 0 is assumed as the
    # counterpart of bipartite_web_server = 1 - confirm.
    bipartite_iot = 0
    bipartite_web_server = 1
    if is_local:
        G.add_node(own_mac, Name=dev_list[own_mac], islocal=local_node, bipartite=bipartite_iot)
        return own_mac
    hostname = None
    # The DNS mappings are kept per device, so look them up by the MAC of the
    # peer (the IoT device that talks to this non-local endpoint).
    if peer_mac in device_dns_mappings:
        hostname = device_dns_mappings[peer_mac].hostname_for_ip_at_time(ip, timestamp)
    if hostname is None:
        # Use IP if no hostname mapping
        hostname = ip
    # Non-smarthome devices can be merged later
    G.add_node(hostname, Merged='', islocal=remote_node, bipartite=bipartite_web_server)
    return hostname


def place_in_graph(G, eth_src, eth_dst, device_dns_mappings, dev_list, layers,
                   edge_to_prot, edge_to_vol):
    """ Place nodes and edges on the graph

        The protocol set and traffic volume of the <ip.src>-<ip.dst> pair are
        updated for every packet, also for packets that are then skipped
        because ONLY_INCLUDE_LOCAL_COMMUNICATION is set.

        Args:
            G: the complete graph
            eth_src: MAC address of source
            eth_dst: MAC address of destination
            device_dns_mappings: device to DNS mappings (data structure)
            dev_list: list of existing smarthome devices
            layers: layers of JSON file structure
            edge_to_prot: edge to protocols mappings
            edge_to_vol: edge to traffic volume mappings
    """
    frame = layers[JSON_KEY_FRAME]
    # Get timestamp of packet (router's timestamp)
    timestamp = Decimal(frame[JSON_KEY_FRAME_TIME_EPOCH])
    # Get packet length
    packet_len = Decimal(frame[JSON_KEY_FRAME_LENGTH])
    # Get the protocol and strip just the name of it: the last element for
    # short protocol stacks, otherwise the 4th and 5th elements joined by ':'
    split_protocol = frame[JSON_KEY_FRAME_PROTOCOLS].split(':')
    if len(split_protocol) < 5:
        protocol = split_protocol[-1]
    else:
        protocol = split_protocol[3] + ":" + split_protocol[4]
    # And source and destination IPs
    ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC]
    ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST]
    # Categorize source and destination IP addresses: local vs. non-local
    src_is_local = _LOCAL_IP_RE.search(ip_src)
    dst_is_local = _LOCAL_IP_RE.search(ip_dst)
    # Key to search in the dictionaries is <src-ip-address>-<dst-ip-address>
    dict_key = ip_src + "-" + ip_dst
    # Store protocol into the set of this edge
    protocols = edge_to_prot.setdefault(dict_key, set())
    protocols.add(protocol)
    # Sorted, so that equal protocol sets always give the same string; the
    # string is later compared when leaf nodes are merged.
    protocols_str = ', '.join(sorted(protocols))
    # Check packet length and accumulate to get traffic volume
    edge_to_vol[dict_key] = edge_to_vol.get(dict_key, 0) + packet_len
    volume = str(edge_to_vol[dict_key])
    # Skip device to cloud communication if we are interested in the local graph.
    # TODO should this go before the protocol dict is changed?
    if ONLY_INCLUDE_LOCAL_COMMUNICATION and not (src_is_local and dst_is_local):
        return
    # Place nodes and edges.
    # If the source is not local, then it's inbound traffic, and hence the
    # eth_dst is the MAC of the IoT device (and vice versa for the destination).
    src_node = _add_endpoint_node(G, src_is_local, eth_src, eth_dst, ip_src,
                                  timestamp, device_dns_mappings, dev_list)
    dst_node = _add_endpoint_node(G, dst_is_local, eth_dst, eth_src, ip_dst,
                                  timestamp, device_dns_mappings, dev_list)
    G.add_edge(src_node, dst_node, Protocol=protocols_str, Volume=volume)
def parse_json(file_path):
    """ Parse JSON file and create graph

        The file is read twice: once to build the per-device DNS mappings and
        once to place the packets in the graph. Packets are skipped if they
        use one of the protocols in listchkprot, are neither UDP nor TCP, have
        no eth data, involve a MAC address from the exclusion list, or have no
        IP layer.

        Args:
            file_path: path to the JSON file

        Returns:
            the constructed networkx graph
    """
    # Local import: keeps this function usable regardless of the module's import block
    import json

    # Create a smart home device list
    dev_list = create_device_list(DEVICE_MAC_LIST)
    # Create an exclusion list
    exc_list = create_device_list(EXCLUSION_MAC_LIST)
    # First parse the file once, constructing a map that contains information about individual devices' DNS resolutions.
    device_dns_mappings = parser.parse_dns.parse_json_dns(file_path)
    # Init empty graph
    # NOTE(review): the graph constructor was not visible when this block was
    # reviewed; a directed graph is assumed - confirm.
    G = nx.DiGraph()
    # Mapping from edge to a set of protocols
    edge_to_prot = dict()
    # Mapping from edge to traffic volume
    edge_to_vol = dict()
    # Parse file again, this time constructing a graph of device<->server and device<->device communication.
    with open(file_path) as jf:
        # Read JSON; data becomes reference to root JSON object (or in our case json array)
        data = json.load(jf)
    # Loop through json objects (packets) in data
    for p in data:
        # p is a JSON object, not an index - drill down to object containing data from the different layers
        layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
        # Skip packets of the checked protocols
        # NOTE(review): the body of this check was not visible; a match on the
        # layer names is assumed - confirm.
        if any(prot in layers for prot in listchkprot):
            continue
        # Skip any non udp/non tcp traffic
        if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
            continue
        # Fetch source and destination MACs
        eth = layers.get(JSON_KEY_ETH, None)
        if eth is None:
            print("[ WARNING: eth data not found ]")
            continue
        eth_src = eth.get(JSON_KEY_ETH_SRC, None)
        eth_dst = eth.get(JSON_KEY_ETH_DST, None)
        # Exclude devices in the exclusion list
        if eth_src in exc_list:
            print("[ WARNING: Source  %s  is excluded from graph! ]" % eth_src)
            continue
        if eth_dst in exc_list:
            print("[ WARNING: Destination  %s  is excluded from graph! ]" % eth_dst)
            continue
        # Exclude if IP does not exist in layers - this usually means IPv6.
        # place_in_graph() reads the IP layer unconditionally, so packets
        # without it are skipped whether or not an IPv6 layer is present.
        if JSON_KEY_IP not in layers:
            continue
        # Place nodes and edges in graph
        place_in_graph(G, eth_src, eth_dst, device_dns_mappings, dev_list, layers,
                       edge_to_prot, edge_to_vol)

    return G
# ------------------------------------------------------
# Not currently used.
# Might be useful later on if we wish to resolve IPs.
def get_domain(host):
    """ Reduce a host name to "<domain>.<suffix>" using tldextract

        Args:
            host: host name; it is converted with str() before extraction

        Returns:
            the domain and suffix reported by tldextract, joined by a dot
    """
    ext_result = tldextract.extract(str(host))
    # Be consistent with ReCon and keep suffix
    return ext_result.domain + "." + ext_result.suffix
326 socket.inet_aton(addr)
330 # ------------------------------------------------------
if __name__ == '__main__':
    # Entry point: build the graph from the JSON input, merge equivalent leaf
    # nodes, and write the result as a GEXF file.
    if len(sys.argv) < 3:
        # Single-string print calls behave the same on Python 2 and Python 3
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        # Stop here: without both arguments the argv lookups below would fail
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Contract nodes that have the same properties, i.e. same protocols
    G = traverse_and_merge_nodes(G, DEVICE_MAC_LIST)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)