4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
24 import parser.parse_dns
# CSV files read by parse_json through create_device_list: the known devices and the MACs to exclude.
# Both are relative paths, so they are resolved against the current working directory.
27 DEVICE_MAC_LIST = "devicelist.dat"
28 EXCLUSION_MAC_LIST = "exclusion.dat"
# Field names that create_device_list assigns to the two columns of those CSV files.
29 COLUMN_MAC = "MAC_address"
30 COLUMN_DEVICE_NAME = "device_name"
# Keys used by parse_json to reach the per-packet "layers" object and the frame metadata inside it.
32 JSON_KEY_SOURCE = "_source"
33 JSON_KEY_LAYERS = "layers"
34 JSON_KEY_FRAME = "frame"
35 JSON_KEY_FRAME_PROTOCOLS = "frame.protocols"
36 JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"
# Keys of the source/destination MAC addresses inside the Ethernet layer object.
38 JSON_KEY_ETH_SRC = "eth.src"
39 JSON_KEY_ETH_DST = "eth.dst"
# Keys of the source/destination IP addresses inside the IP layer object.
41 JSON_KEY_IP_SRC = "ip.src"
42 JSON_KEY_IP_DST = "ip.dst"
46 # List of checked protocols
47 listchkprot = [ "arp",
def create_device_list(dev_list_file):
    """Build a MAC-address-to-device-name mapping from a CSV file.

    Every row is read as two header-less columns: the MAC address
    (COLUMN_MAC) followed by the device name (COLUMN_DEVICE_NAME).
    When the same MAC address appears on several rows, the last row wins.

    Args:
        dev_list_file: CSV file path that contains list of device MAC addresses

    Returns:
        A dict mapping each MAC address to the device name on the same row.
    """
    with open(dev_list_file) as csvfile:
        # The column names are passed explicitly, so the first row of the
        # file is treated as data rather than as a header.
        rows = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        # Build the mapping straight from the reader; no intermediate list
        # of rows is needed.
        return {row[COLUMN_MAC]: row[COLUMN_DEVICE_NAME] for row in rows}
78 def parse_json(file_path):
# Build the communication graph for the packets stored in the JSON file at file_path.
# The file is read twice: once by parser.parse_dns.parse_json_dns to collect DNS mappings,
# and once here to add nodes and edges to the graph G.
# The script entry point stores this function's result in G and hands it to nx.write_gexf.
80 # Create a smart home device list
81 dev_list = create_device_list(DEVICE_MAC_LIST)
82 # Create an exclusion list
83 exc_list = create_device_list(EXCLUSION_MAC_LIST)
85 # First parse the file once, constructing a map that contains information about individual devices' DNS resolutions.
86 device_dns_mappings = parser.parse_dns.parse_json_dns(file_path) # "./json/eth1.dump.json"
90 # Mapping from node to a set of protocols
93 # Parse file again, this time constructing a graph of device<->server and device<->device communication.
94 with open(file_path) as jf:
96 # data becomes reference to root JSON object (or in our case json array)
99 # Loop through json objects (packets) in data
101 # p is a JSON object, not an index
102 # Drill down to object containing data from the different layers
103 layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
106 for prot in listchkprot:
112 # Skip any non udp/non tcp traffic
113 if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
116 # Fetch source and destination MACs
# .get() yields None instead of raising when the Ethernet layer is absent.
117 eth = layers.get(JSON_KEY_ETH, None)
119 print "[ WARNING: eth data not found ]"
121 eth_src = eth.get(JSON_KEY_ETH_SRC, None)
122 eth_dst = eth.get(JSON_KEY_ETH_DST, None)
123 # Exclude devices in the exclusion list
# exc_list is keyed by MAC address, so these are membership tests on the MACs.
124 if eth_src in exc_list:
125 print "[ WARNING: Source ", eth_src, " is excluded from graph! ]"
127 if eth_dst in exc_list:
128 print "[ WARNING: Destination ", eth_dst, " is excluded from graph! ]"
131 # Fetch timestamp of packet (router's timestamp)
# Decimal keeps the epoch value exactly as written in the JSON (no float rounding).
132 timestamp = Decimal(layers[JSON_KEY_FRAME][JSON_KEY_FRAME_TIME_EPOCH])
133 # Get the protocol and strip just the name of it
134 long_protocol = layers[JSON_KEY_FRAME][JSON_KEY_FRAME_PROTOCOLS]
135 # Split once starting from the end of the string and get it
# NOTE(review): [1] raises IndexError when frame.protocols contains no ':' separator.
136 protocol = long_protocol.rsplit(':', 1)[1]
137 print "timestamp: ", timestamp, "\n"
139 # Store protocol into the set (source)
142 if eth_src not in node2prot:
143 node2prot[eth_src] = set()
144 src_protocols = node2prot[eth_src]
145 src_protocols.add(protocol)
# Joining a set: the order of the protocol names in the resulting string is not guaranteed.
146 src_protocols_str = ', '.join(src_protocols)
147 print "source protocols: ", src_protocols_str, "\n"
148 # Store protocol into the set (destination)
149 if eth_dst not in node2prot:
150 node2prot[eth_dst] = set()
151 dst_protocols = node2prot[eth_dst]
152 dst_protocols.add(protocol)
153 dst_protocols_str = ', '.join(dst_protocols)
154 print "destination protocols: ", dst_protocols_str, "\n"
155 # And source and destination IPs
# NOTE(review): direct indexing - a packet without this layer would raise KeyError unless filtered out earlier; confirm.
156 ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC]
157 ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST]
159 # Categorize source and destination IP addresses: local vs. non-local
# Only addresses matching the 192.168 pattern count as local.
# NOTE(review): the dots are unescaped, so each '.' matches any character, and search()
# accepts a match anywhere in the address string, not only at its start.
160 ipre = re.compile(r'\b192.168.[0-9.]+')
161 src_is_local = ipre.search(ip_src)
162 dst_is_local = ipre.search(ip_dst)
163 print "ip.src =", ip_src, "ip.dst =", ip_dst, "\n"
# NOTE(review): dev_list[eth_src] raises KeyError for a MAC missing from the device list unless guarded; confirm.
168 G.add_node(eth_src, Name=dev_list[eth_src], Protocol=src_protocols_str)
172 # Check first if the key (eth_dst) exists in the dictionary
173 if eth_dst in device_dns_mappings:
174 # If the source is not local, then it's inbound traffic, and hence the eth_dst is the MAC of the IoT device.
175 hostname = device_dns_mappings[eth_dst].hostname_for_ip_at_time(ip_src, timestamp)
177 # Use IP if no hostname mapping
179 G.add_node(hostname, Protocol=src_protocols_str)
# NOTE(review): same KeyError risk as above for dev_list[eth_dst]; confirm.
183 G.add_node(eth_dst, Name=dev_list[eth_dst], Protocol=dst_protocols_str)
187 # Check first if the key (eth_src) exists in the dictionary
188 if eth_src in device_dns_mappings:
189 # If the destination is not local, then it's outbound traffic, and hence the eth_src is the MAC of the IoT device.
190 hostname = device_dns_mappings[eth_src].hostname_for_ip_at_time(ip_dst, timestamp)
192 # Use IP if no hostname mapping
194 G.add_node(hostname, Protocol=dst_protocols_str)
# Connect the two endpoints chosen above for this packet.
196 G.add_edge(src_node, dst_node)
198 # Print DNS mapping for reference
199 for mac in device_dns_mappings:
200 ddm = device_dns_mappings[mac]
206 # ------------------------------------------------------
207 # Not currently used.
208 # Might be useful later on if we wish to resolve IPs.
209 def get_domain(host):
# Reduce host to "<domain>.<suffix>" using tldextract; only those two parts of the result are kept.
# host is converted with str() first, so non-string inputs are accepted.
210 ext_result = tldextract.extract(str(host))
211 # Be consistent with ReCon and keep suffix
# NOTE(review): when ext_result.suffix is empty the result ends with a trailing ".".
212 domain = ext_result.domain + "." + ext_result.suffix
217 socket.inet_aton(addr)
221 # ------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: read the packet JSON named by the first argument and
    # write the resulting graph to the file named by the second argument.
    if len(sys.argv) < 3:
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        # Stop here: without both arguments the sys.argv lookups below would
        # raise IndexError. A non-zero status reports the usage error.
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)