From: rtrimana Date: Thu, 9 Nov 2017 16:51:16 +0000 (-0800) Subject: Establishing basic flow for the complete graph processing X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=d7f125b817d20937c580b48286c712491223e878;p=pingpong.git Establishing basic flow for the complete graph processing --- diff --git a/base_gefx_generator.py b/base_gefx_generator.py index aa27905..4031e0c 100644 --- a/base_gefx_generator.py +++ b/base_gefx_generator.py @@ -38,13 +38,16 @@ JSON_KEY_ETH_DST = "eth.dst" JSON_KEY_IP = "ip" JSON_KEY_IP_SRC = "ip.src" JSON_KEY_IP_DST = "ip.dst" +# Checked protocols JSON_KEY_UDP = "udp" JSON_KEY_TCP = "tcp" -JSON_KEY_MDNS = "mdns" -JSON_KEY_BOOTP = "bootp" -JSON_KEY_SSDP = "ssdp" -JSON_KEY_DHCPV6 = "dhcpv6" -JSON_KEY_LLMNR = "llmnr" +# List of checked protocols +listchkprot = [ "bootp", + "dhcpv6", + "dns", + "llmnr", + "mdns", + "ssdp" ] def parse_json(file_path): @@ -66,7 +69,7 @@ def parse_json(file_path): # First parse the file once, constructing a map that contains information about individual devices' DNS resolutions. device_dns_mappings = parser.parse_dns.parse_json_dns(file_path) # "./json/eth1.dump.json" - + # Init empty graph G = nx.DiGraph() # Parse file again, this time constructing a graph of device<->server and device<->device communication. @@ -74,31 +77,19 @@ def parse_json(file_path): # Read JSON. # data becomes reference to root JSON object (or in our case json array) data = json.load(jf) + # Loop through json objects (packets) in data for p in data: # p is a JSON object, not an index # Drill down to object containing data from the different layers layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS] - # Skip all MDNS traffic. - if JSON_KEY_MDNS in layers: - continue - - # Skip all LLMNR traffic. - if JSON_KEY_LLMNR in layers: - continue - - # Skip all SSDP traffic - we don't care about disovery, only the actual communication. - if JSON_KEY_SSDP in layers: - continue - - # Skip all bootp traffic (DHCP related) - if JSON_KEY_BOOTP in layers: - continue - - # Skip DHCPv6 for now. - if JSON_KEY_DHCPV6 in layers: - continue + iscontinue = False + for prot in listchkprot: + if prot in layers: + iscontinue = True + if iscontinue: + continue # Skip any non udp/non tcp traffic if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers: @@ -118,22 +109,23 @@ def parse_json(file_path): ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC] ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST] - # ipre = re.compile(r'\b192.168.[0-9.]+') - # src_is_local = ipre.search(ip_src) - # dst_is_local = ipre.search(ip_dst) + # Categorize source and destination IP addresses: local vs. non-local + ipre = re.compile(r'\b192.168.[0-9.]+') + src_is_local = ipre.search(ip_src) + dst_is_local = ipre.search(ip_dst) print "ip.src =", ip_src, "ip.dst =", ip_dst - src_is_local = ip_src.startswith("192.168.") - dst_is_local = ip_dst.startswith("192.168.") src_node = None dst_node = None - if src_is_local: G.add_node(eth_src, Name=devlist[eth_src]) src_node = eth_src else: - # If the source is not local, then it's inbound traffic, and hence the eth_dst is the MAC of the IoT device. - hostname = device_dns_mappings[eth_dst].hostname_for_ip_at_time(ip_src, packet_timestamp) + hostname = None + # Check first if the key (eth_dst) exists in the dictionary + if eth_dst in device_dns_mappings: + # If the source is not local, then it's inbound traffic, and hence the eth_dst is the MAC of the IoT device. + hostname = device_dns_mappings[eth_dst].hostname_for_ip_at_time(ip_src, packet_timestamp) if hostname is None: # Use IP if no hostname mapping hostname = ip_src @@ -143,8 +135,11 @@ def parse_json(file_path): G.add_node(eth_dst, Name=devlist[eth_src]) dst_node = eth_dst else: - # If the destination is not local, then it's outbound traffic, and hence the eth_src is the MAC of the IoT device. - hostname = device_dns_mappings[eth_src].hostname_for_ip_at_time(ip_dst, packet_timestamp) + hostname = None + # Check first if the key (eth_dst) exists in the dictionary + if eth_src in device_dns_mappings: + # If the destination is not local, then it's outbound traffic, and hence the eth_src is the MAC of the IoT device. + hostname = device_dns_mappings[eth_src].hostname_for_ip_at_time(ip_dst, packet_timestamp) if hostname is None: # Use IP if no hostname mapping hostname = ip_dst @@ -152,58 +147,11 @@ def parse_json(file_path): dst_node = hostname G.add_edge(src_node, dst_node) -# # Traffic can be both outbound and inbound. -# # Determine which one of the two by looking up device MAC in DNS map. -# iot_device = None -# if eth_src in device_dns_mappings: -# iot_device = eth_src -# elif eth_dst in device_dns_mappings: -# iot_device = eth_dst -# else: -# # print "[ WARNING: DNS mapping not found for device with MAC", eth_src, "OR", eth_dst, "]" -# # This must be local communication between two IoT devices OR an IoT device talking to a hardcoded IP. -# # For now let's assume local communication. -# # Add a node for each device and an edge between them. -# G.add_node(eth_src, Name=devlist[eth_src]) -# G.add_node(eth_dst, Name=devlist[eth_src]) -# G.add_edge(eth_src, eth_dst) -# # TODO add regex check on src+dst IP to figure out if hardcoded server IP (e.g. check if one of the two are NOT a 192.168.x.y IP) -# continue -# # It is outbound traffic if iot_device matches src, otherwise it must be inbound traffic. -# outbound_traffic = iot_device == eth_src - - - -# ''' Graph construction ''' -# # No need to check if the Nodes and/or Edges we add already exist: -# # NetworkX won't add already existing nodes/edges (except in the case of a MultiGraph or MultiDiGraph (see NetworkX doc)). - -# # Add a node for each host. -# # First add node for IoT device. -# G.add_node(iot_device, Name=devlist[eth_src]) -# # Then add node for the server. -# # For this we need to distinguish between outbound and inbound traffic so that we look up the proper IP in our DNS map. -# # For outbound traffic, the server's IP is the destination IP. -# # For inbound traffic, the server's IP is the source IP. - -# server_ip = ip_dst if outbound_traffic else ip_src -# hostname = device_dns_mappings[iot_device].hostname_for_ip_at_time(server_ip, packet_timestamp) -# if hostname is None: -# # TODO this can occur when two local devices communicate OR if IoT device has hardcoded server IP. -# # However, we only get here for the DNS that have not performed any DNS lookups -# # We should use a regex check early in the loop to see if it is two local devices communicating. -# # This way we would not have to consider these corner cases later on. -# # print "[ WARNING: no ip-hostname mapping found for ip", server_ip, " -- adding eth.src->eth.dst edge, but note that this may be incorrect if IoT device has hardcoded server IP ]" -# G.add_node(eth_src, Name=devlist[eth_src]) -# G.add_node(eth_dst, Name=devlist[eth_src]) -# G.add_edge(eth_src, eth_dst) -# continue -# G.add_node(hostname) -# # Connect the two nodes we just added. -# if outbound_traffic: -# G.add_edge(iot_device, hostname) -# else: -# G.add_edge(hostname, iot_device) + # Print DNS mapping for reference + for mac in device_dns_mappings: + ddm = device_dns_mappings[mac] + ddm.print_mappings() + return G # ------------------------------------------------------ diff --git a/devicelist.dat b/devicelist.dat index 4f054fd..87eef4d 100644 --- a/devicelist.dat +++ b/devicelist.dat @@ -24,3 +24,4 @@ b0:b9:8a:73:69:8e, RouterPort_Bridge-LAN b0:b9:8a:73:69:8f, RouterPort_ETH1 b0:b9:8a:73:69:90, RouterPort_WLAN0 b0:b9:8a:73:69:91, RouterPort_WLAN1 +74:da:38:0d:05:55, RaspberryPi_Controller diff --git a/extract_from_tshark.py b/extract_from_tshark.py deleted file mode 100644 index 5704a97..0000000 --- a/extract_from_tshark.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/python - -""" -Script used to extract only the needed information from JSON packet traces generated by -tshark from PCAPNG format -""" - -import os, sys -import json -import uuid - -from collections import OrderedDict - -json_key_source = "_source" -json_key_layers = "layers" - -json_key_ip = "ip" -json_key_tcp = "tcp" - -json_key_http = "http" -json_key_method = "method" -json_key_uri = "uri" -json_key_headers = "headers" -json_key_host = "host" - -json_key_http_req = json_key_http + ".request." -json_key_http_req_method = json_key_http_req + json_key_method -json_key_http_req_uri = json_key_http_req + json_key_uri -json_key_http_req_line = json_key_http_req + "line" - -json_key_pkt_comment = "pkt_comment" - -json_key_frame = "frame" -json_key_frame_num = json_key_frame + ".number" -json_key_frame_comment = json_key_frame + ".comment" -json_key_frame_ts = json_key_frame + ".time_epoch" - - -JSON_KEY_ETH = "eth" -JSON_KEY_ETH_SRC = "eth.src" -JSON_KEY_ETH_DST = "eth.dst" - - -def make_unique(key, dct): - counter = 0 - unique_key = key - - while unique_key in dct: - counter += 1 - unique_key = '{}_{}'.format(key, counter) - return unique_key - - -def parse_object_pairs(pairs): - dct = OrderedDict() - for key, value in pairs: - if key in dct: - key = make_unique(key, dct) - dct[key] = value - - return dct - -def change_file(fpath): - for fn in os.listdir(fpath): - full_path = fpath + '/' + fn - - # Recursively go through all directories - if os.path.isdir(full_path): - change_file(full_path) - continue - - print full_path - with open(full_path, "r+") as jf: - # Since certain json 'keys' appear multiple times in our data, we have to make them - # unique first (we can't use regular json.load() or we lose some data points). From: - # https://stackoverflow.com/questions/29321677/python-json-parser-allow-duplicate-keys - decoder = json.JSONDecoder(object_pairs_hook=parse_object_pairs) - pcap_data = decoder.decode(jf.read()) - - # Prepare new data structure for re-formatted JSON storage - data = {} - for packet in pcap_data: - layers = packet[json_key_source][json_key_layers] - - # All captured traffic should have a frame + frame number, but check anyway - frame_num = " Frame: " - if json_key_frame not in layers or json_key_frame_num not in layers[json_key_frame]: - print "WARNING: could not find frame number! Using -1..." - frame_num = frame_num + "-1" - else: - # Save frame number for error-reporting - frame_num = frame_num + layers[json_key_frame][json_key_frame_num] - - # All captured traffic should be IP, but check anyway - if not json_key_ip in layers: - print "WARNING: Non-IP traffic detected!" + frame_num - continue - - # For now, focus on HTTP only - if json_key_tcp not in layers or json_key_http not in layers: - continue - - # Fill our new JSON packet with TCP/IP info - new_packet = {} - new_packet["dst_ip"] = layers[json_key_ip][json_key_ip + ".dst"] - new_packet["dst_port"] = int(layers[json_key_tcp][json_key_tcp + ".dstport"]) - - # JV: Also include src so we can see what device initiates the traffic - new_packet["src_ip"] = layers[json_key_ip][json_key_ip + ".src"] - new_packet["src_port"] = int(layers[json_key_tcp][json_key_tcp + ".srcport"]) - #JV: Also include eth soure/destination info so that we can map traffic to physical device using MAC - new_packet[JSON_KEY_ETH_SRC] = layers[JSON_KEY_ETH][JSON_KEY_ETH_SRC] - new_packet[JSON_KEY_ETH_DST] = layers[JSON_KEY_ETH][JSON_KEY_ETH_DST] - - # Go through all HTTP fields and extract the ones that are needed - http_data = layers[json_key_http] - for http_key in http_data: - http_value = http_data[http_key] - - if http_key.startswith(json_key_http_req_line): - header_line = http_value.split(":", 1) - if len(header_line) != 2: - print ("WARNING: could not parse header '" + str(header_line) + "'" - + frame_num) - continue - - # Prepare container for HTTP headers - if json_key_headers not in new_packet: - new_packet[json_key_headers] = {} - - # Use lower case for header keys to stay consistent with our other data - header_key = header_line[0].lower() - - # Remove the trailing carriage return - header_val = header_line[1].strip() - - # Save the header key-value pair - new_packet[json_key_headers][header_key] = header_val - - # If this is the host header, we also save it to the main object - if header_key == json_key_host: - new_packet[json_key_host] = header_val - - if json_key_http_req_method in http_value: - new_packet[json_key_method] = http_value[json_key_http_req_method] - if json_key_http_req_uri in http_value: - new_packet[json_key_uri] = http_value[json_key_http_req_uri] - - # End of HTTP parsing - - # Check that we found the minimum needed HTTP headers - if (json_key_uri not in new_packet or json_key_method not in new_packet or - json_key_host not in new_packet): - print "Missing some HTTP Headers!" + frame_num - continue - - # Extract timestamp - if json_key_frame_ts not in layers[json_key_frame]: - print "WARNING: could not find timestamp!" + frame_num - continue - - new_packet["ts"] = layers[json_key_frame][json_key_frame_ts] - - # Create a unique key for each packet to keep consistent with ReCon - # Also good in case packets end up in different files - data[str(uuid.uuid4())] = new_packet - - # Write the new data - #print json.dumps(data, sort_keys=True, indent=4) - jf.seek(0) - jf.write(json.dumps(data, sort_keys=True, indent=4)) - jf.truncate() - -if __name__ == '__main__': - # Needed to re-use some JSON keys - change_file(sys.argv[1]) \ No newline at end of file diff --git a/origin/extract_from_tshark.py b/origin/extract_from_tshark.py new file mode 100644 index 0000000..5704a97 --- /dev/null +++ b/origin/extract_from_tshark.py @@ -0,0 +1,176 @@ +#!/usr/bin/python + +""" +Script used to extract only the needed information from JSON packet traces generated by +tshark from PCAPNG format +""" + +import os, sys +import json +import uuid + +from collections import OrderedDict + +json_key_source = "_source" +json_key_layers = "layers" + +json_key_ip = "ip" +json_key_tcp = "tcp" + +json_key_http = "http" +json_key_method = "method" +json_key_uri = "uri" +json_key_headers = "headers" +json_key_host = "host" + +json_key_http_req = json_key_http + ".request." +json_key_http_req_method = json_key_http_req + json_key_method +json_key_http_req_uri = json_key_http_req + json_key_uri +json_key_http_req_line = json_key_http_req + "line" + +json_key_pkt_comment = "pkt_comment" + +json_key_frame = "frame" +json_key_frame_num = json_key_frame + ".number" +json_key_frame_comment = json_key_frame + ".comment" +json_key_frame_ts = json_key_frame + ".time_epoch" + + +JSON_KEY_ETH = "eth" +JSON_KEY_ETH_SRC = "eth.src" +JSON_KEY_ETH_DST = "eth.dst" + + +def make_unique(key, dct): + counter = 0 + unique_key = key + + while unique_key in dct: + counter += 1 + unique_key = '{}_{}'.format(key, counter) + return unique_key + + +def parse_object_pairs(pairs): + dct = OrderedDict() + for key, value in pairs: + if key in dct: + key = make_unique(key, dct) + dct[key] = value + + return dct + +def change_file(fpath): + for fn in os.listdir(fpath): + full_path = fpath + '/' + fn + + # Recursively go through all directories + if os.path.isdir(full_path): + change_file(full_path) + continue + + print full_path + with open(full_path, "r+") as jf: + # Since certain json 'keys' appear multiple times in our data, we have to make them + # unique first (we can't use regular json.load() or we lose some data points). From: + # https://stackoverflow.com/questions/29321677/python-json-parser-allow-duplicate-keys + decoder = json.JSONDecoder(object_pairs_hook=parse_object_pairs) + pcap_data = decoder.decode(jf.read()) + + # Prepare new data structure for re-formatted JSON storage + data = {} + for packet in pcap_data: + layers = packet[json_key_source][json_key_layers] + + # All captured traffic should have a frame + frame number, but check anyway + frame_num = " Frame: " + if json_key_frame not in layers or json_key_frame_num not in layers[json_key_frame]: + print "WARNING: could not find frame number! Using -1..." + frame_num = frame_num + "-1" + else: + # Save frame number for error-reporting + frame_num = frame_num + layers[json_key_frame][json_key_frame_num] + + # All captured traffic should be IP, but check anyway + if not json_key_ip in layers: + print "WARNING: Non-IP traffic detected!" + frame_num + continue + + # For now, focus on HTTP only + if json_key_tcp not in layers or json_key_http not in layers: + continue + + # Fill our new JSON packet with TCP/IP info + new_packet = {} + new_packet["dst_ip"] = layers[json_key_ip][json_key_ip + ".dst"] + new_packet["dst_port"] = int(layers[json_key_tcp][json_key_tcp + ".dstport"]) + + # JV: Also include src so we can see what device initiates the traffic + new_packet["src_ip"] = layers[json_key_ip][json_key_ip + ".src"] + new_packet["src_port"] = int(layers[json_key_tcp][json_key_tcp + ".srcport"]) + #JV: Also include eth soure/destination info so that we can map traffic to physical device using MAC + new_packet[JSON_KEY_ETH_SRC] = layers[JSON_KEY_ETH][JSON_KEY_ETH_SRC] + new_packet[JSON_KEY_ETH_DST] = layers[JSON_KEY_ETH][JSON_KEY_ETH_DST] + + # Go through all HTTP fields and extract the ones that are needed + http_data = layers[json_key_http] + for http_key in http_data: + http_value = http_data[http_key] + + if http_key.startswith(json_key_http_req_line): + header_line = http_value.split(":", 1) + if len(header_line) != 2: + print ("WARNING: could not parse header '" + str(header_line) + "'" + + frame_num) + continue + + # Prepare container for HTTP headers + if json_key_headers not in new_packet: + new_packet[json_key_headers] = {} + + # Use lower case for header keys to stay consistent with our other data + header_key = header_line[0].lower() + + # Remove the trailing carriage return + header_val = header_line[1].strip() + + # Save the header key-value pair + new_packet[json_key_headers][header_key] = header_val + + # If this is the host header, we also save it to the main object + if header_key == json_key_host: + new_packet[json_key_host] = header_val + + if json_key_http_req_method in http_value: + new_packet[json_key_method] = http_value[json_key_http_req_method] + if json_key_http_req_uri in http_value: + new_packet[json_key_uri] = http_value[json_key_http_req_uri] + + # End of HTTP parsing + + # Check that we found the minimum needed HTTP headers + if (json_key_uri not in new_packet or json_key_method not in new_packet or + json_key_host not in new_packet): + print "Missing some HTTP Headers!" + frame_num + continue + + # Extract timestamp + if json_key_frame_ts not in layers[json_key_frame]: + print "WARNING: could not find timestamp!" + frame_num + continue + + new_packet["ts"] = layers[json_key_frame][json_key_frame_ts] + + # Create a unique key for each packet to keep consistent with ReCon + # Also good in case packets end up in different files + data[str(uuid.uuid4())] = new_packet + + # Write the new data + #print json.dumps(data, sort_keys=True, indent=4) + jf.seek(0) + jf.write(json.dumps(data, sort_keys=True, indent=4)) + jf.truncate() + +if __name__ == '__main__': + # Needed to re-use some JSON keys + change_file(sys.argv[1]) \ No newline at end of file