Script used to extract only the needed information from JSON packet traces generated by
tshark from PCAPNG format.
\r
import json
import os
import sys
import uuid
from collections import OrderedDict
\r
14 json_key_source = "_source"
\r
15 json_key_layers = "layers"
\r
18 json_key_tcp = "tcp"
\r
20 json_key_http = "http"
\r
21 json_key_method = "method"
\r
22 json_key_uri = "uri"
\r
23 json_key_headers = "headers"
\r
24 json_key_host = "host"
\r
26 json_key_http_req = json_key_http + ".request."
\r
27 json_key_http_req_method = json_key_http_req + json_key_method
\r
28 json_key_http_req_uri = json_key_http_req + json_key_uri
\r
29 json_key_http_req_line = json_key_http_req + "line"
\r
31 json_key_pkt_comment = "pkt_comment"
\r
33 json_key_frame = "frame"
\r
34 json_key_frame_num = json_key_frame + ".number"
\r
35 json_key_frame_comment = json_key_frame + ".comment"
\r
36 json_key_frame_ts = json_key_frame + ".time_epoch"
\r
39 JSON_KEY_ETH = "eth"
\r
40 JSON_KEY_ETH_SRC = "eth.src"
\r
41 JSON_KEY_ETH_DST = "eth.dst"
\r
44 def make_unique(key, dct):
\r
48 while unique_key in dct:
\r
50 unique_key = '{}_{}'.format(key, counter)
\r
54 def parse_object_pairs(pairs):
\r
56 for key, value in pairs:
\r
58 key = make_unique(key, dct)
\r
63 def change_file(fpath):
\r
64 for fn in os.listdir(fpath):
\r
65 full_path = fpath + '/' + fn
\r
67 # Recursively go through all directories
\r
68 if os.path.isdir(full_path):
\r
69 change_file(full_path)
\r
73 with open(full_path, "r+") as jf:
\r
74 # Since certain json 'keys' appear multiple times in our data, we have to make them
\r
75 # unique first (we can't use regular json.load() or we lose some data points). From:
\r
76 # https://stackoverflow.com/questions/29321677/python-json-parser-allow-duplicate-keys
\r
77 decoder = json.JSONDecoder(object_pairs_hook=parse_object_pairs)
\r
78 pcap_data = decoder.decode(jf.read())
\r
80 # Prepare new data structure for re-formatted JSON storage
\r
82 for packet in pcap_data:
\r
83 layers = packet[json_key_source][json_key_layers]
\r
85 # All captured traffic should have a frame + frame number, but check anyway
\r
86 frame_num = " Frame: "
\r
87 if json_key_frame not in layers or json_key_frame_num not in layers[json_key_frame]:
\r
88 print "WARNING: could not find frame number! Using -1..."
\r
89 frame_num = frame_num + "-1"
\r
91 # Save frame number for error-reporting
\r
92 frame_num = frame_num + layers[json_key_frame][json_key_frame_num]
\r
94 # All captured traffic should be IP, but check anyway
\r
95 if not json_key_ip in layers:
\r
96 print "WARNING: Non-IP traffic detected!" + frame_num
\r
99 # For now, focus on HTTP only
\r
100 if json_key_tcp not in layers or json_key_http not in layers:
\r
103 # Fill our new JSON packet with TCP/IP info
\r
105 new_packet["dst_ip"] = layers[json_key_ip][json_key_ip + ".dst"]
\r
106 new_packet["dst_port"] = int(layers[json_key_tcp][json_key_tcp + ".dstport"])
\r
108 # JV: Also include src so we can see what device initiates the traffic
\r
109 new_packet["src_ip"] = layers[json_key_ip][json_key_ip + ".src"]
\r
110 new_packet["src_port"] = int(layers[json_key_tcp][json_key_tcp + ".srcport"])
\r
111 #JV: Also include eth soure/destination info so that we can map traffic to physical device using MAC
\r
112 new_packet[JSON_KEY_ETH_SRC] = layers[JSON_KEY_ETH][JSON_KEY_ETH_SRC]
\r
113 new_packet[JSON_KEY_ETH_DST] = layers[JSON_KEY_ETH][JSON_KEY_ETH_DST]
\r
115 # Go through all HTTP fields and extract the ones that are needed
\r
116 http_data = layers[json_key_http]
\r
117 for http_key in http_data:
\r
118 http_value = http_data[http_key]
\r
120 if http_key.startswith(json_key_http_req_line):
\r
121 header_line = http_value.split(":", 1)
\r
122 if len(header_line) != 2:
\r
123 print ("WARNING: could not parse header '" + str(header_line) + "'"
\r
127 # Prepare container for HTTP headers
\r
128 if json_key_headers not in new_packet:
\r
129 new_packet[json_key_headers] = {}
\r
131 # Use lower case for header keys to stay consistent with our other data
\r
132 header_key = header_line[0].lower()
\r
134 # Remove the trailing carriage return
\r
135 header_val = header_line[1].strip()
\r
137 # Save the header key-value pair
\r
138 new_packet[json_key_headers][header_key] = header_val
\r
140 # If this is the host header, we also save it to the main object
\r
141 if header_key == json_key_host:
\r
142 new_packet[json_key_host] = header_val
\r
144 if json_key_http_req_method in http_value:
\r
145 new_packet[json_key_method] = http_value[json_key_http_req_method]
\r
146 if json_key_http_req_uri in http_value:
\r
147 new_packet[json_key_uri] = http_value[json_key_http_req_uri]
\r
149 # End of HTTP parsing
\r
151 # Check that we found the minimum needed HTTP headers
\r
152 if (json_key_uri not in new_packet or json_key_method not in new_packet or
\r
153 json_key_host not in new_packet):
\r
154 print "Missing some HTTP Headers!" + frame_num
\r
157 # Extract timestamp
\r
158 if json_key_frame_ts not in layers[json_key_frame]:
\r
159 print "WARNING: could not find timestamp!" + frame_num
\r
162 new_packet["ts"] = layers[json_key_frame][json_key_frame_ts]
\r
164 # Create a unique key for each packet to keep consistent with ReCon
\r
165 # Also good in case packets end up in different files
\r
166 data[str(uuid.uuid4())] = new_packet
\r
168 # Write the new data
\r
169 #print json.dumps(data, sort_keys=True, indent=4)
\r
171 jf.write(json.dumps(data, sort_keys=True, indent=4))
\r
174 if __name__ == '__main__':
\r
175 # Needed to re-use some JSON keys
\r
176 change_file(sys.argv[1])