try:
    # zoneinfo is in the standard library from Python 3.9; fall back to the
    # third-party backport on older interpreters.
    import zoneinfo
except ImportError:
    from backports import zoneinfo

import datetime
import time

try:
    import scapy_wrap
except ImportError:
    # Only create_feature_vectors() needs scapy; the parsing helpers can
    # still be imported and used (e.g. under test) without it.
    scapy_wrap = None

import pandas as pd

# Wall-clock time zone the capture timestamps were recorded in.
TIME_ZONE = zoneinfo.ZoneInfo('US/Eastern')
# Padding applied around conversation start/end times taken from the out list.
TIME_PAD = datetime.timedelta(minutes=15)
# Slack allowed when comparing a packet timestamp to conversation boundaries.
FLOAT_TOLERANCE = datetime.timedelta(seconds=0.01)


def determineHttp(packet):
    """Determine if a packet contains HTTP headers.

    Parameters:
        packet: a PcapReader packet object with a TCP layer.

    Returns:
        1 if either TCP port is 80, else 0.
    """
    if (packet["TCP"].sport == 80) or (packet["TCP"].dport == 80):
        return 1
    return 0


def determineTelnet(packet):
    """Determine if a packet contains TELNET headers.

    Parameters:
        packet: a PcapReader packet object with a TCP layer.

    Returns:
        1 if either TCP port is 23, else 0.
    """
    if (packet["TCP"].sport == 23) or (packet["TCP"].dport == 23):
        return 1
    return 0


def formatFlags(flag):
    """Process the flags for a given packet.

    Parameters:
        flag: string representing TCP flags (ex. "PA").

    Returns:
        List of 6 binary features indicating presence of the TCP flags,
        in the order [URG, ACK, PSH, RST, SYN, FIN].
    """
    flags = [0, 0, 0, 0, 0, 0]
    if "U" in flag:
        flags[0] = 1
    if "A" in flag:
        flags[1] = 1
    if "P" in flag:
        flags[2] = 1
    if "R" in flag:
        flags[3] = 1
    if "S" in flag:
        # BUG FIX: was `flags[4] /= 1`, which left the SYN feature at 0
        # (and turned it into a float) instead of setting it.
        flags[4] = 1
    if "F" in flag:
        flags[5] = 1
    return flags


def clean_ipv4(ipv4_string):
    """Remove leading zeros from an IPv4 address.

    Parameters:
        ipv4_string: the IPv4 address (str) to remove leading zeros from.

    Returns:
        A string representation of the IPv4 address with leading zeros
        removed in each octet (an all-zero octet becomes "0").
    """
    return ".".join(
        octet.lstrip("0") or "0" for octet in ipv4_string.split(".")
    )


def parse_tcp_outlist(tcp_out_file, time_zone, time_pad):
    """Parse the tcpout.list file for a dataset.

    Parameters:
        tcp_out_file: the file to parse.
        time_zone: a tzinfo object of the capture's local time zone.
        time_pad: a timedelta used to pad conversation start/ends.

    Returns:
        A list of rows with the following fields:
            0 padded_start, 1 padded_end, 2 start (unpadded),
            3 end (unpadded), 4 src_ip, 5 src_port, 6 dst_ip,
            7 dst_port, 8 class
    """
    features = []
    with open(tcp_out_file, 'r') as infile:
        for line in infile:
            try:
                line = line.strip().split()
                # Skip the line if it doesn't match our use case (i.e. no ports)
                if line[5] == '-' or line[7] == '-':
                    continue
                # Convert the local start time to an aware UTC datetime.
                start_time = datetime.datetime.strptime(
                    f"{line[1]} {line[2]}", "%m/%d/%Y %H:%M:%S"
                )
                start_time_utc = start_time - time_zone.utcoffset(start_time)
                start_time_utc = start_time_utc.replace(
                    tzinfo=datetime.timezone.utc
                )
                # Duration field is formatted H:M:S; convert to a timedelta.
                time_delta = datetime.datetime.strptime(line[3], "%H:%M:%S")
                time_delta = datetime.timedelta(
                    hours=time_delta.hour,
                    minutes=time_delta.minute,
                    seconds=time_delta.second
                )
                end_time_utc = start_time_utc + time_delta
                line_features = [
                    start_time_utc - time_pad,   # 0 padded start
                    end_time_utc + time_pad,     # 1 padded end
                    start_time_utc,              # 2 start
                    end_time_utc,                # 3 end
                    clean_ipv4(line[7]),         # 4 src ip
                    int(line[5]),                # 5 src port
                    clean_ipv4(line[8]),         # 6 dst ip
                    int(line[6]),                # 7 dst port
                    line[10]                     # 8 class
                ]
                # "-" marks an unlabelled conversation; store it as "".
                if line_features[8] == "-":
                    line_features[8] = ""
                features.append(line_features)
            except (IndexError, ValueError):
                # BUG FIX: was a bare `except: pass`, which also swallowed
                # KeyboardInterrupt/SystemExit. Malformed or short lines
                # are still skipped best-effort.
                continue
    return features


def parse_tcp_stats(tcp_conv_file, time_zone):
    """Parse a TCP Conversation Stats file.

    The timezone of the capture (tzinfo) is required in order to generate
    proper timestamps.

    Parameters:
        tcp_conv_file: a CSV of TCP conversation stats (copied directly
            from the Wireshark GUI as CSV; ensure that "Absolute Start
            Time" and "Save Data as Raw" are checked).
        time_zone: a tzinfo object of the capture's local time zone.

    Returns:
        A list of conversation stats rows in the following format:
            0 Address A, 1 Port A, 2 Address B, 3 Port B, 4 Packets,
            5 Bytes, 6 Stream ID, 7 Packets A->B, 8 Bytes A->B,
            9 Packets B->A, 10 Bytes B->A, 11 UTC Start, 12 UTC End
    """
    features = []
    with open(tcp_conv_file, 'r') as infile:
        for line in infile:
            line = line.strip().split(',')
            # Convert the absolute (local) start time to aware UTC.
            start_time = datetime.datetime.fromisoformat(line[13])
            start_time_utc = start_time - time_zone.utcoffset(start_time)
            start_time_utc = start_time_utc.replace(
                tzinfo=datetime.timezone.utc
            )
            duration = datetime.timedelta(seconds=float(line[14]))
            end_time_utc = start_time_utc + duration
            feature_vector = [
                line[0].strip('"'),   # Address A
                int(line[1]),         # Port A
                line[2].strip('"'),   # Address B
                int(line[3]),         # Port B
                int(line[4]),         # Packets
                int(line[5]),         # Bytes
                int(line[6]),         # Stream ID
                int(line[9]),         # Packets A -> B
                int(line[10]),        # Bytes A -> B
                int(line[11]),        # Packets B -> A
                int(line[12]),        # Bytes B -> A
                start_time_utc,       # UTC Start
                end_time_utc,         # UTC End
            ]
            features.append(feature_vector)
    return features


def create_feature_vectors(pcap_file, outlist_features, stats_features,
                           time_zone):
    """Create feature vectors based on a pcap file and parsed outlist/stats.

    Done during parsing, as storing entire packets was space/time
    inefficient.

    Parameters:
        pcap_file: the pcap file to read.
        outlist_features: features from the DARPA out list
            (see parse_tcp_outlist).
        stats_features: stats from Wireshark TCP conversation stats
            (see parse_tcp_stats).
        time_zone: a tzinfo object of the capture's local time zone.

    Returns:
        A feature vector for each identifiable packet in the following
        format: Source IP, Source Port, Destination IP, Destination Port,
        IP packet len, HTTP (boolean), Telnet (boolean), TCP flags
        U/A/P/R/S/F, IP TTL, TCP Window, TCP Urgptr, UTC Timestamp,
        Total packets in convo, Packets SRC->DST, Packets DST->SRC,
        Bytes SRC->DST, Bytes DST->SRC, Class label.
    """
    features = []
    unidentified_packets = 0
    for packet in scapy_wrap.PcapReader(pcap_file):
        if scapy_wrap.TCP not in packet:
            continue
        # Convert the packet's epoch timestamp to an aware UTC datetime.
        pkt_time = datetime.datetime.fromtimestamp(float(packet.time))
        pkt_time = pkt_time - time_zone.utcoffset(pkt_time)
        pkt_time = pkt_time.replace(tzinfo=datetime.timezone.utc)
        p_features = [
            packet["IP"].src,       # 0  Source IP
            packet["TCP"].sport,    # 1  Source port
            packet["IP"].dst,       # 2  Destination IP
            packet["TCP"].dport,    # 3  Destination port
            packet["IP"].len,       # 4  IP packet len
            0,                      # 5  Is http?
            0,                      # 6  Is telnet?
            0,                      # 7  TCP - U
            0,                      # 8  TCP - A
            0,                      # 9  TCP - P
            0,                      # 10 TCP - R
            0,                      # 11 TCP - S
            0,                      # 12 TCP - F
            packet["IP"].ttl,       # 13 IP TTL
            packet["TCP"].window,   # 14 TCP window
            packet["TCP"].urgptr,   # 15 TCP urgent pointer
            pkt_time                # 16 UTC timestamp
        ]
        p_features[5] = determineHttp(packet)  # set HTTP header presence
        if not p_features[5]:
            # only look for TELNET if no HTTP
            p_features[6] = determineTelnet(packet)
        p_features[7:13] = formatFlags(str(packet["TCP"].flags))

        # Test if this packet can be matched to a conversation in the
        # Wireshark stats. First try A = source / B = destination; if that
        # yields nothing, retry with the roles inverted.
        s_invert_a_b = False
        stats_f_a = lambda conv: (
            conv[0] == p_features[0] and  # A-SIP
            conv[1] == p_features[1] and  # A-SPORT
            conv[2] == p_features[2] and  # B-DIP
            conv[3] == p_features[3] and  # B-DPORT
            (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
        )
        stats_matches = list(filter(stats_f_a, stats_features))
        if len(stats_matches) == 0:
            s_invert_a_b = True
            stats_f_b = lambda conv: (
                conv[0] == p_features[2] and  # A-DIP
                conv[1] == p_features[3] and  # A-DPORT
                conv[2] == p_features[0] and  # B-SIP
                conv[3] == p_features[1] and  # B-SPORT
                (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
            )
            stats_matches = list(filter(stats_f_b, stats_features))
        if len(stats_matches) > 1:
            print('more than one stats match identified')
            unidentified_packets += 1
            continue
        elif len(stats_matches) == 0:
            print('zero stat matches identified')
            unidentified_packets += 1
            continue
        stats_match = stats_matches[0]
        if not s_invert_a_b:
            # A is the SOURCE and B is the DESTINATION
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[7],   # Packets SRC to DST in convo
                stats_match[9],   # Packets DST to SRC in convo
                stats_match[8],   # Bytes SRC to DST in convo
                stats_match[10]   # Bytes DST to SRC in convo
            ]
        else:
            # A is the DESTINATION and B is the SOURCE
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[9],   # Packets SRC to DST in convo
                stats_match[7],   # Packets DST to SRC in convo
                stats_match[10],  # Bytes SRC to DST in convo
                stats_match[8]    # Bytes DST to SRC in convo
            ]

        # Test if this packet can be matched to a conversation in the
        # out.list file (same two-pass direction handling as above).
        out_f_s_d = lambda conv: (  # outlist-pcap
            conv[4] == p_features[0] and  # S-SIP
            conv[5] == p_features[1] and  # S-SPORT
            conv[6] == p_features[2] and  # B-DIP
            conv[7] == p_features[3] and  # B-DPORT
            (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
        )
        out_matches = list(filter(out_f_s_d, outlist_features))
        if len(out_matches) == 0:
            out_f_d_s = lambda conv: (  # outlist-pcap
                conv[6] == p_features[0] and  # D-SIP
                conv[7] == p_features[1] and  # D-SPORT
                conv[4] == p_features[2] and  # S-DIP
                conv[5] == p_features[3] and  # S-DPORT
                (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
            )
            out_matches = list(filter(out_f_d_s, outlist_features))
        if len(out_matches) > 1:
            print('more than one out match identified')
            unidentified_packets += 1
            continue
        elif len(out_matches) == 0:
            print('zero out matches identified')
            unidentified_packets += 1
            continue

        out_class = out_matches[0][8]
        feat_vec = p_features + s_features + [out_class]
        features.append(feat_vec)
    print("unidentified/skipped packets:", unidentified_packets)
    return features


if __name__ == "__main__":
    ts = time.time()
    outlist_features = parse_tcp_outlist("./tcpdump.list", TIME_ZONE, TIME_PAD)
    stats_features = parse_tcp_stats("./test.tcp.csv", TIME_ZONE)
    tcp_packets = create_feature_vectors(
        "./tcpdump", outlist_features, stats_features, TIME_ZONE
    )
    te = time.time()
    df = pd.DataFrame(tcp_packets)
    df.to_csv("out.csv")
    print(te - ts)