393 lines
13 KiB
Python
Executable file
393 lines
13 KiB
Python
Executable file
from backports import zoneinfo
|
|
import time
|
|
import datetime
|
|
import scapy_wrap
|
|
import pandas as pd
|
|
|
|
# Timezone in which the capture's naive wall-clock timestamps are expressed.
TIME_ZONE = zoneinfo.ZoneInfo('US/Eastern')

# Padding applied around conversation start/end when matching packets to the out list.
TIME_PAD = datetime.timedelta(minutes=15)

# Slack used when comparing packet timestamps against conversation boundaries.
FLOAT_TOLERANCE = datetime.timedelta(seconds=0.01)
|
|
|
|
|
|
def determineHttp(packet):
    """Return 1 when the packet travels to or from the HTTP port (80), else 0.

    Parameters
    ----------
    packet : a PcapReader packet object with a TCP layer

    Returns
    -------
    int
        1 if either TCP port is 80, otherwise 0.
    """
    tcp_layer = packet["TCP"]
    return int(80 in (tcp_layer.sport, tcp_layer.dport))
|
|
|
|
|
|
def determineTelnet(packet):
    """Return 1 when the packet travels to or from the TELNET port (23), else 0.

    Parameters
    ----------
    packet : a PcapReader packet object with a TCP layer

    Returns
    -------
    int
        1 if either TCP port is 23, otherwise 0.
    """
    tcp_layer = packet["TCP"]
    return int(23 in (tcp_layer.sport, tcp_layer.dport))
|
|
|
|
|
|
def formatFlags(flag):
    """
    Convert a TCP flag string into six binary indicator features.

    Parameters
    ----------
    flag : str
        TCP flag characters as rendered by scapy (e.g. "PA", "S", "FA").

    Returns
    -------
    list of int
        Binary indicators in the fixed order [URG, ACK, PSH, RST, SYN, FIN],
        matching the feature layout used by create_feature_vectors.
    """
    # BUG FIX: the original used "flags[4] /= 1" for the SYN slot, which left
    # it at 0.0 instead of setting it to 1 — every SYN flag was dropped.
    return [1 if letter in flag else 0 for letter in "UAPRSF"]
|
|
|
|
|
|
def clean_ipv4(ipv4_string):
    """
    Removes leading zeros from an IPv4 address.

    params:
        ipv4_string - dotted-quad address (str), possibly zero-padded
                      (e.g. "010.001.000.255")

    return:
        the address with each octet's leading zeros stripped; an octet that
        is all zeros becomes "0"
    """
    cleaned_octets = []
    for octet in ipv4_string.split("."):
        stripped = octet.lstrip("0")
        cleaned_octets.append(stripped if stripped else "0")
    return ".".join(cleaned_octets)
|
|
|
|
|
|
def parse_tcp_outlist(tcp_out_file, time_zone, time_pad):
    """
    Parses the tcpout.list file for a dataset.

    params:
        tcp_out_file - the file to parse
        time_zone - a tzinfo object of the capture's local time zone
        time_pad - a timedelta used to pad conversation start/end times

    return:
        A list of per-conversation feature lists:
        - padded_start (UTC datetime)
        - padded_end (UTC datetime)
        - start (unpadded, UTC datetime)
        - end (unpadded, UTC datetime)
        - src_ip
        - src_port
        - dst_ip
        - dst_port
        - class label ("" when the list file has "-")
    """
    features = []
    with open(tcp_out_file, 'r') as infile:
        for raw_line in infile:
            try:
                line = raw_line.strip().split()

                # Skip the line if it doesn't match our use case (i.e. no ports)
                if line[5] == '-' or line[7] == '-':
                    continue

                # Convert the local wall-clock start time to an aware UTC datetime.
                start_time = datetime.datetime.strptime(f"{line[1]} {line[2]}", "%m/%d/%Y %H:%M:%S")
                start_time_utc = start_time - time_zone.utcoffset(start_time)
                start_time_utc = start_time_utc.replace(tzinfo=datetime.timezone.utc)

                # Duration is recorded as H:M:S; turn it into a timedelta.
                time_delta = datetime.datetime.strptime(line[3], "%H:%M:%S")
                time_delta = datetime.timedelta(
                    hours=time_delta.hour,
                    minutes=time_delta.minute,
                    seconds=time_delta.second
                )
                end_time_utc = start_time_utc + time_delta

                line_features = [
                    start_time_utc - time_pad,  # 0
                    end_time_utc + time_pad,    # 1
                    start_time_utc,             # 2
                    end_time_utc,               # 3
                    clean_ipv4(line[7]),        # src ip    # 4
                    int(line[5]),               # src port  # 5
                    clean_ipv4(line[8]),        # dst ip    # 6
                    int(line[6]),               # dst port  # 7
                    line[10]                    # class     # 8
                ]

                # "-" in the list file means "no class label".
                if line_features[8] == "-":
                    line_features[8] = ""

                features.append(line_features)
            except (ValueError, IndexError):
                # BUG FIX: was a bare "except: pass", which hid every error
                # (including genuine bugs). Only skip lines that are malformed
                # or too short to parse.
                continue
    return features
|
|
|
|
|
|
def parse_tcp_stats(tcp_conv_file, time_zone):
    """
    Parse a Wireshark TCP conversation statistics CSV.

    The capture's local timezone (tzinfo) is required to turn the absolute
    start times (local wall clock) into proper UTC timestamps.

    params:
        tcp_conv_file - a CSV of TCP conversation stats (copied directly from
            the Wireshark GUI as CSV; ensure "Absolute Start Time" and
            "Save Data as Raw" are checked)
        time_zone - tzinfo of the capture's local timezone

    returns:
        a list of conversation feature vectors in the following format
        - Address A
        - Port A
        - Address B
        - Port B
        - Packets
        - Bytes
        - Stream ID
        - Packets A -> B
        - Bytes A -> B
        - Packets B -> A
        - Bytes B -> A
        - UTC Start
        - UTC End
    """
    features = []
    with open(tcp_conv_file, 'r') as infile:
        for raw_line in infile:
            fields = raw_line.strip().split(',')

            # Local wall-clock start -> aware UTC datetime.
            local_start = datetime.datetime.fromisoformat(fields[13])
            utc_start = local_start - time_zone.utcoffset(local_start)
            utc_start = utc_start.replace(tzinfo=datetime.timezone.utc)
            utc_end = utc_start + datetime.timedelta(seconds=float(fields[14]))

            features.append([
                fields[0].strip('"'),  # Address A
                int(fields[1]),        # Port A
                fields[2].strip('"'),  # Address B
                int(fields[3]),        # Port B
                int(fields[4]),        # Packets
                int(fields[5]),        # Bytes
                int(fields[6]),        # Stream ID
                int(fields[9]),        # Packets A -> B
                int(fields[10]),       # Bytes A -> B
                int(fields[11]),       # Packets B -> A
                int(fields[12]),       # Bytes B -> A
                utc_start,             # UTC Start
                utc_end,               # UTC End
            ])
    return features
|
|
|
|
|
|
def create_feature_vectors(pcap_file, outlist_features, stats_features, time_zone):
    """
    Creates feature vectors based on a pcap file and parsed outlist/statistics.
    Done during parsing as storing entire packets was space/time inefficient.

    params:
        pcap_file - the file to read
        outlist_features - features from the DARPA out list (see parse_tcp_outlist)
        stats_features - stats from Wireshark TCP conversation stats (see parse_tcp_stats)
        time_zone - tzinfo used to normalize packet timestamps to UTC

    return:
        a feature vector for each identifiable packet in the following format
        - Source IP
        - Source Port
        - Destination IP
        - Destination Port
        - IP packet len
        - HTTP (boolean)
        - Telnet (boolean)
        - TCP - U
        - TCP - A
        - TCP - P
        - TCP - R
        - TCP - S
        - TCP - F
        - IP TTL
        - TCP Window
        - TCP Urgptr
        - UTC Timestamp
        - Total packets in convo
        - Packets SRC to DST in convo
        - Packets DST to SRC in convo
        - Bytes SRC to DST in convo
        - Bytes DST to SRC in convo
        - Class label
    """

    features = []
    # Count of packets skipped because they matched zero or multiple conversations.
    unidentified_packets = 0

    for packet in scapy_wrap.PcapReader(pcap_file):

        # Only TCP packets carry the features extracted below.
        if scapy_wrap.TCP not in packet:
            continue

        # NOTE(review): fromtimestamp() interprets the epoch value in the
        # machine's LOCAL timezone; subtracting time_zone's offset afterwards
        # is only correct when the machine runs in time_zone — confirm, or
        # consider fromtimestamp(..., tz=datetime.timezone.utc).
        pkt_time = datetime.datetime.fromtimestamp(float(packet.time))
        pkt_time = pkt_time - time_zone.utcoffset(pkt_time)
        pkt_time = pkt_time.replace(tzinfo=datetime.timezone.utc)

        # Per-packet features; protocol/flag slots start at 0 and are filled below.
        p_features = [
            packet["IP"].src,
            packet["TCP"].sport,
            packet["IP"].dst,
            packet["TCP"].dport,
            packet["IP"].len,
            0, # Is http?
            0, # Is telnet?
            0, # TCP - U
            0, # TCP - A
            0, # TCP - P
            0, # TCP - R
            0, # TCP - S
            0, # TCP - F
            packet["IP"].ttl,
            packet["TCP"].window,
            packet["TCP"].urgptr,
            pkt_time
        ]

        p_features[5] = determineHttp(packet) # set HTTP header presence
        if not p_features[5]:
            p_features[6] = determineTelnet(packet) # only look for TELNET if no HTTP

        # Six binary flag indicators [U, A, P, R, S, F] replace slots 7-12.
        p_features[7:13] = formatFlags(str(packet["TCP"].flags))

        # Test if this packet can be matched to a conversation in the Wireshark Stats
        # Define lambda for matching
        # First try A=source, B=destination; timestamps must fall inside the
        # conversation's [start, end] window, widened by FLOAT_TOLERANCE.
        s_invert_a_b = False
        stats_f_a = lambda conv: (
            conv[0] == p_features[0] and # A-SIP
            conv[1] == p_features[1] and # A-SPORT
            conv[2] == p_features[2] and # B-DIP
            conv[3] == p_features[3] and # B-DPORT
            (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
        )

        stats_matches = list(filter(stats_f_a, stats_features))

        # Retry with A/B roles swapped (packet flowing B -> A).
        if len(stats_matches) == 0:
            s_invert_a_b = True
            stats_f_b = lambda conv: (
                conv[0] == p_features[2] and # A-DIP
                conv[1] == p_features[3] and # A-DPORT
                conv[2] == p_features[0] and # B-SIP
                conv[3] == p_features[1] and # B-SPORT
                (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
            )

            stats_matches = list(filter(stats_f_b, stats_features))

        # Require exactly one stats match; otherwise skip the packet.
        if len(stats_matches) > 1:
            print('more than one stats match identified')
            unidentified_packets += 1
            continue
        elif len(stats_matches) == 0:
            print('zero stat matches identified')
            unidentified_packets += 1
            continue

        stats_match = stats_matches[0]

        if not s_invert_a_b: # A is the SOURCE and B is the DESTINATION
            s_features = [
                stats_match[4], # Total packets in convo
                stats_match[7], # Packets SRC to DST in convo
                stats_match[9], # Packets DST to SRC in convo
                stats_match[8], # Bytes SRC to DST in convo
                stats_match[10] # Bytes DST to SRC in convo
            ]
        else: # A is the DESTINATION and B is the SOURCE
            s_features = [
                stats_match[4], # Total packets in convo
                stats_match[9], # Packets SRC to DST in convo
                stats_match[7], # Packets DST to SRC in convo
                stats_match[10], # Bytes SRC to DST in convo
                stats_match[8] # Bytes DST to SRC in convo
            ]


        # Test if this packet can be matched to a conversation in out.list file
        # NOTE(review): o_invert_s_d is never set to True below (unlike
        # s_invert_a_b) — it is effectively dead; only the class label (index 8)
        # is read from the match, which is direction-independent.
        o_invert_s_d = False
        out_f_s_d = lambda conv: ( # outlist-pcap
            conv[4] == p_features[0] and # S-SIP
            conv[5] == p_features[1] and # S-SPORT
            conv[6] == p_features[2] and # B-DIP
            conv[7] == p_features[3] and # B-DPORT
            (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
        )

        out_matches = list(filter(out_f_s_d, outlist_features))

        # Retry with source/destination roles swapped.
        if len(out_matches) == 0:
            out_f_d_s = lambda conv: ( # outlist-pcap
                conv[6] == p_features[0] and # D-SIP
                conv[7] == p_features[1] and # D-SPORT
                conv[4] == p_features[2] and # S-DIP
                conv[5] == p_features[3] and # S-DPORT
                (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
            )

            out_matches = list(filter(out_f_d_s, outlist_features))

        # Require exactly one out-list match; otherwise skip the packet.
        if len(out_matches) > 1:
            print('more than one out match identified')
            unidentified_packets += 1
            continue
        elif len(out_matches) == 0:
            print('zero out matches identified')
            unidentified_packets += 1
            continue

        # Index 8 of an out-list entry is the class label.
        out_class = out_matches[0][8]

        # Final layout: 17 packet features + 5 conversation stats + class label.
        feat_vec = p_features + s_features + [out_class]
        features.append(feat_vec)

    print("unidentified/skipped packets:", unidentified_packets)
    return features
|
|
|
|
if __name__ == "__main__":
    # Parse the DARPA truth list and the Wireshark conversation stats, then
    # label every TCP packet in the capture and dump the feature table to CSV.
    ts = time.time()
    outlist_features = parse_tcp_outlist("./tcpdump.list", TIME_ZONE, TIME_PAD)
    stats_features = parse_tcp_stats("./test.tcp.csv", TIME_ZONE)
    tcp_packets = create_feature_vectors("./tcpdump", outlist_features, stats_features, TIME_ZONE)
    te = time.time()
    df = pd.DataFrame(tcp_packets)
    df.to_csv("out.csv")
    # Elapsed parse/label time in seconds (excludes the DataFrame/CSV write).
    print(te-ts)
|