393 lines
13 KiB
Python
393 lines
13 KiB
Python
|
from backports import zoneinfo
|
|||
|
import time
|
|||
|
import datetime
|
|||
|
import scapy_wrap
|
|||
|
import pandas as pd
|
|||
|
|
|||
|
# Local time zone of the capture; used throughout to convert the naive
# local timestamps in the DARPA files and pcap to aware UTC datetimes.
TIME_ZONE = zoneinfo.ZoneInfo('US/Eastern')

# Padding applied around conversation start/end times when parsing the
# out.list file (see parse_tcp_outlist) so edge packets still match.
TIME_PAD = datetime.timedelta(minutes=15)

# Slack used when comparing packet timestamps against conversation time
# windows, to absorb sub-second float rounding.
FLOAT_TOLERANCE = datetime.timedelta(seconds=0.01)
|
|||
|
|
|||
|
|
|||
|
def determineHttp(packet):
    """
    Report whether a packet is HTTP traffic, judged by port 80.

    Parameters:
    ___________
    packet: a Pcapreader packet object

    Returns:
    ________
    1 if either TCP port is 80, else 0
    """
    tcp_layer = packet["TCP"]
    return int(80 in (tcp_layer.sport, tcp_layer.dport))
|
|||
|
|
|||
|
|
|||
|
def determineTelnet(packet):
    """
    Report whether a packet is TELNET traffic, judged by port 23.

    Parameters:
    ___________
    packet: a Pcapreader packet object

    Returns:
    ________
    1 if either TCP port is 23, else 0
    """
    tcp_layer = packet["TCP"]
    return int(23 in (tcp_layer.sport, tcp_layer.dport))
|
|||
|
|
|||
|
|
|||
|
def formatFlags(flag):
    """
    Process the flags for a given packet.

    Parameters:
    __________
    flag: string representing TCP flags (ex. PA)

    Returns:
    _______
    array of binary features indicating presence of the 6 tcp flags,
    in the order URG, ACK, PSH, RST, SYN, FIN

    Note:
    _____
    Fixes a bug in the original implementation where the SYN slot used
    ``/=`` instead of ``=``, so the SYN flag was never reported.
    """
    # One output slot per flag letter, in fixed URG/ACK/PSH/RST/SYN/FIN order.
    return [1 if letter in flag else 0 for letter in "UAPRSF"]
|
|||
|
|
|||
|
|
|||
|
def clean_ipv4(ipv4_string):
    """
    Removes leading zeros from an IPv4 address.

    params:
    ipv4_string - the IPv4 address (str) to remove leading zeros from

    return:
    a string representation of the IPv4 address with leading zeros
    removed in each octet (an all-zero octet becomes "0")
    """
    cleaned_octets = []
    for octet in ipv4_string.split("."):
        stripped = octet.lstrip("0")
        # An octet of all zeros strips to "", which must stay "0".
        cleaned_octets.append(stripped if stripped else "0")
    return ".".join(cleaned_octets)
|
|||
|
|
|||
|
|
|||
|
def parse_tcp_outlist(tcp_out_file, time_zone, time_pad):
    """
    Parses the tcpout.list file for a dataset.

    Fields used per whitespace-split line (0-indexed): 1 = date (m/d/Y),
    2 = start time (H:M:S), 3 = duration (H:M:S), 5/6 = src/dst port,
    7/8 = src/dst IP, 10 = class label ('-' means unlabeled).

    params:
    tcp_out_file - the file to parse
    time_zone - a tzinfo object of the current time zone
    time_pad - a timedelta object to pad conversation start/ends with

    return:
    A list of per-conversation feature lists with the following entries
    - padded_start
    - padded_end
    - start (unpadded)
    - end (unpadded)
    - src_ip
    - src_port
    - dst_ip
    - dst_port
    - class (empty string when the file has '-')
    """
    features = []
    with open(tcp_out_file, 'r') as infile:
        for line in infile:
            # Best-effort parse: malformed lines are skipped, but only the
            # exceptions that parsing/indexing can raise are caught, so real
            # bugs (or KeyboardInterrupt) are no longer silently swallowed
            # as they were with the original bare `except:`.
            try:
                fields = line.strip().split()

                # Skip the line if it doesn't match our use case (i.e. no ports)
                if fields[5] == '-' or fields[7] == '-':
                    continue

                # Convert the naive local start time to an aware UTC datetime.
                start_time = datetime.datetime.strptime(
                    f"{fields[1]} {fields[2]}", "%m/%d/%Y %H:%M:%S"
                )
                start_time_utc = start_time - time_zone.utcoffset(start_time)
                start_time_utc = start_time_utc.replace(tzinfo=datetime.timezone.utc)

                # Duration is H:M:S text; parse it, then rebuild a timedelta.
                parsed_duration = datetime.datetime.strptime(fields[3], "%H:%M:%S")
                duration = datetime.timedelta(
                    hours=parsed_duration.hour,
                    minutes=parsed_duration.minute,
                    seconds=parsed_duration.second,
                )
                end_time_utc = start_time_utc + duration

                line_features = [
                    start_time_utc - time_pad,  # 0: padded start
                    end_time_utc + time_pad,    # 1: padded end
                    start_time_utc,             # 2: start (unpadded)
                    end_time_utc,               # 3: end (unpadded)
                    clean_ipv4(fields[7]),      # 4: src ip
                    int(fields[5]),             # 5: src port
                    clean_ipv4(fields[8]),      # 6: dst ip
                    int(fields[6]),             # 7: dst port
                    fields[10],                 # 8: class
                ]

                # '-' marks an unlabeled conversation; normalize to "".
                if line_features[8] == "-":
                    line_features[8] = ""

                features.append(line_features)
            except (ValueError, IndexError):
                continue
    return features
|
|||
|
|
|||
|
|
|||
|
def parse_tcp_stats(tcp_conv_file, time_zone):
    """
    Parses TCP Conversation Stats file - the timezone of the capture (tzinfo)
    is required in order to generate proper timestamps.

    params:
    tcp_conv_file - a CSV of TCP conversation stats (
                        copied directly from wireshark GUI as CSV,
                        ensure that "Absolute Start Time" and
                        "Save Data as Raw" are checked
                    )

    returns:
    a feature vector of conversation stats in the following format
    - Address A
    - Port A
    - Address B
    - Port B
    - Packets
    - Bytes
    - Stream ID
    - Packets A -> B
    - Bytes A -> B
    - Packets B -> A
    - Bytes B -> A
    - UTC Start
    - UTC End
    """
    features = []
    with open(tcp_conv_file, 'r') as stats_file:
        for raw_line in stats_file:
            fields = raw_line.strip().split(',')

            # The absolute start time is naive local time; shift it to UTC
            # and make it timezone-aware, then derive the end from duration.
            local_start = datetime.datetime.fromisoformat(fields[13])
            utc_start = local_start - time_zone.utcoffset(local_start)
            utc_start = utc_start.replace(tzinfo=datetime.timezone.utc)
            utc_end = utc_start + datetime.timedelta(seconds=float(fields[14]))

            features.append([
                fields[0].strip('"'),  # Address A
                int(fields[1]),        # Port A
                fields[2].strip('"'),  # Address B
                int(fields[3]),        # Port B
                int(fields[4]),        # Packets
                int(fields[5]),        # Bytes
                int(fields[6]),        # Stream ID
                int(fields[9]),        # Packets A -> B
                int(fields[10]),       # Bytes A -> B
                int(fields[11]),       # Packets B -> A
                int(fields[12]),       # Bytes B -> A
                utc_start,             # UTC Start
                utc_end,               # UTC End
            ])
    return features
|
|||
|
|
|||
|
|
|||
|
def create_feature_vectors(pcap_file, outlist_features, stats_features, time_zone):
    """
    Creates feature vectors based on a pcap file and parsed outlist/statistics.
    Done during parsing as storing entire packets was space/time inefficient.

    params:
        pcap_file - the file to read
        outlist_features - features from the DARPA out list
                           (rows as produced by parse_tcp_outlist)
        stats_features - stats from Wireshark TCP conversation stats
                         (rows as produced by parse_tcp_stats)
        time_zone - tzinfo of the capture's local time zone

    return:
        a feature vector for each identifiable packet in the following format
        - Source IP
        - Source Port
        - Destination IP
        - Destination Port
        - IP packet len
        - HTTP (boolean)
        - Telnet (boolean)
        - TCP - U
        - TCP - A
        - TCP - P
        - TCP - R
        - TCP - S
        - TCP - F
        - IP TTL
        - TCP Window
        - TCP Urgptr
        - UTC Timestamp
        - Total packets in convo
        - Packets SRC to DST in convo
        - Packets DST to SRC in convo
        - Bytes SRC to DST in convo
        - Bytes DST to SRC in convo
        - Class label
    """

    features = []
    unidentified_packets = 0  # packets skipped for having no unique match

    # NOTE(review): scapy_wrap presumably re-exports scapy's PcapReader and
    # the TCP layer class — confirm against the scapy_wrap module.
    for packet in scapy_wrap.PcapReader(pcap_file):

        # Only TCP traffic is featurized.
        if scapy_wrap.TCP not in packet:
            continue

        # The pcap timestamp is interpreted as naive local time; shift it to
        # aware UTC so it is comparable with the outlist/stats datetimes.
        pkt_time = datetime.datetime.fromtimestamp(float(packet.time))
        pkt_time = pkt_time - time_zone.utcoffset(pkt_time)
        pkt_time = pkt_time.replace(tzinfo=datetime.timezone.utc)

        # Indices 5-12 are placeholders that get filled in just below.
        p_features = [
            packet["IP"].src,      # 0: source IP
            packet["TCP"].sport,   # 1: source port
            packet["IP"].dst,      # 2: destination IP
            packet["TCP"].dport,   # 3: destination port
            packet["IP"].len,      # 4: IP packet length
            0,                     # 5: Is http?
            0,                     # 6: Is telnet?
            0,                     # 7: TCP - U
            0,                     # 8: TCP - A
            0,                     # 9: TCP - P
            0,                     # 10: TCP - R
            0,                     # 11: TCP - S
            0,                     # 12: TCP - F
            packet["IP"].ttl,      # 13: IP TTL
            packet["TCP"].window,  # 14: TCP window
            packet["TCP"].urgptr,  # 15: TCP urgent pointer
            pkt_time               # 16: UTC timestamp
        ]

        p_features[5] = determineHttp(packet)  # set HTTP header presence
        if not p_features[5]:
            p_features[6] = determineTelnet(packet)  # only look for TELNET if no HTTP

        # Expand the scapy flag string (e.g. "PA") into six binary features.
        p_features[7:13] = formatFlags(str(packet["TCP"].flags))

        # --- Match against the Wireshark conversation stats ----------------
        # First assume stats "Address A" is this packet's source; if nothing
        # matches, retry with A/B swapped and remember the inversion so the
        # directional packet/byte counters can be reordered below.
        s_invert_a_b = False
        stats_f_a = lambda conv: (
            conv[0] == p_features[0] and  # Address A == src IP
            conv[1] == p_features[1] and  # Port A == src port
            conv[2] == p_features[2] and  # Address B == dst IP
            conv[3] == p_features[3] and  # Port B == dst port
            # packet timestamp within the conversation window (+/- tolerance)
            (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
        )

        stats_matches = list(filter(stats_f_a, stats_features))

        if len(stats_matches) == 0:
            s_invert_a_b = True
            stats_f_b = lambda conv: (
                conv[0] == p_features[2] and  # Address A == dst IP
                conv[1] == p_features[3] and  # Port A == dst port
                conv[2] == p_features[0] and  # Address B == src IP
                conv[3] == p_features[1] and  # Port B == src port
                (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
            )

            stats_matches = list(filter(stats_f_b, stats_features))

        # Require exactly one conversation match; otherwise skip the packet.
        if len(stats_matches) > 1:
            print('more than one stats match identified')
            unidentified_packets += 1
            continue
        elif len(stats_matches) == 0:
            print('zero stat matches identified')
            unidentified_packets += 1
            continue

        stats_match = stats_matches[0]

        if not s_invert_a_b:  # A is the SOURCE and B is the DESTINATION
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[7],   # Packets SRC to DST in convo
                stats_match[9],   # Packets DST to SRC in convo
                stats_match[8],   # Bytes SRC to DST in convo
                stats_match[10]   # Bytes DST to SRC in convo
            ]
        else:  # A is the DESTINATION and B is the SOURCE
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[9],   # Packets SRC to DST in convo
                stats_match[7],   # Packets DST to SRC in convo
                stats_match[10],  # Bytes SRC to DST in convo
                stats_match[8]    # Bytes DST to SRC in convo
            ]

        # --- Match against the out.list conversations ----------------------
        # Same two-pass direction handling as above.  The inversion flag is
        # set but never read here, because only the class label (index 8) is
        # taken from the match and that is direction-independent.
        o_invert_s_d = False
        out_f_s_d = lambda conv: (  # outlist-pcap
            conv[4] == p_features[0] and  # outlist src IP == packet src IP
            conv[5] == p_features[1] and  # outlist src port == packet src port
            conv[6] == p_features[2] and  # outlist dst IP == packet dst IP
            conv[7] == p_features[3] and  # outlist dst port == packet dst port
            # timestamp within the PADDED conversation window (indices 0/1)
            (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
        )

        out_matches = list(filter(out_f_s_d, outlist_features))

        if len(out_matches) == 0:
            out_f_d_s = lambda conv: (  # outlist-pcap, reversed direction
                conv[6] == p_features[0] and  # outlist dst IP == packet src IP
                conv[7] == p_features[1] and  # outlist dst port == packet src port
                conv[4] == p_features[2] and  # outlist src IP == packet dst IP
                conv[5] == p_features[3] and  # outlist src port == packet dst port
                (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
            )

            out_matches = list(filter(out_f_d_s, outlist_features))

        # Require exactly one out.list match; otherwise skip the packet.
        if len(out_matches) > 1:
            print('more than one out match identified')
            unidentified_packets += 1
            continue
        elif len(out_matches) == 0:
            print('zero out matches identified')
            unidentified_packets += 1
            continue

        out_class = out_matches[0][8]  # class label from the out.list row

        feat_vec = p_features + s_features + [out_class]
        features.append(feat_vec)

    print("unidentified/skipped packets:", unidentified_packets)
    return features
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Time the whole parse so long runs can be tracked.
    t_start = time.time()

    # Parse the two reference sources, then featurize the pcap against them.
    outlist_rows = parse_tcp_outlist("./tcpdump.list", TIME_ZONE, TIME_PAD)
    stats_rows = parse_tcp_stats("./test.tcp.csv", TIME_ZONE)
    packet_vectors = create_feature_vectors("./tcpdump", outlist_rows, stats_rows, TIME_ZONE)

    t_end = time.time()

    # Persist the feature vectors and report elapsed seconds.
    pd.DataFrame(packet_vectors).to_csv("out.csv")
    print(t_end - t_start)
|