#!/usr/bin/env python3
# memesite/run.py
try:
    import zoneinfo  # Python 3.9+ standard library
except ImportError:
    from backports import zoneinfo  # fallback for older interpreters
import time
import datetime
import scapy_wrap
import pandas as pd
TIME_ZONE = zoneinfo.ZoneInfo('US/Eastern')  # timezone of the captures
TIME_PAD = datetime.timedelta(minutes=15)  # padding around conversation start/end times
FLOAT_TOLERANCE = datetime.timedelta(seconds=0.01)  # slack for timestamp comparisons

def determineHttp(packet):
    """
    Determine whether a packet is likely HTTP traffic (port-80 heuristic).

    Parameters
    ----------
    packet : a scapy PcapReader packet object

    Returns
    -------
    0 or 1
    """
    if (packet["TCP"].sport == 80) or (packet["TCP"].dport == 80):
        return 1
    return 0

def determineTelnet(packet):
    """
    Determine whether a packet is likely TELNET traffic (port-23 heuristic).

    Parameters
    ----------
    packet : a scapy PcapReader packet object

    Returns
    -------
    0 or 1
    """
    if (packet["TCP"].sport == 23) or (packet["TCP"].dport == 23):
        return 1
    return 0
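
# Note: determineHttp and determineTelnet are pure port-number heuristics
# (80 and 23 respectively); they do not inspect payloads, so HTTP on a
# non-standard port (or non-HTTP traffic on port 80) will be misclassified.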

def formatFlags(flag):
    """
    Process the flags for a given packet.

    Parameters
    ----------
    flag : string representing TCP flags (ex. "PA")

    Returns
    -------
    list of binary features indicating presence of the six TCP flags
    (URG, ACK, PSH, RST, SYN, FIN)
    """
    flags = [0, 0, 0, 0, 0, 0]
    if "U" in flag:
        flags[0] = 1
    if "A" in flag:
        flags[1] = 1
    if "P" in flag:
        flags[2] = 1
    if "R" in flag:
        flags[3] = 1
    if "S" in flag:
        flags[4] = 1
    if "F" in flag:
        flags[5] = 1
    return flags
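
# Hedged example: formatFlags("PA") should yield [0, 1, 1, 0, 0, 0]
# (ACK and PSH set). str(packet["TCP"].flags) in scapy renders flags as
# strings such as "S", "SA", or "PA", which is the format expected here.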

def clean_ipv4(ipv4_string):
    """
    Removes leading zeros from an IPv4 address.

    params:
        ipv4_string - the IPv4 address (str) to remove leading zeros from
    return:
        a string representation of the IPv4 address with leading zeros
        removed from each octet
    """
    return ".".join(octet.lstrip("0") or "0" for octet in ipv4_string.split("."))
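
# Hedged example: clean_ipv4("172.016.112.050") -> "172.16.112.50";
# an all-zero octet such as "000" is kept as "0" rather than emptied.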

def parse_tcp_outlist(tcp_out_file, time_zone, time_pad):
    """
    Parses the tcpdump.list file for a dataset.

    params:
        tcp_out_file - the file to parse
        time_zone - a tzinfo object of the capture's time zone
        time_pad - a timedelta object to pad conversation starts/ends with
    return:
        a list of rows, each with the following features:
        - padded_start
        - padded_end
        - start (unpadded)
        - end (unpadded)
        - src_ip
        - src_port
        - dst_ip
        - dst_port
        - class
    """
    features = []
    with open(tcp_out_file, 'r') as infile:
        for line in infile:
            try:
                line = line.strip().split()
                # Skip lines that don't fit our use case
                # (e.g. '-' placeholders for port or address)
                if line[5] == '-' or line[7] == '-':
                    continue
                # Convert times to datetime objects in UTC
                start_time = datetime.datetime.strptime(f"{line[1]} {line[2]}", "%m/%d/%Y %H:%M:%S")
                start_time_utc = start_time - time_zone.utcoffset(start_time)
                start_time_utc = start_time_utc.replace(tzinfo=datetime.timezone.utc)
                time_delta = datetime.datetime.strptime(line[3], "%H:%M:%S")
                time_delta = datetime.timedelta(
                    hours=time_delta.hour,
                    minutes=time_delta.minute,
                    seconds=time_delta.second
                )
                end_time_utc = start_time_utc + time_delta
                line_features = [
                    start_time_utc - time_pad,  # 0
                    end_time_utc + time_pad,    # 1
                    start_time_utc,             # 2
                    end_time_utc,               # 3
                    clean_ipv4(line[7]),        # src ip    # 4
                    int(line[5]),               # src port  # 5
                    clean_ipv4(line[8]),        # dst ip    # 6
                    int(line[6]),               # dst port  # 7
                    line[10]                    # class     # 8
                ]
                if line_features[8] == "-":
                    line_features[8] = ""
                features.append(line_features)
            except (IndexError, ValueError):
                # Skip malformed or incomplete lines
                continue
    return features
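
# Assumed layout of a tcpdump.list line (whitespace-delimited), inferred
# from the indices consumed above -- treat the unparsed column names as a
# hedge, not a specification:
#   id  date  start_time  duration  service  src_port  dst_port  src_ip  dst_ip  score  class
#   [0] [1]   [2]         [3]       [4]      [5]       [6]       [7]     [8]     [9]    [10]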

def parse_tcp_stats(tcp_conv_file, time_zone):
    """
    Parses a TCP conversation stats file. The timezone of the capture
    (tzinfo) is required in order to generate proper timestamps.

    params:
        tcp_conv_file - a CSV of TCP conversation stats (
            copied directly from the Wireshark GUI as CSV;
            ensure that "Absolute Start Time" and
            "Save Data as Raw" are checked
        )
        time_zone - a tzinfo object of the capture's time zone
    returns:
        a feature vector of conversation stats in the following format:
        - Address A
        - Port A
        - Address B
        - Port B
        - Packets
        - Bytes
        - Stream ID
        - Packets A → B
        - Bytes A → B
        - Packets B → A
        - Bytes B → A
        - UTC Start
        - UTC End
    """
    features = []
    with open(tcp_conv_file, 'r') as infile:
        for line in infile:
            line = line.strip().split(',')
            start_time = datetime.datetime.fromisoformat(line[13])
            start_time_utc = start_time - time_zone.utcoffset(start_time)
            start_time_utc = start_time_utc.replace(tzinfo=datetime.timezone.utc)
            duration = datetime.timedelta(seconds=float(line[14]))
            end_time_utc = start_time_utc + duration
            feature_vector = [
                line[0].strip('"'),  # Address A
                int(line[1]),        # Port A
                line[2].strip('"'),  # Address B
                int(line[3]),        # Port B
                int(line[4]),        # Packets
                int(line[5]),        # Bytes
                int(line[6]),        # Stream ID
                int(line[9]),        # Packets A → B
                int(line[10]),       # Bytes A → B
                int(line[11]),       # Packets B → A
                int(line[12]),       # Bytes B → A
                start_time_utc,      # UTC Start
                end_time_utc,        # UTC End
            ]
            features.append(feature_vector)
    return features
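
# Column indices assumed for the Wireshark conversation-stats CSV (a hedge
# inferred from the parsing above): 0-3 endpoints, 4-5 totals, 6 stream ID,
# 9-12 directional packet/byte counts, 13 absolute start, 14 duration in
# seconds; columns 7-8 are skipped. The naive comma-split also assumes no
# field contains an embedded comma and that any header row was stripped.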

def create_feature_vectors(pcap_file, outlist_features, stats_features, time_zone):
    """
    Creates feature vectors based on a pcap file and parsed outlist/statistics.
    Done during parsing, as storing entire packets was space/time inefficient.

    params:
        pcap_file - the file to read
        outlist_features - features from the DARPA out list
        stats_features - stats from Wireshark TCP conversation stats
        time_zone - a tzinfo object of the capture's time zone
    return:
        a feature vector for each identifiable packet in the following format:
        - Source IP
        - Source Port
        - Destination IP
        - Destination Port
        - IP packet len
        - HTTP (boolean)
        - Telnet (boolean)
        - TCP - U
        - TCP - A
        - TCP - P
        - TCP - R
        - TCP - S
        - TCP - F
        - IP TTL
        - TCP Window
        - TCP Urgptr
        - UTC Timestamp
        - Total packets in convo
        - Packets SRC to DST in convo
        - Packets DST to SRC in convo
        - Bytes SRC to DST in convo
        - Bytes DST to SRC in convo
        - Class label
    """
    features = []
    unidentified_packets = 0
    for packet in scapy_wrap.PcapReader(pcap_file):
        if scapy_wrap.TCP not in packet:
            continue
        # packet.time is an epoch timestamp; build an aware datetime in the
        # capture's zone, then normalize to UTC (independent of the machine's
        # local timezone)
        pkt_time = datetime.datetime.fromtimestamp(float(packet.time), tz=time_zone)
        pkt_time = pkt_time.astimezone(datetime.timezone.utc)
        p_features = [
            packet["IP"].src,
            packet["TCP"].sport,
            packet["IP"].dst,
            packet["TCP"].dport,
            packet["IP"].len,
            0,  # Is http?
            0,  # Is telnet?
            0,  # TCP - U
            0,  # TCP - A
            0,  # TCP - P
            0,  # TCP - R
            0,  # TCP - S
            0,  # TCP - F
            packet["IP"].ttl,
            packet["TCP"].window,
            packet["TCP"].urgptr,
            pkt_time
        ]
        p_features[5] = determineHttp(packet)  # set HTTP header presence
        if not p_features[5]:
            p_features[6] = determineTelnet(packet)  # only look for TELNET if no HTTP
        p_features[7:13] = formatFlags(str(packet["TCP"].flags))
        # Try to match this packet to a conversation in the Wireshark stats.
        # The first predicate assumes A is the source and B is the destination.
        s_invert_a_b = False
        stats_f_a = lambda conv: (
            conv[0] == p_features[0] and  # Address A == src IP
            conv[1] == p_features[1] and  # Port A == src port
            conv[2] == p_features[2] and  # Address B == dst IP
            conv[3] == p_features[3] and  # Port B == dst port
            (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
        )
        stats_matches = list(filter(stats_f_a, stats_features))
        if len(stats_matches) == 0:
            # Retry with the direction reversed (A = destination, B = source)
            s_invert_a_b = True
            stats_f_b = lambda conv: (
                conv[0] == p_features[2] and  # Address A == dst IP
                conv[1] == p_features[3] and  # Port A == dst port
                conv[2] == p_features[0] and  # Address B == src IP
                conv[3] == p_features[1] and  # Port B == src port
                (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
            )
            stats_matches = list(filter(stats_f_b, stats_features))
        if len(stats_matches) > 1:
            print('more than one stats match identified')
            unidentified_packets += 1
            continue
        elif len(stats_matches) == 0:
            print('zero stats matches identified')
            unidentified_packets += 1
            continue
        stats_match = stats_matches[0]
        if not s_invert_a_b:  # A is the SOURCE and B is the DESTINATION
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[7],   # Packets SRC to DST in convo
                stats_match[9],   # Packets DST to SRC in convo
                stats_match[8],   # Bytes SRC to DST in convo
                stats_match[10]   # Bytes DST to SRC in convo
            ]
        else:  # A is the DESTINATION and B is the SOURCE
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[9],   # Packets SRC to DST in convo
                stats_match[7],   # Packets DST to SRC in convo
                stats_match[10],  # Bytes SRC to DST in convo
                stats_match[8]    # Bytes DST to SRC in convo
            ]
        # Try to match this packet to a conversation in the out.list file;
        # only the class label is taken, so beyond finding a match the
        # direction does not matter here.
        out_f_s_d = lambda conv: (  # forward direction (packet src == list src)
            conv[4] == p_features[0] and  # src IP
            conv[5] == p_features[1] and  # src port
            conv[6] == p_features[2] and  # dst IP
            conv[7] == p_features[3] and  # dst port
            (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and  # padded window
            p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
        )
        out_matches = list(filter(out_f_s_d, outlist_features))
        if len(out_matches) == 0:
            out_f_d_s = lambda conv: (  # reversed direction (packet src == list dst)
                conv[6] == p_features[0] and  # dst IP == packet src IP
                conv[7] == p_features[1] and  # dst port == packet src port
                conv[4] == p_features[2] and  # src IP == packet dst IP
                conv[5] == p_features[3] and  # src port == packet dst port
                (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
            )
            out_matches = list(filter(out_f_d_s, outlist_features))
        if len(out_matches) > 1:
            print('more than one out match identified')
            unidentified_packets += 1
            continue
        elif len(out_matches) == 0:
            print('zero out matches identified')
            unidentified_packets += 1
            continue
        out_class = out_matches[0][8]
        feat_vec = p_features + s_features + [out_class]
        features.append(feat_vec)
    print("unidentified/skipped packets:", unidentified_packets)
    return features
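
# Matching recap: each TCP packet is first attributed to a Wireshark
# conversation (trying both directions) for the volume features, then to
# an out.list entry (again both directions) for its class label; packets
# with zero or multiple candidate matches are dropped and counted.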

if __name__ == "__main__":
    ts = time.time()
    outlist_features = parse_tcp_outlist("./tcpdump.list", TIME_ZONE, TIME_PAD)
    stats_features = parse_tcp_stats("./test.tcp.csv", TIME_ZONE)
    tcp_packets = create_feature_vectors("./tcpdump", outlist_features, stats_features, TIME_ZONE)
    te = time.time()
    df = pd.DataFrame(tcp_packets)
    df.to_csv("out.csv")
    print(f"elapsed: {te - ts:.2f} s")