#!/usr/bin/env python3
# memesite/run.py
try:
    import zoneinfo  # Python 3.9+ standard library
except ImportError:
    from backports import zoneinfo  # fallback for older interpreters
import time
import datetime
import scapy_wrap
import pandas as pd
TIME_ZONE = zoneinfo.ZoneInfo('US/Eastern')  # timezone of the captures
TIME_PAD = datetime.timedelta(minutes=15)  # padding around conversation start/end times
FLOAT_TOLERANCE = datetime.timedelta(seconds=0.01)  # slack for timestamp comparisons

def determineHttp(packet):
    """
    Determine whether a packet is likely HTTP traffic (port-80 heuristic).

    Parameters
    ----------
    packet : a scapy PcapReader packet object

    Returns
    -------
    0 or 1
    """
    if (packet["TCP"].sport == 80) or (packet["TCP"].dport == 80):
        return 1
    return 0

def determineTelnet(packet):
    """
    Determine whether a packet is likely TELNET traffic (port-23 heuristic).

    Parameters
    ----------
    packet : a scapy PcapReader packet object

    Returns
    -------
    0 or 1
    """
    if (packet["TCP"].sport == 23) or (packet["TCP"].dport == 23):
        return 1
    return 0
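
# Note: determineHttp and determineTelnet are pure port-number heuristics
# (80 and 23 respectively); they do not inspect payloads, so HTTP on a
# non-standard port (or non-HTTP traffic on port 80) will be misclassified.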

def formatFlags(flag):
    """
    Process the flags for a given packet.

    Parameters
    ----------
    flag : string representing TCP flags (ex. "PA")

    Returns
    -------
    list of binary features indicating presence of the six TCP flags
    (URG, ACK, PSH, RST, SYN, FIN)
    """
    flags = [0, 0, 0, 0, 0, 0]
    if "U" in flag:
        flags[0] = 1
    if "A" in flag:
        flags[1] = 1
    if "P" in flag:
        flags[2] = 1
    if "R" in flag:
        flags[3] = 1
    if "S" in flag:
        flags[4] = 1
    if "F" in flag:
        flags[5] = 1
    return flags
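
# Hedged example: formatFlags("PA") should yield [0, 1, 1, 0, 0, 0]
# (ACK and PSH set). str(packet["TCP"].flags) in scapy renders flags as
# strings such as "S", "SA", or "PA", which is the format expected here.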

def clean_ipv4(ipv4_string):
    """
    Removes leading zeros from an IPv4 address.

    params:
        ipv4_string - the IPv4 address (str) to remove leading zeros from
    return:
        a string representation of the IPv4 address with leading zeros
        removed from each octet
    """
    return ".".join(octet.lstrip("0") or "0" for octet in ipv4_string.split("."))
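
# Hedged example: clean_ipv4("172.016.112.050") -> "172.16.112.50";
# an all-zero octet such as "000" is kept as "0" rather than emptied.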

def parse_tcp_outlist(tcp_out_file, time_zone, time_pad):
    """
    Parses the tcpdump.list file for a dataset.

    params:
        tcp_out_file - the file to parse
        time_zone - a tzinfo object of the capture's time zone
        time_pad - a timedelta object to pad conversation starts/ends with
    return:
        a list of rows, each with the following features:
        - padded_start
        - padded_end
        - start (unpadded)
        - end (unpadded)
        - src_ip
        - src_port
        - dst_ip
        - dst_port
        - class
    """
    features = []
    with open(tcp_out_file, 'r') as infile:
        for line in infile:
            try:
                line = line.strip().split()
                # Skip lines that don't fit our use case
                # (e.g. '-' placeholders for port or address)
                if line[5] == '-' or line[7] == '-':
                    continue
                # Convert times to datetime objects in UTC
                start_time = datetime.datetime.strptime(f"{line[1]} {line[2]}", "%m/%d/%Y %H:%M:%S")
                start_time_utc = start_time - time_zone.utcoffset(start_time)
                start_time_utc = start_time_utc.replace(tzinfo=datetime.timezone.utc)
                time_delta = datetime.datetime.strptime(line[3], "%H:%M:%S")
                time_delta = datetime.timedelta(
                    hours=time_delta.hour,
                    minutes=time_delta.minute,
                    seconds=time_delta.second
                )
                end_time_utc = start_time_utc + time_delta
                line_features = [
                    start_time_utc - time_pad,  # 0
                    end_time_utc + time_pad,    # 1
                    start_time_utc,             # 2
                    end_time_utc,               # 3
                    clean_ipv4(line[7]),        # src ip    # 4
                    int(line[5]),               # src port  # 5
                    clean_ipv4(line[8]),        # dst ip    # 6
                    int(line[6]),               # dst port  # 7
                    line[10]                    # class     # 8
                ]
                if line_features[8] == "-":
                    line_features[8] = ""
                features.append(line_features)
            except (IndexError, ValueError):
                # Skip malformed or incomplete lines
                continue
    return features
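
# Assumed layout of a tcpdump.list line (whitespace-delimited), inferred
# from the indices consumed above -- treat the unparsed column names as a
# hedge, not a specification:
#   id  date  start_time  duration  service  src_port  dst_port  src_ip  dst_ip  score  class
#   [0] [1]   [2]         [3]       [4]      [5]       [6]       [7]     [8]     [9]    [10]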

def parse_tcp_stats(tcp_conv_file, time_zone):
    """
    Parses a TCP conversation stats file. The timezone of the capture
    (tzinfo) is required in order to generate proper timestamps.

    params:
        tcp_conv_file - a CSV of TCP conversation stats (
            copied directly from the Wireshark GUI as CSV;
            ensure that "Absolute Start Time" and
            "Save Data as Raw" are checked
        )
        time_zone - a tzinfo object of the capture's time zone
    returns:
        a feature vector of conversation stats in the following format:
        - Address A
        - Port A
        - Address B
        - Port B
        - Packets
        - Bytes
        - Stream ID
        - Packets A → B
        - Bytes A → B
        - Packets B → A
        - Bytes B → A
        - UTC Start
        - UTC End
    """
    features = []
    with open(tcp_conv_file, 'r') as infile:
        for line in infile:
            line = line.strip().split(',')
            start_time = datetime.datetime.fromisoformat(line[13])
            start_time_utc = start_time - time_zone.utcoffset(start_time)
            start_time_utc = start_time_utc.replace(tzinfo=datetime.timezone.utc)
            duration = datetime.timedelta(seconds=float(line[14]))
            end_time_utc = start_time_utc + duration
            feature_vector = [
                line[0].strip('"'),  # Address A
                int(line[1]),        # Port A
                line[2].strip('"'),  # Address B
                int(line[3]),        # Port B
                int(line[4]),        # Packets
                int(line[5]),        # Bytes
                int(line[6]),        # Stream ID
                int(line[9]),        # Packets A → B
                int(line[10]),       # Bytes A → B
                int(line[11]),       # Packets B → A
                int(line[12]),       # Bytes B → A
                start_time_utc,      # UTC Start
                end_time_utc,        # UTC End
            ]
            features.append(feature_vector)
    return features
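
# Column indices assumed for the Wireshark conversation-stats CSV (a hedge
# inferred from the parsing above): 0-3 endpoints, 4-5 totals, 6 stream ID,
# 9-12 directional packet/byte counts, 13 absolute start, 14 duration in
# seconds; columns 7-8 are skipped. The naive comma-split also assumes no
# field contains an embedded comma and that any header row was stripped.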

def create_feature_vectors(pcap_file, outlist_features, stats_features, time_zone):
    """
    Creates feature vectors based on a pcap file and parsed outlist/statistics.
    Done during parsing, as storing entire packets was space/time inefficient.

    params:
        pcap_file - the file to read
        outlist_features - features from the DARPA out list
        stats_features - stats from Wireshark TCP conversation stats
        time_zone - a tzinfo object of the capture's time zone
    return:
        a feature vector for each identifiable packet in the following format:
        - Source IP
        - Source Port
        - Destination IP
        - Destination Port
        - IP packet len
        - HTTP (boolean)
        - Telnet (boolean)
        - TCP - U
        - TCP - A
        - TCP - P
        - TCP - R
        - TCP - S
        - TCP - F
        - IP TTL
        - TCP Window
        - TCP Urgptr
        - UTC Timestamp
        - Total packets in convo
        - Packets SRC to DST in convo
        - Packets DST to SRC in convo
        - Bytes SRC to DST in convo
        - Bytes DST to SRC in convo
        - Class label
    """
    features = []
    unidentified_packets = 0
    for packet in scapy_wrap.PcapReader(pcap_file):
        if scapy_wrap.TCP not in packet:
            continue
        # packet.time is an epoch timestamp; build an aware datetime in the
        # capture's zone, then normalize to UTC (independent of the machine's
        # local timezone)
        pkt_time = datetime.datetime.fromtimestamp(float(packet.time), tz=time_zone)
        pkt_time = pkt_time.astimezone(datetime.timezone.utc)
        p_features = [
            packet["IP"].src,
            packet["TCP"].sport,
            packet["IP"].dst,
            packet["TCP"].dport,
            packet["IP"].len,
            0,  # Is http?
            0,  # Is telnet?
            0,  # TCP - U
            0,  # TCP - A
            0,  # TCP - P
            0,  # TCP - R
            0,  # TCP - S
            0,  # TCP - F
            packet["IP"].ttl,
            packet["TCP"].window,
            packet["TCP"].urgptr,
            pkt_time
        ]
        p_features[5] = determineHttp(packet)  # set HTTP header presence
        if not p_features[5]:
            p_features[6] = determineTelnet(packet)  # only look for TELNET if no HTTP
        p_features[7:13] = formatFlags(str(packet["TCP"].flags))
        # Try to match this packet to a conversation in the Wireshark stats.
        # The first predicate assumes A is the source and B is the destination.
        s_invert_a_b = False
        stats_f_a = lambda conv: (
            conv[0] == p_features[0] and  # Address A == src IP
            conv[1] == p_features[1] and  # Port A == src port
            conv[2] == p_features[2] and  # Address B == dst IP
            conv[3] == p_features[3] and  # Port B == dst port
            (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
            p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
        )
        stats_matches = list(filter(stats_f_a, stats_features))
        if len(stats_matches) == 0:
            # Retry with the direction reversed (A = destination, B = source)
            s_invert_a_b = True
            stats_f_b = lambda conv: (
                conv[0] == p_features[2] and  # Address A == dst IP
                conv[1] == p_features[3] and  # Port A == dst port
                conv[2] == p_features[0] and  # Address B == src IP
                conv[3] == p_features[1] and  # Port B == src port
                (conv[11] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[12] + FLOAT_TOLERANCE)
            )
            stats_matches = list(filter(stats_f_b, stats_features))
        if len(stats_matches) > 1:
            print('more than one stats match identified')
            unidentified_packets += 1
            continue
        elif len(stats_matches) == 0:
            print('zero stats matches identified')
            unidentified_packets += 1
            continue
        stats_match = stats_matches[0]
        if not s_invert_a_b:  # A is the SOURCE and B is the DESTINATION
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[7],   # Packets SRC to DST in convo
                stats_match[9],   # Packets DST to SRC in convo
                stats_match[8],   # Bytes SRC to DST in convo
                stats_match[10]   # Bytes DST to SRC in convo
            ]
        else:  # A is the DESTINATION and B is the SOURCE
            s_features = [
                stats_match[4],   # Total packets in convo
                stats_match[9],   # Packets SRC to DST in convo
                stats_match[7],   # Packets DST to SRC in convo
                stats_match[10],  # Bytes SRC to DST in convo
                stats_match[8]    # Bytes DST to SRC in convo
            ]
        # Try to match this packet to a conversation in the out.list file;
        # only the class label is taken, so beyond finding a match the
        # direction does not matter here.
        out_f_s_d = lambda conv: (  # forward direction (packet src == list src)
            conv[4] == p_features[0] and  # src IP
            conv[5] == p_features[1] and  # src port
            conv[6] == p_features[2] and  # dst IP
            conv[7] == p_features[3] and  # dst port
            (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and  # padded window
            p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
        )
        out_matches = list(filter(out_f_s_d, outlist_features))
        if len(out_matches) == 0:
            out_f_d_s = lambda conv: (  # reversed direction (packet src == list dst)
                conv[6] == p_features[0] and  # dst IP == packet src IP
                conv[7] == p_features[1] and  # dst port == packet src port
                conv[4] == p_features[2] and  # src IP == packet dst IP
                conv[5] == p_features[3] and  # src port == packet dst port
                (conv[0] - FLOAT_TOLERANCE) <= p_features[16] and
                p_features[16] <= (conv[1] + FLOAT_TOLERANCE)
            )
            out_matches = list(filter(out_f_d_s, outlist_features))
        if len(out_matches) > 1:
            print('more than one out match identified')
            unidentified_packets += 1
            continue
        elif len(out_matches) == 0:
            print('zero out matches identified')
            unidentified_packets += 1
            continue
        out_class = out_matches[0][8]
        feat_vec = p_features + s_features + [out_class]
        features.append(feat_vec)
    print("unidentified/skipped packets:", unidentified_packets)
    return features
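
# Matching recap: each TCP packet is first attributed to a Wireshark
# conversation (trying both directions) for the volume features, then to
# an out.list entry (again both directions) for its class label; packets
# with zero or multiple candidate matches are dropped and counted.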

if __name__ == "__main__":
    ts = time.time()
    outlist_features = parse_tcp_outlist("./tcpdump.list", TIME_ZONE, TIME_PAD)
    stats_features = parse_tcp_stats("./test.tcp.csv", TIME_ZONE)
    tcp_packets = create_feature_vectors("./tcpdump", outlist_features, stats_features, TIME_ZONE)
    te = time.time()
    df = pd.DataFrame(tcp_packets)
    df.to_csv("out.csv")
    print(f"elapsed: {te - ts:.2f} s")