diff --git a/src/navv/bll.py b/src/navv/bll.py index 37854be..7245ce8 100644 --- a/src/navv/bll.py +++ b/src/navv/bll.py @@ -1,12 +1,15 @@ +import json import os -from ipaddress import IPv4Address, IPv6Address import pandas as pd from navv.zeek import perform_zeekcut -from navv.utilities import get_mac_vendor +from navv.utilities import get_mac_vendor, timeit from navv.validators import is_ipv4_address, is_ipv6_address +MAC_VENDORS_JSON_FILE = os.path.abspath(__file__ + "/../" + "data/mac-vendors.json") + + def get_zeek_data(zeek_logs): """Return a list of Zeek conn.log data.""" return ( @@ -27,16 +30,31 @@ def get_zeek_data(zeek_logs): ) -def get_zeek_df(zeek_data: list): - """Return a pandas dataframe of the conn.log data.""" +def get_zeek_df(zeek_data: list, dns_data: dict): + """Return a pandas dataframe of the conn.log data with its dns data.""" zeek_data = [row.split("\t") for row in zeek_data] + # Insert dns data to zeek data + for row in zeek_data: + row.insert(1, dns_data.get(row[0], "")) + row.insert(3, dns_data.get(row[2], "")) return pd.DataFrame( zeek_data, - columns=["src_ip", "dst_ip", "port", "proto", "conn", "src_mac", "dst_mac"], + columns=[ + "src_ip", + "src_hostname", + "dst_ip", + "dst_hostname", + "port", + "proto", + "conn", + "src_mac", + "dst_mac", + ], ) +@timeit def get_inventory_report_df(zeek_df: pd.DataFrame): """Return a pandas dataframe of the inventory report data.""" zeek_df["port_and_proto"] = zeek_df["port"] + "/" + zeek_df["proto"] @@ -56,12 +74,30 @@ def get_inventory_report_df(zeek_df: pd.DataFrame): ) src_df = zeek_df[ - ["src_mac", "src_ipv4", "src_ipv6", "dst_ipv4", "dst_ipv6", "port_and_proto"] + [ + "src_mac", + "src_ipv4", + "src_hostname", + "src_ipv6", + "dst_ipv4", + "dst_hostname", + "dst_ipv6", + "port_and_proto", + ] ].reset_index(drop=True) src_df["mac"] = src_df["src_mac"] dst_df = zeek_df[ - ["dst_mac", "src_ipv4", "src_ipv6", "dst_ipv4", "dst_ipv6", "port_and_proto"] + [ + "dst_mac", + "src_ipv4", + "src_hostname", + "src_ipv6", + "dst_ipv4", + "dst_hostname", + "dst_ipv6", + "port_and_proto", + ] ].reset_index(drop=True) dst_df["mac"] = dst_df["dst_mac"] @@ -70,33 +106,60 @@ def get_inventory_report_df(zeek_df: pd.DataFrame): .reset_index(drop=True) .drop(columns=["src_mac", "dst_mac"]) .drop_duplicates( - subset=["src_ipv4", "src_ipv6", "dst_ipv4", "dst_ipv6", "port_and_proto"] + subset=[ + "src_ipv4", + "src_hostname", + "src_ipv6", + "dst_ipv4", + "dst_hostname", + "dst_ipv6", + "port_and_proto", + ] ) ) - df["vendor"] = df["mac"].apply(lambda mac: get_mac_vendor(mac)) grouped_df = ( df.groupby("mac", as_index=False) .agg( { "src_ipv4": list, + "src_hostname": list, "src_ipv6": list, "dst_ipv4": list, + "dst_hostname": list, "dst_ipv6": list, "port_and_proto": list, } ) .reset_index() ) - grouped_df["vendor"] = grouped_df["mac"].apply(lambda mac: get_mac_vendor(mac)) + + mac_vendors = {} + with open(MAC_VENDORS_JSON_FILE) as f: + mac_vendors = json.load(f) + grouped_df["vendor"] = grouped_df["mac"].apply( + lambda mac: get_mac_vendor(mac_vendors, mac) + ) grouped_df["ipv4"] = (grouped_df["src_ipv4"] + grouped_df["dst_ipv4"]).apply( lambda ip: list(set(ip)) ) grouped_df["ipv6"] = (grouped_df["src_ipv6"] + grouped_df["dst_ipv6"]).apply( lambda ip: list(set(ip)) ) + grouped_df["hostname"] = ( + grouped_df["src_hostname"] + grouped_df["dst_hostname"] + ).apply(lambda hostname: list(set(hostname))) + grouped_df.drop( - columns=["src_ipv4", "src_ipv6", "dst_ipv4", "dst_ipv6"], inplace=True + columns=[ + "src_ipv4", + "src_hostname", + "src_ipv6", + "dst_ipv4", + "dst_hostname", + "dst_ipv6", + ], + inplace=True, ) return grouped_df diff --git a/src/navv/commands.py b/src/navv/commands.py index 4ab00ff..754d5f8 100644 --- a/src/navv/commands.py +++ b/src/navv/commands.py @@ -1,5 +1,4 @@ """CLI Commands.""" -import json import os import webbrowser @@ -22,12 +21,11 @@ write_conn_states_sheet, write_externals_sheet, write_inventory_report_sheet, - write_macs_sheet, write_stats_sheet, write_unknown_internals_sheet, ) -from navv.zeek import run_zeek, perform_zeekcut -from navv.utilities import pushd, trim_dns_data +from navv.zeek import get_dns_data, run_zeek, perform_zeekcut +from navv.utilities import pushd @click.command("generate") @@ -73,26 +71,21 @@ def generate(customer_name, output_dir, pcap, zeek_logs): # Get zeek data zeek_data = get_zeek_data(zeek_logs) - zeek_df = get_zeek_df(zeek_data) + + # Get dns data for resolution + json_path = os.path.join(output_dir, f"{customer_name}_dns_data.json") + + # Get dns data from zeek logs + dns_filtered = get_dns_data(customer_name, output_dir, zeek_logs) + + # Get zeek dataframe + zeek_df = get_zeek_df(zeek_data, dns_filtered) # Get inventory report dataframe inventory_df = get_inventory_report_df(zeek_df) # Turn zeekcut data into rows for spreadsheet - rows, mac_dict = create_analysis_array(zeek_data, timer=timer_data) - - # Get dns data for resolution - json_path = os.path.join(output_dir, f"{customer_name}_dns_data.json") - - if os.path.exists(json_path): - with open(json_path, "rb") as json_file: - dns_filtered = json.load(json_file) - else: - dns_data = perform_zeekcut( - fields=["query", "answers", "qtype", "rcode_name"], - log_file=os.path.join(zeek_logs, "dns.log"), - ) - dns_filtered = trim_dns_data(dns_data) + rows = create_analysis_array(zeek_data, timer=timer_data) ext_IPs = set() unk_int_IPs = set() @@ -112,8 +105,6 @@ def generate(customer_name, output_dir, pcap, zeek_logs): write_inventory_report_sheet(inventory_df, wb) - write_macs_sheet(mac_dict, wb) - write_externals_sheet(ext_IPs, wb) write_unknown_internals_sheet(unk_int_IPs, wb) diff --git a/src/navv/network_analysis.py b/src/navv/network_analysis.py index cbeaae8..336e549 100644 --- a/src/navv/network_analysis.py +++ b/src/navv/network_analysis.py @@ -11,7 +11,6 @@ # package imports from navv.commands import generate, launch from navv.message_handler import info_msg -from navv import utilities from navv._version import __version__ @@ -31,7 +30,6 @@ def cli(ctx): pass -@utilities.timeit def main(): """Main function for performing zeek-cut commands and sorting the output""" diff --git a/src/navv/spreadsheet_tools.py b/src/navv/spreadsheet_tools.py index f4a272b..5914441 100644 --- a/src/navv/spreadsheet_tools.py +++ b/src/navv/spreadsheet_tools.py @@ -19,8 +19,8 @@ from tqdm import tqdm from navv import data_types -from navv import utilities -from navv.message_handler import info_msg +from navv.utilities import timeit +from navv.message_handler import warning_msg DATA_PKL_FILE = pkg_resources.resource_filename(__name__, "data/data.pkl") @@ -83,7 +83,7 @@ def get_workbook(file_name): return wb -@utilities.timeit +@timeit def get_inventory_data(ws, **kwargs): inventory = dict() for row in itertools.islice(ws.iter_rows(), 1, None): @@ -97,7 +97,7 @@ def get_inventory_data(ws, **kwargs): return inventory -@utilities.timeit +@timeit def get_segments_data(ws): segments = list() for row in itertools.islice(ws.iter_rows(), 1, None): @@ -125,10 +125,9 @@ def get_package_data(): return services, conn_states -@utilities.timeit +@timeit def create_analysis_array(sort_input, **kwargs): arr = [] - mac_dict = {} # sort by count and source IP counted = sorted( list( @@ -150,18 +149,11 @@ def create_analysis_array(sort_input, **kwargs): conn=cells[5], ) ) - if cells[6] is not None and cells[6] != "": - if netaddr.IPAddress(cells[1]).is_private(): - if cells[6] not in mac_dict: - mac_dict[cells[6]] = [cells[1]] - else: - if cells[1] not in mac_dict[cells[6]]: - mac_dict[cells[6]].append(cells[1]) - return arr, mac_dict + return arr -@utilities.timeit +@timeit def perform_analysis( wb, rows, @@ -190,7 +182,7 @@ def perform_analysis( "Notes", ] ) - info_msg("Performing analysis(including lookups). This may take a while:") + warning_msg("this may take awhile...") for row_index, row in enumerate(tqdm(rows), start=2): row.src_desc = handle_ip( row.src_ip, dns_data, inventory, segments, ext_IPs, unk_int_IPs @@ -333,54 +325,67 @@ def handle_ip(ip_to_check, dns_data, inventory, segments, ext_IPs, unk_int_IPs): def write_conn_states_sheet(conn_states, wb): - new_ws = make_sheet(wb, "Conn_States", idx=8) + new_ws = make_sheet(wb, "Conn States", idx=8) new_ws.append(["State", "Description"]) - for row_num, conn_state in enumerate(conn_states, start=2): - state_cell = new_ws[f"A{row_num}"] - desc_cell = new_ws[f"B{row_num}"] + for index, conn_state in enumerate(conn_states, start=2): + # State column + state_cell = new_ws[f"A{index}"] state_cell.value = conn_state state_cell.fill = conn_states[conn_state][0] state_cell.font = conn_states[conn_state][1] + + # Description column + desc_cell = new_ws[f"B{index}"] + desc_cell.alignment = openpyxl.styles.Alignment(wrap_text=True) desc_cell.value = conn_states[conn_state][2] desc_cell.fill = conn_states[conn_state][0] desc_cell.font = conn_states[conn_state][1] - auto_adjust_width(new_ws) + auto_adjust_width(new_ws, 100) def write_inventory_report_sheet(inventory_df, wb): """Get Mac Addresses with their associated IP addresses and manufacturer.""" ir_sheet = make_sheet(wb, "Inventory Report", idx=4) - ir_sheet.append(["MAC", "Vendor", "IPv4", "IPv6", "Port and Proto"]) + ir_sheet.append(["MAC", "Vendor", "Hostname", "IPv4", "IPv6", "Port and Proto"]) inventory_data = inventory_df.to_dict(orient="records") for index, row in enumerate(inventory_data, start=2): - # Mac Address column + # Mac column ir_sheet[f"A{index}"].value = row["mac"] # Vendor column ir_sheet[f"B{index}"].value = row["vendor"] + # Hostname column + hostname_column = ir_sheet[f"C{index}"] + hostname_column.alignment = openpyxl.styles.Alignment(wrap_text=True) + + hostname = "" + if row["hostname"]: + hostname = ", ".join(each for each in row["hostname"] if each) + hostname_column.value = hostname + # IPv4 Address column - ipv4_list_cell = ir_sheet[f"C{index}"] - ipv4_list_cell.alignment = openpyxl.styles.Alignment(wrap_text=True) + ipv4_column = ir_sheet[f"D{index}"] + ipv4_column.alignment = openpyxl.styles.Alignment(wrap_text=True) ipv4 = "" if row["ipv4"]: ipv4 = ", ".join(each for each in row["ipv4"] if each) - ipv4_list_cell.value = ipv4 + ipv4_column.value = ipv4 # IPv6 Address column - ipv6_list_cell = ir_sheet[f"D{index}"] - ipv6_list_cell.alignment = openpyxl.styles.Alignment(wrap_text=True) + ipv6_column = ir_sheet[f"E{index}"] + ipv6_column.alignment = openpyxl.styles.Alignment(wrap_text=True) ipv6 = "" if row["ipv6"]: ipv6 = ", ".join(each for each in row["ipv6"] if each) - ipv6_list_cell.value = ipv6 + ipv6_column.value = ipv6 # Port and Protocol column - pnp_sheet = ir_sheet[f"E{index}"] - pnp_sheet.alignment = openpyxl.styles.Alignment(wrap_text=True) + pnp_column = ir_sheet[f"F{index}"] + pnp_column.alignment = openpyxl.styles.Alignment(wrap_text=True) port_and_proto = "" if row["port_and_proto"]: @@ -388,48 +393,14 @@ def write_inventory_report_sheet(inventory_df, wb): list(set(each for each in row["port_and_proto"] if each))[:10] ) - pnp_sheet.value = port_and_proto + pnp_column.value = port_and_proto # Add styling to every other row if index % 2 == 0: for cell in ir_sheet[f"{index}:{index}"]: cell.fill = openpyxl.styles.PatternFill("solid", fgColor="AAAAAA") - auto_adjust_width(ir_sheet) - ir_sheet.column_dimensions["C"].width = 39 * 1.2 - - -def write_macs_sheet(mac_dict, wb): - """Fill spreadsheet with MAC address -> IP address translation with manufacturer information""" - macs_sheet = make_sheet(wb, "MACs", idx=4) - macs_sheet.append(["MAC", "Manufacturer", "IPs"]) - for row_index, mac in enumerate(mac_dict, start=2): - macs_sheet[f"A{row_index}"].value = mac - try: - eui = netaddr.EUI(mac) - oui = eui.oui - orgs = [oui.registration(i).org for i in range(oui.reg_count)] - except netaddr.core.NotRegisteredError: - orgs = ["Not a registered manufacturer"] - except netaddr.core.AddrFormatError: - orgs = [f"Bad MAC address {mac}"] - except Exception: - orgs = ["Unspecified MAC error"] - macs_sheet[f"B{row_index}"].value = "\n".join(orgs) - ip_list_cell = macs_sheet[f"C{row_index}"] - ip_list_cell.alignment = openpyxl.styles.Alignment(wrap_text=True) - num_ips = len(mac_dict[mac]) - if num_ips > 10: - display_list = mac_dict[mac][:10] - display_list.append(f"Displaying 10 IPs of {num_ips}") - ip_list_cell.value = "\n".join(display_list) - else: - ip_list_cell.value = "\n".join(mac_dict[mac][:10]) - macs_sheet.row_dimensions[row_index].height = min(num_ips, 11) * 15 - if row_index % 2 == 0: - for cell in macs_sheet[f"{row_index}:{row_index}"]: - cell.fill = openpyxl.styles.PatternFill("solid", fgColor="AAAAAA") - auto_adjust_width(macs_sheet) - macs_sheet.column_dimensions["C"].width = 39 * 1.2 + auto_adjust_width(ir_sheet, 40) + # ir_sheet.column_dimensions.width = 39 * 1.2 def write_externals_sheet(IPs, wb): @@ -444,7 +415,7 @@ def write_externals_sheet(IPs, wb): def write_unknown_internals_sheet(IPs, wb): - int_sheet = make_sheet(wb, "Unkown_Internals", idx=6) + int_sheet = make_sheet(wb, "Unknown Internals", idx=6) int_sheet.append(["Unknown Internal IP"]) for row_index, IP in enumerate(sorted(IPs), start=2): cell = int_sheet[f"A{row_index}"] @@ -473,11 +444,10 @@ def make_sheet(wb, sheet_name, idx=None): return wb.create_sheet(sheet_name, index=idx) -def auto_adjust_width(sheet): - factor = 1.7 +def auto_adjust_width(sheet, width=40): + """Adjust the width of the columns to fit the data""" for col in sheet.columns: - vals = (len("{}".format(c.value)) for c in col if c.value is not None) - max_width = max(vals) * factor + max_width = max(len(f"{c.value}") for c in col if c.value) + 2 sheet.column_dimensions[col[0].column_letter].width = ( - max_width if max_width < 20 else max_width * 1.2 / 1.7 + width if width < max_width else max_width ) diff --git a/src/navv/utilities.py b/src/navv/utilities.py index 8787b13..99b3e6c 100644 --- a/src/navv/utilities.py +++ b/src/navv/utilities.py @@ -1,21 +1,17 @@ #!/usr/bin/env python3 # Copyright 2023 Battelle Energy Alliance, LLC - import os import contextlib -import json -import time +from functools import wraps +from time import monotonic from tqdm import tqdm -from navv.message_handler import info_msg, error_msg +from navv.message_handler import info_msg, success_msg, error_msg from navv.validators import is_mac_address -MAC_VENDORS_JSON_FILE = os.path.abspath(__file__ + "/../" + "data/mac-vendors.json") - - @contextlib.contextmanager def pushd(new_dir): previous_dir = os.getcwd() @@ -28,20 +24,18 @@ def pushd(new_dir): os.chdir(previous_dir) -def timeit(method): - def timed(*args, **kw): - ts = time.time() - result = method(*args, **kw) - td = time.time() - ts - time_elapsed = f"{method.__name__}:\n\tHours: {int(int(td / 3600) % 24)}\n\tMinutes: {int(int(td / 60) % 60)}\n\tSeconds: {int(td % 60)}" - if "timer" in kw: - kw["timer"][ - method.__name__ - ] = f"{int(td/86400)} day(s) {int(td%86400/3600)} hour(s) {int(td%3600/60)} minutes {int(td%60)} seconds" - info_msg(time_elapsed) - return result +def timeit(func): + @wraps(func) + def _timeit(*args, **kwargs): + start = monotonic() + try: + info_msg(f"running {func.__name__}") + return func(*args, **kwargs) + finally: + end = monotonic() + success_msg(f"{func.__name__} execution time:\n{end - start:0.2f} seconds") - return timed + return _timeit def trim_dns_data(data): @@ -58,7 +52,7 @@ def trim_dns_data(data): return ret_data -def get_mac_vendor(mac_address: str) -> str: +def get_mac_vendor(mac_vendors: dict, mac_address: str) -> str: """Return the vendor of the MAC address.""" mac_address = mac_address.upper() @@ -66,9 +60,6 @@ def get_mac_vendor(mac_address: str) -> str: error_msg(f"Invalid MAC address: {mac_address}") return f"Bad MAC address {mac_address}" - with open(MAC_VENDORS_JSON_FILE) as f: - mac_vendors = json.load(f) - try: vendor = [ vendor["vendorName"] diff --git a/src/navv/zeek.py b/src/navv/zeek.py index 36d5a3e..877dc55 100644 --- a/src/navv/zeek.py +++ b/src/navv/zeek.py @@ -1,7 +1,9 @@ +import json +import os from subprocess import Popen, PIPE, STDOUT, check_call from navv.message_handler import error_msg -from navv.utilities import pushd, timeit +from navv.utilities import pushd, timeit, trim_dns_data @timeit @@ -14,6 +16,21 @@ def run_zeek(pcap_path, zeek_logs_path, **kwargs): error_msg(e) +@timeit +def get_dns_data(customer_name, output_dir, zeek_logs): + """Get DNS data from zeek logs or from a json file if it exists""" + json_path = os.path.join(output_dir, f"{customer_name}_dns_data.json") + if os.path.exists(json_path): + with open(json_path, "rb") as json_file: + return json.load(json_file) + + dns_data = perform_zeekcut( + fields=["query", "answers", "qtype", "rcode_name"], + log_file=os.path.join(zeek_logs, "dns.log"), + ) + return trim_dns_data(dns_data) + + def perform_zeekcut(fields, log_file): """Perform the call to zeek-cut with the identified fields on the specified log file""" try: