"""
Script to run Ashe.RepositoryAutomationEngine and Specimin scripts to analyze the log file generated by ASHE in dryrun mode.
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints

Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
Date: April 13, 2024

Usage:
python3 run_ashe_for_stats.py <ashe_path> <csv_path> <clone_path> <props_file_path>
"""
import subprocess
import sys
import threading
import datetime
import time
import os


def run(ashe_path: str, csv_path: str, clone_path: str, props_file_path: str):
    """
    Run ASHE and Specimin scripts to analyze the log file.

    Clones (or updates) ASHE, builds and runs it in dryrun mode, then runs
    specimin_statistics.py and specimin_exception_rank.py (expected next to this
    script) on <ashe_path>/logs/app.log.

    Args:
        ashe_path: absolute path to clone the ASHE repository
        csv_path: absolute path to the CSV file containing the repositories ASHE will iterate over
        clone_path: absolute path to clone the repositories in the CSV file ASHE will iterate over
        props_file_path: absolute path to the directory containing the config.properties files for ASHE
    """
    ashe_url: str = "https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints"
    # clone or update repository
    __git_clone_or_update(ashe_url, ashe_path)

    # Daemon thread: it loops forever, so it must not keep the process alive on exit.
    start_time: datetime.datetime = datetime.datetime.now()
    status_thread: threading.Thread = threading.Thread(
        target=__print_ashe_runtime, args=(start_time,), daemon=True
    )
    status_thread.start()
    __build_and_run_ashe(csv_path, clone_path, props_file_path, working_dir=ashe_path)

    current_dir = os.path.dirname(os.path.abspath(__file__))
    stats_script = os.path.join(current_dir, 'specimin_statistics.py')
    rank_script = os.path.join(current_dir, 'specimin_exception_rank.py')

    # run Specimin scripts; argument lists keep paths containing spaces intact
    log_path: str = os.path.join(ashe_path, "logs", "app.log")
    print("Running statistics script...")
    __run_command(["python3", stats_script, log_path])

    print("Running exception rank script...")
    __run_command(["python3", rank_script, log_path])


def __run_command(command, working_dir=None) -> bool:
    """
    Run a command, print its stdout on success and its stderr on failure.

    Args:
        command: either a shell string (executed through the shell, as before) or a
            list of arguments (executed directly, without shell interpretation)
        working_dir: directory to run the command in; None keeps the current one

    Returns: True if the command exited with status 0, False otherwise. Failures are
        reported on stdout and never raised, so callers keep going (best effort).
    """
    try:
        result = subprocess.run(command, cwd=working_dir, shell=isinstance(command, str), check=True,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error executing command:", e.stderr.decode(errors="replace"))
        return False
    except OSError as e:
        # e.g. the executable or the working directory does not exist
        print("Error executing command:", e)
        return False
    print(result.stdout.decode(errors="replace"))
    return True


def __git_clone_or_update(repo_url, ashe_path):
    """
    Clone or update the git repository.

    Args:
        repo_url: URL of the repository to clone
        ashe_path: local directory the repository lives in (or should be cloned to)
    """
    if not os.path.exists(ashe_path):
        print("Cloning the repository...")
        __run_command(["git", "clone", repo_url, ashe_path])
        return

    print("Repository exists. Checking if it's a Git repository...")
    if not os.path.exists(os.path.join(ashe_path, '.git')):
        print(f"The directory {ashe_path} is not a Git repository.")
        __run_command(["git", "clone", repo_url, ashe_path])
        return

    print("Updating the repository...")
    # Run the pull inside the repository without os.chdir(): changing the cwd of the
    # whole process would silently re-base every relative path used afterwards.
    __run_command(["git", "pull"], working_dir=ashe_path)


def __build_and_run_ashe(csv_path: str, clone_path: str, props_file_path: str, working_dir: str):
    """
    Build and run the ASHE project using gradle.

    Args:
        csv_path: path passed to gradle as -PrepositoriesCsvPath
        clone_path: path passed to gradle as -PcloneDirectory
        props_file_path: path passed to gradle as -PpropsFilePath
        working_dir: ASHE checkout containing the ./gradlew wrapper
    """
    model_type: str = "dryrun"
    build_command = ["./gradlew", "build"]
    run_automation_command = [
        "./gradlew",
        "runRepositoryAutomation",
        f"-PrepositoriesCsvPath={csv_path}",
        f"-PcloneDirectory={clone_path}",
        f"-Pllm={model_type}",
        f"-PpropsFilePath={props_file_path}",
    ]

    print("Building ASHE...")
    __run_command(build_command, working_dir=working_dir)

    print("Running ASHE...")
    __run_command(run_automation_command, working_dir=working_dir)


def __print_ashe_runtime(start_time, interval_seconds=300):
    """
    Print the elapsed time since ASHE started, forever.

    Intended to run in a daemon thread; it never returns.

    Args:
        start_time: datetime.datetime at which ASHE was started
        interval_seconds: pause between two reports (default: 5 minutes)
    """
    print("ASHE started.")
    print("ASHE runtime: 00:00:00")
    while True:
        time.sleep(interval_seconds)
        elapsed_time = datetime.datetime.now() - start_time
        # format elapsed time into H:M:S
        formatted_time = str(elapsed_time).split('.')[0]  # remove microseconds
        print(f"ASHE runtime: {formatted_time}")


def main(argv=None) -> int:
    """
    Command-line entry point.

    Args:
        argv: full argument vector including the program name; defaults to sys.argv

    Returns: process exit status (0 on success, 1 on a usage error)
    """
    args = sys.argv if argv is None else argv
    # Four positional arguments are required, i.e. five entries including the script name.
    if len(args) < 5:
        print("Usage: python3 run_ashe_for_stats.py <ashe_path> <csv_path> <clone_path> <props_file_path>")
        return 1
    run(args[1], args[2], args[3], args[4])
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""
Script for analyzing log files generated by ASHE in dryrun mode.
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints

Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
Date: April 13, 2024

Description:
This script reads a log file and ranks the exceptions by how frequently they occur. If the exceptions
occur more often, they are ranked higher. These exception rankings come from running the
Ashe.RepositoryAutomationEngine in dryrun mode.

Output:
Rankings written to a txt file in the same directory as the provided log file.

Usage:
python3 specimin_exception_rank.py <path_to_log_file>
"""

import sys
import os
import re
from collections import defaultdict

# Header of an uncaught exception: `Exception in thread "<thread>" <name>[: <message>]`.
# The message part is optional so that exceptions thrown without a detail message
# (header has no colon) are still counted.
_EXCEPTION_PATTERN = re.compile(r'^Exception in thread ".*?" (\w+.*?)(?::(.*))?$')
# First stack frame: an indented line of the form "    at <frame>".
_CONTEXT_PATTERN = re.compile(r'^\s+at (.+)$', re.MULTILINE)

# Fragments that vary between otherwise identical messages.
_STACK_FRAME_NOISE = re.compile(r'\bat [\w\.$<>]+\(.*?\)')
_LINE_NUMBER_NOISE = re.compile(r'\bLine \d+\b')
_ADDRESS_NOISE = re.compile(r'\bmemory address 0x[\da-f]+\b', re.I)


def analyze_log(file_path: str):
    """
    Rank the exceptions found in a log file and write the ranking next to it.

    The result is written to 'specimin_exception_rank.txt' in the directory of the log file.

    Args:
        file_path: path to the ASHE log file
    """
    directory = os.path.dirname(file_path)
    output_file_path = os.path.join(directory, 'specimin_exception_rank.txt')

    # Undecodable bytes are replaced instead of aborting the analysis.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.readlines()

    exceptions = __extract_exceptions(content)
    ranked_exceptions = __rank_exceptions(exceptions)

    __write_ranked_exceptions(ranked_exceptions, output_file_path)
    print("Write successful")


def __extract_exceptions(log_lines):
    """
    Extract exceptions from the log lines. An exception is defined as a line that starts with "Exception in thread"
    Args:
        log_lines: A list of log lines

    Returns: A list of tuples (name, message, example_line). The message is "" when the header
        carries none; example_line is the stack frame on the line directly after the header,
        or "No code context available" if that line is not a stack frame.
    """
    exceptions = []
    for i, line in enumerate(log_lines):
        match = _EXCEPTION_PATTERN.match(line)
        if not match:
            continue
        exception_name, message = match.groups()
        # only the line immediately following the header is inspected for context
        next_line = log_lines[i + 1] if i + 1 < len(log_lines) else ""
        context_match = _CONTEXT_PATTERN.search(next_line)
        example_line = context_match.group(1).strip() if context_match else "No code context available"
        exceptions.append((exception_name.strip(), (message or "").strip(), example_line))
    return exceptions


def __rank_exceptions(exceptions):
    """
    Rank the exceptions by how frequently they occur. If the exceptions occur more often, they are ranked higher.
    Args:
        exceptions: A list of tuples (name, message, example_line)

    Returns: A list of tuples (count, examples, name, message) sorted by descending count;
        entries with equal counts keep their first-seen order.
    """
    grouped_exceptions = defaultdict(list)
    for name, message, example in exceptions:
        grouped_exceptions[(name, simplify_message(message))].append(example)

    # convert grouped data into a sorted list of tuples (count, examples, name, message)
    return sorted(
        ((len(examples), examples, name, message) for (name, message), examples in grouped_exceptions.items()),
        reverse=True,
        key=lambda entry: entry[0],
    )


def simplify_message(message):
    """
    Simplify the exception message by removing certain patterns that are not helpful for distinguishing exceptions.

    Removed: stack-frame references ("at pkg.Class.method(...)"), "Line <n>" and
    "memory address 0x..." fragments. Surrounding whitespace is stripped.

    Args:
        message: The exception message for Specimin developers to analyze

    Returns: A simplified version of the message
    """
    for noise in (_STACK_FRAME_NOISE, _LINE_NUMBER_NOISE, _ADDRESS_NOISE):
        message = noise.sub('', message)
    return message.strip()


def __write_ranked_exceptions(ranked_exceptions, output_file_path):
    """
    Write the ranked exceptions to a text file.

    Entries with the same count share a rank, and the rank after a tie group skips
    ahead by the size of that group (1, 1, 3, ...).

    Args:
        ranked_exceptions: A list of tuples (count, examples, name, message), sorted by descending count
        output_file_path: path of the file to (over)write
    """
    current_rank = 1
    last_count = None

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for position, (count, examples, name, message) in enumerate(ranked_exceptions, start=1):
            if count != last_count:
                # a new group starts: its rank is its 1-based position in the sorted list
                current_rank = position
                last_count = count

            output_line = f"""
Rank: {current_rank},
Count: {count},
Exception: {name},
Message: {message},
Example: {examples[0]}

"""
            output_file.write(output_line)


def main(argv=None) -> int:
    """
    Command-line entry point.

    Args:
        argv: full argument vector including the program name; defaults to sys.argv

    Returns: process exit status (0 on success, 1 on a usage error)
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("Usage: python3 specimin_exception_rank.py <path_to_log_file>")
        return 1
    analyze_log(args[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())
"""
Script for analyzing log files generated by ASHE in dryrun mode.
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints

Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
Date: April 13, 2024

Description:
This script reads a log file and computes attempted, successful, and failed Specimin minimization
and compilation statistics. These statistics come from running the Ashe.RepositoryAutomationEngine
in dryrun mode.

Output:
Summary written to a txt file in the same directory as the provided log file.

Usage:
python3 specimin_statistics.py <path_to_log_file>
"""

import sys
import os
import re

# Counters tracked per repository.
_STAT_KEYS = (
    'minimization_attempts',
    'successful_minimization',
    'failed_minimization',
    'compilation_attempts',
    'successful_compilation',
    'failed_compilation',
    'full_success',
)

# Log fragment -> counters incremented when a line contains that fragment.
_STAT_MARKERS = (
    ("Minimizing source file...", ('minimization_attempts',)),
    ("BUILD SUCCESSFUL", ('successful_minimization',)),
    ("BUILD FAILED", ('failed_minimization',)),
    ("Compiling Java files", ('compilation_attempts',)),
    ("Minimized files compiled successfully.", ('successful_compilation', 'full_success')),
    ("Minimized files failed to compile.", ('failed_compilation',)),
)


def analyze_log(file_path: str):
    """
    Compute per-repository Specimin statistics from a log file.

    The summary is written to 'specimin_statistics.txt' in the directory of the log file.
    One report is written per repository: when its "Completed processing repository at:"
    line is seen, or, if that line is missing, when the next repository starts or the
    log ends.

    Args:
        file_path: path to the ASHE log file
    """
    directory: str = os.path.dirname(file_path)
    output_file_path: str = os.path.join(directory, 'specimin_statistics.txt')

    # Read the input before truncating the output, so an unreadable log does not
    # leave an empty summary behind. Undecodable bytes are replaced, not fatal.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        lines = file.readlines()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        repo_stats = dict.fromkeys(_STAT_KEYS, 0)
        repo_path: str = ""
        branch_name: str = ""
        # True between a "Processing repository at:" line and the report for that repository
        in_progress = False

        for raw_line in lines:
            line = raw_line.strip()

            # get the repository path and branch name from the log line
            if "Processing repository at:" in line:
                # the previous repository never logged its completion: report it now
                if in_progress:
                    __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
                    repo_stats = dict.fromkeys(_STAT_KEYS, 0)

                repo_path, branch_name = __extract_repo_and_branch(line)
                in_progress = True

            __update_stats(line, repo_stats)

            if "Completed processing repository at:" in line:
                __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
                repo_stats = dict.fromkeys(_STAT_KEYS, 0)  # reset statistics for new repo
                # already reported: the next "Processing" line must not report it again
                in_progress = False

        # the log ended while a repository was still being processed
        if in_progress:
            __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
    print("Write successful")


def __update_stats(line, repo_stats):
    """
    Increment the counters in repo_stats for every known marker contained in line.

    Args:
        line: a single log line
        repo_stats: dict of counters, modified in place
    """
    for marker, keys in _STAT_MARKERS:
        if marker in line:
            for key in keys:
                repo_stats[key] += 1


def _percent(part, whole):
    """Return part as a percentage of whole, or 0 when whole is 0."""
    return part / whole * 100 if whole else 0


def __print_and_write_stats(stats, repo_path, branch_name, output_file):
    """
    Write the statistics report of one repository to output_file.

    Args:
        stats: dict of counters for the repository
        repo_path: repository path shown in the report
        branch_name: branch name shown in the report
        output_file: writable text file object
    """
    min_attempts = stats['minimization_attempts']
    comp_attempts = stats['compilation_attempts']

    successful_min_percent = _percent(stats['successful_minimization'], min_attempts)
    failed_min_percent = _percent(stats['failed_minimization'], min_attempts)
    successful_comp_percent = _percent(stats['successful_compilation'], comp_attempts)
    failed_comp_percent = _percent(stats['failed_compilation'], comp_attempts)
    full_success_percent = _percent(stats['full_success'], min_attempts)

    output_content = f"""
Running Specimin on repository: {repo_path} for branch: {branch_name}
Attempted minimization - {stats['minimization_attempts']}:
Successfully minimized {stats['successful_minimization']} ({successful_min_percent:.2f}%) target methods.
Failed to minimize {stats['failed_minimization']} ({failed_min_percent:.2f}%) target methods.

Attempted compilation - {stats['compilation_attempts']}:
Successful: {stats['successful_compilation']} ({successful_comp_percent:.2f}%)
Failed: {stats['failed_compilation']} ({failed_comp_percent:.2f}%)

Fully successful from minimization to compilation: {stats['full_success']} ({full_success_percent:.2f}%)

"""
    output_file.write(output_content)


def __extract_repo_and_branch(log_line: str):
    """
    Extracts the repository path and branch name from a log line.

    Parameters:
    - log_line (str): A string from the log file containing repository and branch information.

    Returns:
    - tuple: A tuple containing the repository path and the branch name, or ("", "") if the
      line does not have the form "Processing repository at: <path> for branch: <branch>".
    """
    # regex pattern to find the repository path and branch name
    match = re.search(r"Processing repository at: (.+?) for branch: (.+)", log_line)
    if not match:
        return "", ""
    return match.group(1).strip(), match.group(2).strip()


def main(argv=None) -> int:
    """
    Command-line entry point.

    Args:
        argv: full argument vector including the program name; defaults to sys.argv

    Returns: process exit status (0 on success, 1 on a usage error)
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("Usage: python3 specimin_statistics.py <path_to_log_file>")
        return 1
    analyze_log(args[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())