Merge pull request #49 from jonathan-m-phillips/ashe-specimin-scripts
Ashe specimin scripts

Showing 5 changed files with 383 additions and 0 deletions.
.gitignore
@@ -16,3 +16,7 @@ checker-framework*
jdk*
.vscode/
.idea
*config.properties
README.md (new file, 10 lines)

# Specimin Statistics and Exception Ranking

### specimin_statistics.py
Parses the ASHE log files and generates statistics about Specimin's minimization process.

### specimin_exception_rank.py
Parses the ASHE log files and ranks the exceptions that occurred during the minimization process.

### run_ashe_for_stats.py
Clones, builds, and runs ASHE, then runs the specimin_statistics.py and specimin_exception_rank.py scripts on the resulting log. A usage sketch follows.
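
A minimal sketch of driving the pipeline programmatically rather than through the CLI shown in the docstrings below; the paths are hypothetical placeholders, and the parameter names come from run()'s signature:

    # hypothetical paths; run() clones ASHE, runs it in dryrun mode, then runs
    # both analysis scripts on logs/app.log
    import run_ashe_for_stats

    run_ashe_for_stats.run(
        ashe_path="/abs/path/ashe",           # where the ASHE repository is cloned
        csv_path="/abs/path/repos.csv",       # CSV of repositories ASHE iterates over
        clone_path="/abs/path/clones",        # where the CSV repositories are cloned
        props_file_path="/abs/path/config",   # directory with config.properties files
    )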
run_ashe_for_stats.py (new file, 107 lines)
""" | ||
Script to run Ashe.RepositoryAutomationEngine and Specimin scripts to analyze the log file generated by ASHE in dryrun mode. | ||
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints | ||
Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips | ||
Date: April 13, 2024 | ||
Usage: | ||
python3 run_ashe_for_stats.py <path_to_clone_ashe> <path_to_csv> <path_to_clone_csv_repositories> <path_to_config.properties> | ||
""" | ||
import subprocess | ||
import sys | ||
import threading | ||
import datetime | ||
import time | ||
import os | ||
|
||
|
||
def run(ashe_path: str, csv_path: str, clone_path: str, props_file_path: str): | ||
""" | ||
Run ASHE and Specimin scripts to analyze the log file. | ||
Args: | ||
ashe_path: absolute path to clone the ASHE repository | ||
csv_path: absolute path to the CSV file containing the repositories ASHE will iterate over | ||
clone_path: absolute path to clone the repositories in the CSV file ASHE will iterate over | ||
props_file_path: absolute path to the directory containing the config.properties files for ASHE | ||
""" | ||
|
||
ashe_url: str = "https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints" | ||
# clone or update repository | ||
__git_clone_or_update(ashe_url, ashe_path) | ||
|
||
start_time: datetime = datetime.datetime.now() | ||
status_thread: threading.Thread = threading.Thread(target=__print_ashe_runtime, args=(start_time,)) | ||
status_thread.daemon = True | ||
status_thread.start() | ||
__build_and_run_ashe(csv_path, clone_path, props_file_path, working_dir=ashe_path) | ||
|
||
current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
stats_script = os.path.join(current_dir, 'specimin_statistics.py') | ||
rank_script = os.path.join(current_dir, 'specimin_exception_rank.py') | ||
|
||
# run Specimin scripts | ||
log_path: str = os.path.join(ashe_path, "logs", "app.log") | ||
print("Running statistics script...") | ||
__run_command(f"python3 {stats_script} {log_path}") | ||
|
||
print("Running exception rank script...") | ||
__run_command(f"python3 {rank_script} {log_path}") | ||
|
||
|
||
def __run_command(command, working_dir=None): | ||
try: | ||
result = subprocess.run(command, cwd=working_dir, shell=True, check=True, stdout=subprocess.PIPE, | ||
stderr=subprocess.PIPE) | ||
print(result.stdout.decode()) | ||
except subprocess.CalledProcessError as e: | ||
print("Error executing command:", e.stderr.decode()) | ||
|
||
|
||
def __git_clone_or_update(repo_url, ashe_path): | ||
"""Clone or update the git repository.""" | ||
if not os.path.exists(ashe_path): | ||
print("Cloning the repository...") | ||
__run_command(f"git clone {repo_url} {ashe_path}") | ||
else: | ||
print("Repository exists. Checking if it's a Git repository...") | ||
if not os.path.exists(os.path.join(ashe_path, '.git')): | ||
print(f"The directory {ashe_path} is not a Git repository.") | ||
__run_command(f"git clone {repo_url} {ashe_path}") | ||
else: | ||
print("Updating the repository...") | ||
os.chdir(ashe_path) | ||
__run_command("git pull") | ||
|
||
|
||
def __build_and_run_ashe(csv_path: str, clone_path: str, props_file_path: str, working_dir: str): | ||
"""Build and run the ASHE project using gradle.""" | ||
# build ASHE | ||
build_command: str = './gradlew build' | ||
model_type: str = "dryrun" | ||
run_automation_command: str = f"./gradlew runRepositoryAutomation -PrepositoriesCsvPath=\"{csv_path}\" -PcloneDirectory=\"{clone_path}\" -Pllm=\"{model_type}\" -PpropsFilePath=\"{props_file_path}\"" | ||
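    # each -P<name>=<value> flag is passed to the Gradle build as a project property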

    print("Building ASHE...")
    __run_command(build_command, working_dir=working_dir)

    print("Running ASHE...")
    __run_command(run_automation_command, working_dir=working_dir)


def __print_ashe_runtime(start_time):
    """Print the elapsed time since ASHE started, every five minutes."""
    print("ASHE started.")
    print("ASHE runtime: 00:00:00")
    while True:
        time.sleep(300)  # sleep for 5 minutes
        elapsed_time = datetime.datetime.now() - start_time
        # format elapsed time into H:M:S
        formatted_time = str(elapsed_time).split('.')[0]  # remove microseconds
        print(f"ASHE runtime: {formatted_time}")


if __name__ == "__main__":
    if len(sys.argv) < 5:  # script name plus four required paths
        print("Usage: python3 run_ashe_for_stats.py <path_to_clone_ashe> <path_to_csv> <path_to_clone_csv_repositories> <path_to_config.properties>")
        sys.exit(1)
    run(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
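
A small aside (not part of the PR) on the formatting trick in __print_ashe_runtime: str() of a timedelta renders as H:MM:SS plus optional microseconds, so splitting on '.' keeps only the H:MM:SS part:

    import datetime

    # str(timedelta) renders as H:MM:SS.microseconds when microseconds are nonzero
    elapsed = datetime.timedelta(hours=1, minutes=2, seconds=3, microseconds=456)
    print(str(elapsed))                # 1:02:03.000456
    print(str(elapsed).split('.')[0])  # 1:02:03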
specimin_exception_rank.py (new file, 125 lines)
""" | ||
Script for analyzing log files generated by ASHE in dryrun mode. | ||
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints | ||
Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips | ||
Date: April 13, 2024 | ||
Description: | ||
This script reads a log file and ranks the exceptions by how frequently they occur. If the exceptions | ||
occur more often, they are ranked higher. These exception rankings come from running the | ||
Ashe.RepositoryAutomationEngine in dryrun mode. | ||
Output: | ||
Rankings written to a txt file in the same directory as the provided log file. | ||
Usage: | ||
python3 specimin_exception_rank.py <path_to_log_file.log> | ||
""" | ||
|
||
import sys | ||
import os | ||
import re | ||
from collections import defaultdict | ||
|
||
|
||
def analyze_log(file_path: str): | ||
directory = os.path.dirname(file_path) | ||
output_file_path = os.path.join(directory, 'specimin_exception_rank.txt') | ||
|
||
with open(file_path, 'r') as file: | ||
content = file.readlines() | ||
|
||
exceptions = __extract_exceptions(content) | ||
ranked_exceptions = __rank_exceptions(exceptions) | ||
|
||
__write_ranked_exceptions(ranked_exceptions, output_file_path) | ||
print("Write successful") | ||
|
||
|
||
def __extract_exceptions(log_lines): | ||
""" | ||
Extract exceptions from the log lines. An exception is defined as a line that starts with "Exception in thread" | ||
Args: | ||
log_lines: A list of log lines | ||
Returns: A list of tuples (name, message, example_line) | ||
""" | ||
# Enhanced to capture an example line following the exception message | ||
exception_pattern = re.compile(r'^Exception in thread ".*?" (\w+.*?):(.*?)(?=\n\S|\Z)', re.DOTALL) | ||
context_pattern = re.compile(r'^\s+at (.+)$', re.MULTILINE) | ||
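    # e.g., a hypothetical log excerpt:
    #   Exception in thread "main" java.lang.NullPointerException: value was null
    #       at com.example.Foo.bar(Foo.java:42)
    # yields name "java.lang.NullPointerException", message "value was null",
    # and example_line "com.example.Foo.bar(Foo.java:42)"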
    exceptions = []
    for i, line in enumerate(log_lines):
        match = exception_pattern.search(line)
        if match:
            exception_name, message = match.groups()
            # the next line, if it starts with whitespace followed by "at", provides the context
            context_match = context_pattern.search(log_lines[i + 1] if i + 1 < len(log_lines) else "")
            example_line = context_match.group(1).strip() if context_match else "No code context available"
            exceptions.append((exception_name.strip(), message.strip(), example_line))
    return exceptions


def __rank_exceptions(exceptions):
    """
    Rank the exceptions by how frequently they occur: exceptions that occur more often are ranked higher.

    Args:
        exceptions: A list of tuples (name, message, example_line)

    Returns: A sorted list of tuples (count, examples, name, message)
    """
    grouped_exceptions = defaultdict(list)
    for name, message, example in exceptions:
        simplified_message = simplify_message(message)
        grouped_exceptions[(name, simplified_message)].append(example)

    # convert grouped data into a list of tuples (count, examples, name, message), most frequent first
    sorted_exceptions = sorted(((len(v), v, k[0], k[1]) for k, v in grouped_exceptions.items()), reverse=True,
                               key=lambda x: x[0])
    return sorted_exceptions


def simplify_message(message):
    """
    Simplify the exception message by removing patterns that are not helpful for distinguishing exceptions.

    Args:
        message: The exception message for Specimin developers to analyze

    Returns: A simplified version of the message
    """
    message = re.sub(r'\bat [\w\.$<>]+\(.*?\)', '', message)
    message = re.sub(r'\bLine \d+\b', '', message)
    message = re.sub(r'\bmemory address 0x[\da-f]+\b', '', message, flags=re.I)
    return message.strip()


def __write_ranked_exceptions(ranked_exceptions, output_file_path):
    current_rank = 1
    last_count = None
    rank_increment = 0  # how many ranks to jump after a run of ties
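    # competition ranking: tied counts share a rank and the next distinct count
    # skips ahead, e.g. counts 5, 5, 3 receive ranks 1, 1, 3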

    with open(output_file_path, 'w') as output_file:
        for count, examples, name, message in ranked_exceptions:
            if last_count != count:
                current_rank += rank_increment
                rank_increment = 1  # reset for the next potential tie group
            else:
                rank_increment += 1  # account for the tie when the next different count comes

            last_count = count
            output_line = f"""
            Rank: {current_rank},
            Count: {count},
            Exception: {name},
            Message: {message},
            Example: {examples[0]}
            """
            output_file.write(output_line)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python3 specimin_exception_rank.py <path_to_log_file.log>")
        sys.exit(1)
    analyze_log(sys.argv[1])
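
A minimal usage sketch for running the ranking on an existing ASHE log without the full pipeline; the path is a hypothetical placeholder, and it assumes the script's directory is on the import path:

    import specimin_exception_rank

    # writes specimin_exception_rank.txt next to the log file
    specimin_exception_rank.analyze_log("/abs/path/ashe/logs/app.log")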
specimin_statistics.py (new file, 137 lines)
""" | ||
Script for analyzing log files generated by ASHE in dryrun mode. | ||
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints | ||
Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips | ||
Date: April 13, 2024 | ||
Description: | ||
This script reads a log file and computes attempted, successful, and failed Specimin minimization | ||
and compilation statistics. These statistics come from running the Ashe.RepositoryAutomationEngine | ||
in dryrun mode. | ||
Output: | ||
Summary written to a txt file in the same directory as the provided log file. | ||
Usage: | ||
python3 specimin_statistics.py <path_to_log_file.log> | ||
""" | ||
|
||
import sys | ||
import os | ||
import re | ||
|
||
|
||
def analyze_log(file_path: str): | ||
directory: str = os.path.dirname(file_path) | ||
output_file_path: str = os.path.join(directory, 'specimin_statistics.txt') | ||
|
||
with open(output_file_path, 'w') as output_file: | ||
with open(file_path, 'r') as file: | ||
lines: list[str] = file.readlines() | ||
|
||
repo_stats: dict[str, int] = { | ||
'minimization_attempts': 0, | ||
'successful_minimization': 0, | ||
'failed_minimization': 0, | ||
'compilation_attempts': 0, | ||
'successful_compilation': 0, | ||
'failed_compilation': 0, | ||
'full_success': 0 | ||
} | ||
repo_path: str = "" | ||
branch_name: str = "" | ||
|
||
for line in lines: | ||
line: str = line.strip() | ||
|
||
# get the repository path and branch name from the log line | ||
if "Processing repository at:" in line: | ||
# if Ashe Repository Automation Engine finished processing a repository | ||
# and moved on to the next repository, print and reset the statistics | ||
if repo_path: | ||
__print_and_write_stats(repo_stats, repo_path, branch_name, output_file) | ||
repo_stats = repo_stats.fromkeys(repo_stats, 0) | ||
|
||
repo_path, branch_name = __extract_repo_and_branch(line) | ||
|
||
__update_stats(line, repo_stats) | ||
|
||
if "Completed processing repository at:" in line: | ||
__print_and_write_stats(repo_stats, repo_path, branch_name, output_file) | ||
repo_stats = repo_stats.fromkeys(repo_stats, 0) # reset statistics for new repo | ||
print("Write successful") | ||
|
||
|
||
def __update_stats(line, repo_stats): | ||
if "Minimizing source file..." in line: | ||
repo_stats['minimization_attempts'] += 1 | ||
if "BUILD SUCCESSFUL" in line: | ||
repo_stats['successful_minimization'] += 1 | ||
if "BUILD FAILED" in line: | ||
repo_stats['failed_minimization'] += 1 | ||
if "Compiling Java files" in line: | ||
repo_stats['compilation_attempts'] += 1 | ||
if "Minimized files compiled successfully." in line: | ||
repo_stats['successful_compilation'] += 1 | ||
repo_stats['full_success'] += 1 | ||
if "Minimized files failed to compile." in line: | ||
repo_stats['failed_compilation'] += 1 | ||
|
||
|
||
def __print_and_write_stats(stats, repo_path, branch_name, output_file): | ||
successful_min_percent = (stats['successful_minimization'] / stats['minimization_attempts'] * 100) if stats[ | ||
'minimization_attempts'] else 0 | ||
failed_min_percent = (stats['failed_minimization'] / stats['minimization_attempts'] * 100) if stats[ | ||
'minimization_attempts'] else 0 | ||
successful_comp_percent = (stats['successful_compilation'] / stats['compilation_attempts'] * 100) if stats[ | ||
'compilation_attempts'] else 0 | ||
failed_comp_percent = (stats['failed_compilation'] / stats['compilation_attempts'] * 100) if stats[ | ||
'compilation_attempts'] else 0 | ||
full_success_percent = (stats['full_success'] / stats['minimization_attempts'] * 100) if stats[ | ||
'minimization_attempts'] else 0 | ||
|
||
output_content = f""" | ||
Running Specimin on repository: {repo_path} for branch: {branch_name} | ||
Attempted minimization - {stats['minimization_attempts']}: | ||
Successfully minimized {stats['successful_minimization']} ({successful_min_percent:.2f}%) target methods. | ||
Failed to minimize {stats['failed_minimization']} ({failed_min_percent:.2f}%) target methods. | ||
Attempted compilation - {stats['compilation_attempts']}: | ||
Successful: {stats['successful_compilation']} ({successful_comp_percent:.2f}%) | ||
Failed: {stats['failed_compilation']} ({failed_comp_percent:.2f}%) | ||
Fully successful from minimization to compilation: {stats['full_success']} ({full_success_percent:.2f}%) | ||
""" | ||
output_file.write(output_content) | ||
|
||
|
||
def __extract_repo_and_branch(log_line: str): | ||
""" | ||
Extracts the repository path and branch name from a log line. | ||
Parameters: | ||
- log_line (str): A string from the log file containing repository and branch information. | ||
Returns: | ||
- tuple: A tuple containing the repository path and the branch name. | ||
""" | ||
# regex pattern to find the repository path and branch name | ||
pattern = r"Processing repository at: (.+?) for branch: (.+)" | ||
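    # e.g., a hypothetical log line "Processing repository at: /clones/acme-lib for branch: main"
    # yields ("/clones/acme-lib", "main")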
    match = re.search(pattern, log_line)

    if match:
        repo_path = match.group(1).strip()
        branch_name = match.group(2).strip()
        return repo_path, branch_name
    else:
        return "", ""


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python3 specimin_statistics.py <path_to_log_file.log>")
        sys.exit(1)
    log_file_path = sys.argv[1]
    analyze_log(log_file_path)
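
To make the statistics concrete, here is a small self-contained sketch (not part of the PR). The log content is hypothetical but uses the exact marker strings the script matches, and it assumes specimin_statistics.py is importable:

    import os
    import tempfile

    import specimin_statistics

    # one repository that minimizes and compiles cleanly
    log_lines = """Processing repository at: /clones/acme-lib for branch: main
    Minimizing source file...
    BUILD SUCCESSFUL
    Compiling Java files
    Minimized files compiled successfully.
    Completed processing repository at: /clones/acme-lib
    """

    with tempfile.TemporaryDirectory() as tmp:
        log_path = os.path.join(tmp, "app.log")
        with open(log_path, "w") as f:
            f.write(log_lines)
        specimin_statistics.analyze_log(log_path)
        # the summary reports one attempt with 100.00% minimization and compilation success
        with open(os.path.join(tmp, "specimin_statistics.txt")) as f:
            print(f.read())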