Merge pull request #49 from jonathan-m-phillips/ashe-specimin-scripts

Ashe specimin scripts
tahiat · Apr 16, 2024 · b9072aa · b9072aa
2 parents 8252312 + 4d9446b
commit b9072aa
Show file tree

Hide file tree

Showing 5 changed files with 383 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,7 @@ checker-framework*
 jdk*
 
 .vscode/
+
+.idea
+
+*config.properties
diff --git a/ashe_scripts/README.MD b/ashe_scripts/README.MD
@@ -0,0 +1,10 @@
+# Specimin Statistics and Exception Ranking
+
+### specimin_statistics.py
+The script to parse the ASHE log files and generate statistical data from Specimin's minimization process.
+
+### specimin_exception_rank.py
+The script to parse the ASHE log files and generate a ranking of the exceptions that occurred during the minimization process.
+
+### run_ashe_for_stats.py
+The script that clones ASHE, builds and runs it, and then runs the specimin_statistics.py and specimin_exception_rank.py scripts.
diff --git a/ashe_scripts/run_ashe_for_stats.py b/ashe_scripts/run_ashe_for_stats.py
@@ -0,0 +1,107 @@
+"""
+Script to run Ashe.RepositoryAutomationEngine and Specimin scripts to analyze the log file generated by ASHE in dryrun mode.
+https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints
+
+Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
+Date: April 13, 2024
+
+Usage:
+python3 run_ashe_for_stats.py <path_to_clone_ashe> <path_to_csv> <path_to_clone_csv_repositories> <path_to_config.properties>
+"""
+import subprocess
+import sys
+import threading
+import datetime
+import time
+import os
+
+
+def run(ashe_path: str, csv_path: str, clone_path: str, props_file_path: str):
+    """
+    Run ASHE and Specimin scripts to analyze the log file.
+
+    Clones (or updates) ASHE, builds and runs it while a daemon thread prints the
+    elapsed time, then runs the statistics and exception rank scripts that live
+    next to this file on ASHE's logs/app.log.
+
+    Args:
+        ashe_path: absolute path to clone the ASHE repository
+        csv_path: absolute path to the CSV file containing the repositories ASHE will iterate over
+        clone_path: absolute path to clone the repositories in the CSV file ASHE will iterate over
+        props_file_path: absolute path to the directory containing the config.properties files for ASHE
+    """
+    import shlex  # local import: only needed here to quote paths for the shell
+
+    ashe_url: str = "https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints"
+    # clone or update repository
+    __git_clone_or_update(ashe_url, ashe_path)
+
+    # the status thread is a daemon so it never keeps the process alive on its own
+    start_time: datetime.datetime = datetime.datetime.now()
+    status_thread: threading.Thread = threading.Thread(target=__print_ashe_runtime, args=(start_time,))
+    status_thread.daemon = True
+    status_thread.start()
+    __build_and_run_ashe(csv_path, clone_path, props_file_path, working_dir=ashe_path)
+
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    stats_script = os.path.join(current_dir, 'specimin_statistics.py')
+    rank_script = os.path.join(current_dir, 'specimin_exception_rank.py')
+
+    # run Specimin scripts; the commands go through a shell, so quote every path
+    # to keep paths containing spaces or shell metacharacters intact
+    log_path: str = os.path.join(ashe_path, "logs", "app.log")
+    print("Running statistics script...")
+    __run_command(f"python3 {shlex.quote(stats_script)} {shlex.quote(log_path)}")
+
+    print("Running exception rank script...")
+    __run_command(f"python3 {shlex.quote(rank_script)} {shlex.quote(log_path)}")
+
+
+def __run_command(command, working_dir=None):
+    """
+    Run a shell command, wait for it to finish and print its captured output.
+
+    A failing command is reported (command, exit code, stdout and stderr) but no
+    exception is raised, so the caller carries on with its next step.
+
+    Args:
+        command: the command line, interpreted by the shell
+        working_dir: directory to run the command in; None keeps the current directory
+    """
+    try:
+        result = subprocess.run(command, cwd=working_dir, shell=True, check=True, stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE)
+    except subprocess.CalledProcessError as e:
+        print(f"Command failed with exit code {e.returncode}: {command}")
+        # a tool may write the details of its failure to stdout, so show that stream too
+        if e.stdout:
+            print(e.stdout.decode(errors="replace"))
+        print("Error executing command:", e.stderr.decode(errors="replace"))
+        return
+    # errors="replace" keeps undecodable bytes in the output from raising
+    print(result.stdout.decode(errors="replace"))
+
+
+def __git_clone_or_update(repo_url, ashe_path):
+    """
+    Clone the git repository into ashe_path, or pull when ashe_path is already a clone.
+
+    Args:
+        repo_url: URL of the repository to clone
+        ashe_path: directory that holds (or will hold) the clone
+    """
+    import shlex  # local import: only needed here to quote arguments for the shell
+
+    clone_command = f"git clone {shlex.quote(repo_url)} {shlex.quote(ashe_path)}"
+
+    if not os.path.exists(ashe_path):
+        print("Cloning the repository...")
+        __run_command(clone_command)
+        return
+
+    print("Repository exists. Checking if it's a Git repository...")
+    if not os.path.exists(os.path.join(ashe_path, '.git')):
+        print(f"The directory {ashe_path} is not a Git repository.")
+        __run_command(clone_command)
+    else:
+        print("Updating the repository...")
+        # run the pull inside the clone instead of os.chdir(): changing the process
+        # working directory would break every relative path used afterwards
+        __run_command("git pull", working_dir=ashe_path)
+
+
+def __build_and_run_ashe(csv_path: str, clone_path: str, props_file_path: str, working_dir: str):
+    """
+    Build the ASHE project with gradle, then run its repository automation task.
+
+    Args:
+        csv_path: value passed as -PrepositoriesCsvPath
+        clone_path: value passed as -PcloneDirectory
+        props_file_path: value passed as -PpropsFilePath
+        working_dir: directory in which both gradle commands are executed
+    """
+    model_type: str = "dryrun"
+    gradle_arguments = [
+        "./gradlew",
+        "runRepositoryAutomation",
+        f'-PrepositoriesCsvPath="{csv_path}"',
+        f'-PcloneDirectory="{clone_path}"',
+        f'-Pllm="{model_type}"',
+        f'-PpropsFilePath="{props_file_path}"',
+    ]
+
+    print("Building ASHE...")
+    __run_command('./gradlew build', working_dir=working_dir)
+
+    print("Running ASHE...")
+    __run_command(" ".join(gradle_arguments), working_dir=working_dir)
+
+
+def __print_ashe_runtime(start_time):
+    """
+    Print the time elapsed since start_time every five minutes, forever.
+
+    The loop never returns.
+
+    Args:
+        start_time: the datetime at which ASHE was started
+    """
+    report_interval_seconds = 300  # five minutes between status lines
+    print("ASHE started.")
+    print("ASHE runtime: 00:00:00")
+    while True:
+        time.sleep(report_interval_seconds)
+        elapsed = datetime.datetime.now() - start_time
+        # keep only the part before the first "." so microseconds are not shown
+        without_microseconds = str(elapsed).partition('.')[0]
+        print(f"ASHE runtime: {without_microseconds}")
+
+
+if __name__ == "__main__":
+    # argv[0] is the script name, so the four required arguments need len(sys.argv) >= 5
+    if len(sys.argv) < 5:
+        print("Usage: python3 run_ashe_for_stats.py <path_to_clone_ashe> <path_to_csv> <path_to_clone_csv_repositories> <path_to_config.properties>")
+        sys.exit(1)
+    run(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
diff --git a/ashe_scripts/specimin_exception_rank.py b/ashe_scripts/specimin_exception_rank.py
@@ -0,0 +1,125 @@
+"""
+Script for analyzing log files generated by ASHE in dryrun mode.
+https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints
+
+Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
+Date: April 13, 2024
+
+Description:
+This script reads a log file and ranks the exceptions by how frequently they occur. If the exceptions
+occur more often, they are ranked higher. These exception rankings come from running the
+Ashe.RepositoryAutomationEngine in dryrun mode.
+
+Output:
+Rankings written to a txt file in the same directory as the provided log file.
+
+Usage:
+python3 specimin_exception_rank.py <path_to_log_file.log>
+"""
+
+import sys
+import os
+import re
+from collections import defaultdict
+
+
+def analyze_log(file_path: str):
+    """
+    Rank the exceptions found in a log file and write the ranking next to it.
+
+    Args:
+        file_path: path to the log file; the ranking is written to
+            specimin_exception_rank.txt in the same directory
+    """
+    with open(file_path, 'r') as log_file:
+        log_lines = log_file.readlines()
+
+    ranking = __rank_exceptions(__extract_exceptions(log_lines))
+
+    destination = os.path.join(os.path.dirname(file_path), 'specimin_exception_rank.txt')
+    __write_ranked_exceptions(ranking, destination)
+    print("Write successful")
+
+
+def __extract_exceptions(log_lines):
+    """
+    Extract exceptions from the log lines. An exception is defined as a line that starts with "Exception in thread"
+
+    Exceptions thrown without a message (no ":" after the exception name) are
+    extracted as well, with an empty message.
+
+    Args:
+        log_lines: A list of log lines
+
+    Returns: A list of [name, message, example_line] lists
+    """
+    exception_pattern = re.compile(r'^Exception in thread ".*?" (\w+.*?):(.*?)(?=\n\S|\Z)', re.DOTALL)
+    # fallback for a header without a message, e.g. 'Exception in thread "main" java.lang.StackOverflowError'
+    bare_exception_pattern = re.compile(r'^Exception in thread ".*?" (\w\S*)\s*$')
+    context_pattern = re.compile(r'^\s+at (.+)$', re.MULTILINE)
+    exceptions = []
+    for i, line in enumerate(log_lines):
+        match = exception_pattern.search(line)
+        if match:
+            exception_name, message = match.groups()
+        else:
+            bare_match = bare_exception_pattern.search(line)
+            if not bare_match:
+                continue
+            exception_name, message = bare_match.group(1), ""
+        # only the line directly after the exception is inspected for an "at ..." frame
+        context_match = context_pattern.search(log_lines[i + 1] if i + 1 < len(log_lines) else "")
+        example_line = context_match.group(1).strip() if context_match else "No code context available"
+        exceptions.append([exception_name.strip(), message.strip(), example_line])
+    return exceptions
+
+
+def __rank_exceptions(exceptions):
+    """
+    Group the exceptions by name and simplified message and order the groups by size, largest first.
+    Args:
+        exceptions: A list of (name, message, example_line) entries
+
+    Returns: A sorted list of tuples (count, examples, name, message)
+    """
+    examples_by_key = defaultdict(list)
+    for name, message, example in exceptions:
+        examples_by_key[(name, simplify_message(message))].append(example)
+
+    ranking = [
+        (len(examples), examples, name, message)
+        for (name, message), examples in examples_by_key.items()
+    ]
+    # sort on the count only; the sort is stable, so groups with equal counts
+    # keep the order in which they were first seen
+    ranking.sort(key=lambda entry: entry[0], reverse=True)
+    return ranking
+
+
+def simplify_message(message):
+    """
+    Strip the parts of an exception message that vary between otherwise identical exceptions.
+
+    Removed are "at <location>(...)" fragments, "Line <number>" and
+    "memory address 0x<hex>" (the latter case-insensitively).
+
+    Args:
+        message: The exception message for Specimin developers to analyze
+
+    Returns: A simplified version of the message, without surrounding whitespace
+    """
+    noise_patterns = (
+        (r'\bat [\w\.$<>]+\(.*?\)', 0),
+        (r'\bLine \d+\b', 0),
+        (r'\bmemory address 0x[\da-f]+\b', re.I),
+    )
+    for pattern, flags in noise_patterns:
+        message = re.sub(pattern, '', message, flags=flags)
+    return message.strip()
+
+
+def __write_ranked_exceptions(ranked_exceptions, output_file_path):
+    """
+    Write one entry per ranked exception to the output file.
+
+    Entries with the same count share a rank, and the rank after a tie skips
+    ahead by the size of the tie group (e.g. 1, 2, 2, 4).
+
+    Args:
+        ranked_exceptions: (count, examples, name, message) tuples, highest count first
+        output_file_path: path of the text file to (over)write
+    """
+    current_rank = 1
+    previous_count = None
+    group_size = 0  # number of entries that share the current rank
+
+    with open(output_file_path, 'w') as output_file:
+        for count, examples, name, message in ranked_exceptions:
+            if count == previous_count:
+                group_size += 1  # another entry in the same tie group
+            else:
+                current_rank += group_size  # jump past the whole previous group
+                group_size = 1
+            previous_count = count
+
+            output_file.write(f"""
+Rank: {current_rank},
+Count: {count},
+Exception: {name},
+Message: {message},
+Example: {examples[0]}
+
+""")
+
+
+if __name__ == '__main__':
+    # the log file path is the only required argument
+    if len(sys.argv) >= 2:
+        analyze_log(sys.argv[1])
+    else:
+        print("Usage: python3 specimin_exception_rank.py <path_to_log_file.log>")
+        sys.exit(1)
diff --git a/ashe_scripts/specimin_statistics.py b/ashe_scripts/specimin_statistics.py
@@ -0,0 +1,137 @@
+"""
+Script for analyzing log files generated by ASHE in dryrun mode.
+https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints
+
+Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
+Date: April 13, 2024
+
+Description:
+This script reads a log file and computes attempted, successful, and failed Specimin minimization
+and compilation statistics. These statistics come from running the Ashe.RepositoryAutomationEngine
+in dryrun mode.
+
+Output:
+Summary written to a txt file in the same directory as the provided log file.
+
+Usage:
+python3 specimin_statistics.py <path_to_log_file.log>
+"""
+
+import sys
+import os
+import re
+
+
+def analyze_log(file_path: str):
+    """
+    Compute Specimin statistics per repository from a log file and write a summary.
+
+    A summary is written for a repository when its "Completed processing
+    repository at:" line is seen, when the next "Processing repository at:" line
+    starts before that, or when the log ends while a repository is still open.
+
+    Args:
+        file_path: path to the log file; the summary is written to
+            specimin_statistics.txt in the same directory
+    """
+    directory: str = os.path.dirname(file_path)
+    output_file_path: str = os.path.join(directory, 'specimin_statistics.txt')
+
+    # read the log before creating the output file, so a missing or unreadable
+    # log does not leave an empty summary file behind
+    with open(file_path, 'r') as file:
+        lines: list[str] = file.readlines()
+
+    repo_stats: dict[str, int] = {
+        'minimization_attempts': 0,
+        'successful_minimization': 0,
+        'failed_minimization': 0,
+        'compilation_attempts': 0,
+        'successful_compilation': 0,
+        'failed_compilation': 0,
+        'full_success': 0
+    }
+    repo_path: str = ""
+    branch_name: str = ""
+
+    with open(output_file_path, 'w') as output_file:
+        for line in lines:
+            line: str = line.strip()
+
+            # get the repository path and branch name from the log line
+            if "Processing repository at:" in line:
+                # the previous repository never logged its completion:
+                # write what was collected for it before starting the new one
+                if repo_path:
+                    __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
+                    repo_stats = repo_stats.fromkeys(repo_stats, 0)
+
+                repo_path, branch_name = __extract_repo_and_branch(line)
+
+            __update_stats(line, repo_stats)
+
+            if "Completed processing repository at:" in line:
+                __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
+                repo_stats = repo_stats.fromkeys(repo_stats, 0)  # reset statistics for new repo
+                # this repository is reported; forget it so the next "Processing
+                # repository at:" line does not write a second, all-zero summary
+                repo_path, branch_name = "", ""
+
+        # the log ended while a repository was still being processed
+        if repo_path:
+            __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
+    print("Write successful")
+
+
+def __update_stats(line, repo_stats):
+    """
+    Increment the counters in repo_stats whose marker text occurs in the log line.
+
+    Args:
+        line: a single log line
+        repo_stats: dictionary of counters, updated in place
+    """
+    markers = (
+        ("Minimizing source file...", ('minimization_attempts',)),
+        ("BUILD SUCCESSFUL", ('successful_minimization',)),
+        ("BUILD FAILED", ('failed_minimization',)),
+        ("Compiling Java files", ('compilation_attempts',)),
+        # a successful compilation also counts as a full success
+        ("Minimized files compiled successfully.", ('successful_compilation', 'full_success')),
+        ("Minimized files failed to compile.", ('failed_compilation',)),
+    )
+    for marker, counters in markers:
+        if marker in line:
+            for counter in counters:
+                repo_stats[counter] += 1
+
+
+def __print_and_write_stats(stats, repo_path, branch_name, output_file):
+    """
+    Write the statistics summary of one repository to the output file.
+
+    Args:
+        stats: dictionary with the counters collected for the repository
+        repo_path: repository path shown in the summary
+        branch_name: branch name shown in the summary
+        output_file: open, writable text file the summary is written to
+    """
+    def percent_of(part_key, whole_key):
+        # a zero total yields 0 instead of dividing by zero
+        whole = stats[whole_key]
+        return (stats[part_key] / whole * 100) if whole else 0
+
+    successful_min_percent = percent_of('successful_minimization', 'minimization_attempts')
+    failed_min_percent = percent_of('failed_minimization', 'minimization_attempts')
+    successful_comp_percent = percent_of('successful_compilation', 'compilation_attempts')
+    failed_comp_percent = percent_of('failed_compilation', 'compilation_attempts')
+    full_success_percent = percent_of('full_success', 'minimization_attempts')
+
+    output_file.write(f"""
+Running Specimin on repository: {repo_path} for branch: {branch_name}
+Attempted minimization - {stats['minimization_attempts']}:
+Successfully minimized {stats['successful_minimization']} ({successful_min_percent:.2f}%) target methods.
+Failed to minimize {stats['failed_minimization']} ({failed_min_percent:.2f}%) target methods.
+
+Attempted compilation - {stats['compilation_attempts']}:
+Successful: {stats['successful_compilation']} ({successful_comp_percent:.2f}%)
+Failed: {stats['failed_compilation']} ({failed_comp_percent:.2f}%)
+
+Fully successful from minimization to compilation: {stats['full_success']} ({full_success_percent:.2f}%)
+
+""")
+
+
+def __extract_repo_and_branch(log_line: str):
+    """
+    Pull the repository path and the branch name out of a log line.
+
+    Parameters:
+    - log_line (str): A log line of the form
+      "Processing repository at: <path> for branch: <branch>".
+
+    Returns:
+    - tuple: (repository path, branch name), or ("", "") when the line does not
+      have that form.
+    """
+    found = re.search(r"Processing repository at: (.+?) for branch: (.+)", log_line)
+    if not found:
+        return "", ""
+
+    repository, branch = (part.strip() for part in found.groups())
+    return repository, branch
+
+
+if __name__ == '__main__':
+    # the log file path is the only required argument
+    if len(sys.argv) >= 2:
+        analyze_log(sys.argv[1])
+    else:
+        print("Usage: python3 specimin_statistics.py <path_to_log_file.log>")
+        sys.exit(1)
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,3 +16,7 @@ checker-framework* @@
     jdk*
     .vscode/
+    .idea
+    *config.properties