diff --git a/.github/workflows/check_compilable_percentage.yml b/.github/workflows/check_compilable_percentage.yml new file mode 100644 index 00000000..35d2e7eb --- /dev/null +++ b/.github/workflows/check_compilable_percentage.yml @@ -0,0 +1,132 @@ +name: check_compilable_percentage_CI + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + specimin-evaluation: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 + + - name: Set up environment + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - uses: actions/setup-java@v2 + with: + java-version: '21' + distribution: 'adopt' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + sudo apt-get update + sudo apt-get install -y jq curl bc + + - name: Display CSV File Contents + run: | + CSV_PATH="/home/runner/work/specimin/specimin/CI_repository_list.csv" + if [ -f "$CSV_PATH" ]; then + cat "$CSV_PATH" + else + echo "File $CSV_PATH does not exist" + exit 1 + fi + + - name: Download git-clone-related and dependencies + run: | + curl -L -o git-clone-related https://raw.githubusercontent.com/plume-lib/git-scripts/main/git-clone-related + curl -L -o git-find-fork https://raw.githubusercontent.com/plume-lib/git-scripts/main/git-find-fork + curl -L -o git-find-branch https://raw.githubusercontent.com/plume-lib/git-scripts/main/git-find-branch + chmod +x git-clone-related git-find-fork git-find-branch + + - name: Clone ASHE Project using git-clone-related + run: | + ./git-clone-related njit-jerse ASHE_Automated-Software-Hardening-for-Entrypoints ASHE + + - name: Check and Rename Properties File + run: | + set -ex + CONFIG_PATH="ASHE/src/main/resources/config.properties" + EXAMPLE_PATH="ASHE/src/main/resources/example.properties" + + if [ -f "$CONFIG_PATH" ]; then + echo "config.properties already exists" + elif [ -f "$EXAMPLE_PATH" ]; then + 
echo "example.properties found, renaming to config.properties" + mv "$EXAMPLE_PATH" "$CONFIG_PATH" + if [ -f "$CONFIG_PATH" ]; then + echo "config.properties created successfully" + else + echo "Failed to create config.properties" + exit 1 + fi + else + echo "Neither config.properties nor example.properties found" + exit 1 + fi + + chmod +w "$CONFIG_PATH" + ls -l "$CONFIG_PATH" + + - name: Update ASHE Config File to update SPECIMIN path + run: | + set -ex + sed -i 's|^specimin.tool.path=.*|specimin.tool.path='$(pwd)'|' ASHE/src/main/resources/config.properties + + - name: Make all scripts under ashe_scripts executable + run: | + set -ex + chmod +x ashe_scripts/*.py + + - name: Run the script + run: | + set -ex + python3 ashe_scripts/run_ashe_for_stats.py \ + $(pwd)/ASHE \ + $(pwd)/CI_repository_list.csv \ + $(pwd)/ASHE/CI_REPO_CLONE_SPACE \ + $(pwd)/ASHE/src/main/resources/config.properties + + - name: Parse accuracy percentage + id: parse_accuracy_percentage + run: | + current_accuracy=$(grep 'Fully successful from minimization to compilation' "$(pwd)/ASHE/logs/specimin_statistics.txt" | awk '{print $NF}' | tr -d '()%') + echo "Current accuracy: $current_accuracy" + echo "::set-output name=current_accuracy::$current_accuracy" + + - name: Read latest run percentage from file + id: read_latest_run_percentage + run: | + if [ -f "$(pwd)/CI_Latest_run_percentage.txt" ]; then + latest_run_accuracy=$(cat "$(pwd)/CI_Latest_run_percentage.txt" | tr -d '()%') + echo "Latest run accuracy: $latest_run_accuracy" + echo "::set-output name=latest_run_accuracy::$latest_run_accuracy" + else + echo "File CI_Latest_run_percentage.txt does not exist" + exit 1 + fi + + - name: Validate accuracy + id: validate_accuracy + run: | + current_accuracy="${{ steps.parse_accuracy_percentage.outputs.current_accuracy }}" + latest_run_accuracy="${{ steps.read_latest_run_percentage.outputs.latest_run_accuracy }}" + + if [ "$current_accuracy" != "$latest_run_accuracy" ]; then + echo "Current 
accuracy ($current_accuracy) does not match latest run accuracy ($latest_run_accuracy)." + exit 1 + else + echo "Accuracy validation passed." + fi diff --git a/CI_Latest_run_percentage.txt b/CI_Latest_run_percentage.txt new file mode 100644 index 00000000..7601f88f --- /dev/null +++ b/CI_Latest_run_percentage.txt @@ -0,0 +1 @@ +88.61 diff --git a/CI_repository_list.csv b/CI_repository_list.csv new file mode 100644 index 00000000..ae974739 --- /dev/null +++ b/CI_repository_list.csv @@ -0,0 +1,2 @@ +Repository,Branch +https://github.com/NiharikaJamble/plume-util,master diff --git a/ashe_scripts/README.ME b/ashe_scripts/README.ME new file mode 100644 index 00000000..d614f590 --- /dev/null +++ b/ashe_scripts/README.ME @@ -0,0 +1,10 @@ +# Specimin Statistics and Exception Ranking + +### specimin_statistics.py +The script to parse the ASHE log files and generate statistical data from Specimin's minimization process. + +### specimin_exception_rank.py +The script to parse the ASHE log files and generate a ranking of the exceptions that occurred during the minimization process. + +### run_ashe_for_stats.py +The script that clones ASHE, builds and runs it, and then runs the specimin_statistics.py and specimin_exception_rank.py scripts. diff --git a/ashe_scripts/run_ashe_for_stats.py b/ashe_scripts/run_ashe_for_stats.py new file mode 100644 index 00000000..967cac51 --- /dev/null +++ b/ashe_scripts/run_ashe_for_stats.py @@ -0,0 +1,124 @@ +""" +Script to run Ashe.RepositoryAutomationEngine and Specimin scripts to analyze the log file generated by ASHE in dryrun mode. 
"""
Script to run Ashe.RepositoryAutomationEngine and Specimin scripts to analyze the log file generated by ASHE in dryrun mode.
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints

Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
Date: April 13, 2024

Usage:
python3 run_ashe_for_stats.py <ashe_path> <csv_path> <clone_path> <props_file_path>
"""
import subprocess
import sys
import threading
import datetime
import time
import os
import shlex


def run(ashe_path: str, csv_path: str, clone_path: str, props_file_path: str):
    """
    Run ASHE and Specimin scripts to analyze the log file.
    Args:
        ashe_path: absolute path to clone the ASHE repository
        csv_path: absolute path to the CSV file containing the repositories ASHE will iterate over
        clone_path: absolute path to clone the repositories in the CSV file ASHE will iterate over
        props_file_path: absolute path to the config.properties file handed to ASHE
    """

    ashe_url: str = "https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints"
    # clone or update repository
    __git_clone_or_update(ashe_url, ashe_path)

    # Daemon thread: it only reports elapsed time and must never keep the
    # interpreter alive once the main work is finished.
    start_time: datetime.datetime = datetime.datetime.now()
    status_thread: threading.Thread = threading.Thread(
        target=__print_ashe_runtime, args=(start_time,), daemon=True)
    status_thread.start()

    # The script locations do not depend on the ASHE run, so resolve them once.
    stats_script, rank_script = __locate_analysis_scripts()

    __build_and_run_ashe(csv_path, clone_path, props_file_path, working_dir=ashe_path)

    # run Specimin scripts
    log_path: str = os.path.join(ashe_path, "logs", "app.log")
    print("Running statistics script...")
    # Paths are quoted because the command is interpreted by a shell.
    __run_command(f"python3 {shlex.quote(stats_script)} {shlex.quote(log_path)}")

    print("Running exception rank script...")
    __run_command(f"python3 {shlex.quote(rank_script)} {shlex.quote(log_path)}")


def __locate_analysis_scripts():
    """
    Resolve the paths of the two Specimin analysis scripts that live next to this file.

    Returns: A tuple (stats_script, rank_script) of paths
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    print(f"Current directory path: {current_dir}")
    # If this file is found under ASHE/ashe_scripts, point at the sibling
    # ashe_scripts directory instead.
    current_dir = current_dir.replace('ASHE/ashe_scripts', 'ashe_scripts')
    main_project_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
    stats_script = os.path.join(current_dir, 'specimin_statistics.py')
    rank_script = os.path.join(current_dir, 'specimin_exception_rank.py')
    print(f"Current directory path after normalising: {current_dir}")
    print(f"main project path: {main_project_dir}")
    print(f"Statistics script path: {stats_script}")
    print(f"Exception rank script path: {rank_script}")
    return stats_script, rank_script


def __run_command(command, working_dir=None):
    """
    Run a shell command and print its captured output.

    A non-zero exit status is reported but deliberately not raised, so the
    remaining steps of the pipeline still run (best effort).
    Args:
        command: the command line, interpreted by the shell
        working_dir: directory to run the command in; None keeps the current one
    """
    try:
        result = subprocess.run(command, cwd=working_dir, shell=True, check=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print("Error executing command:", e.stderr.decode(errors="replace"))
        # Also show stdout so diagnostics written there are not lost.
        if e.stdout:
            print(e.stdout.decode(errors="replace"))
        return
    print(result.stdout.decode(errors="replace"))


def __git_clone_or_update(repo_url, ashe_path):
    """Clone or update the git repository."""
    quoted_path = shlex.quote(ashe_path)
    if not os.path.exists(ashe_path):
        print("Cloning the repository...")
        __run_command(f"git clone {repo_url} {quoted_path}")
    else:
        print("Repository exists. Checking if it's a Git repository...")
        if not os.path.exists(os.path.join(ashe_path, '.git')):
            print(f"The directory {ashe_path} is not a Git repository.")
            __run_command(f"git clone {repo_url} {quoted_path}")
        else:
            print("Updating the repository...")
            # Run the pull inside the repository without os.chdir(): changing
            # the process-wide working directory would break any relative path
            # used afterwards.
            __run_command("git pull", working_dir=ashe_path)


def __build_and_run_ashe(csv_path: str, clone_path: str, props_file_path: str, working_dir: str):
    """Build and run the ASHE project using gradle."""
    # build ASHE
    build_command: str = './gradlew build'
    model_type: str = "dryrun"
    run_automation_command: str = f"./gradlew runRepositoryAutomation -PrepositoriesCsvPath=\"{csv_path}\" -PcloneDirectory=\"{clone_path}\" -Pllm=\"{model_type}\" -PpropsFilePath=\"{props_file_path}\""

    print("Building ASHE...")
    __run_command(build_command, working_dir=working_dir)

    print("Running ASHE...")
    __run_command(run_automation_command, working_dir=working_dir)


def __print_ashe_runtime(start_time):
    """Function to print the elapsed time since ASHE started."""
    print("ASHE started.")
    print("ASHE runtime: 00:00:00")
    while True:
        time.sleep(300)  # sleep for 5 minutes
        elapsed_time = datetime.datetime.now() - start_time
        # format elapsed time into H:M:S
        formatted_time = str(elapsed_time).split('.')[0]  # remove microseconds
        print(f"ASHE runtime: {formatted_time}")


def main(argv):
    """
    Command-line entry point.
    Args:
        argv: the full argument vector, program name first

    Exits with status 1 and a usage message unless all four arguments are present.
    """
    # Four positional arguments plus the program name are required.
    if len(argv) < 5:
        print("Usage: python3 run_ashe_for_stats.py <ashe_path> <csv_path> <clone_path> <props_file_path>")
        sys.exit(1)
    run(argv[1], argv[2], argv[3], argv[4])


if __name__ == "__main__":
    main(sys.argv)
"""
Script for analyzing log files generated by ASHE in dryrun mode.
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints

Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
Date: April 13, 2024

Description:
This script reads a log file and ranks the exceptions by how frequently they occur. If the exceptions
occur more often, they are ranked higher. These exception rankings come from running the
Ashe.RepositoryAutomationEngine in dryrun mode.

Output:
Rankings written to a txt file in the same directory as the provided log file.

Usage:
python3 specimin_exception_rank.py <path_to_log_file>
"""

import sys
import os
import re
from collections import defaultdict

# 'Exception in thread "<thread>" <name>[: <message>]'. The message part is
# optional so that an exception reported without a detail message is counted too.
_EXCEPTION_PATTERN = re.compile(r'^Exception in thread ".*?" (\w+[^:\n]*)(?::(.*))?', re.DOTALL)
# A stack-trace frame: leading whitespace, "at", then the code location.
_CONTEXT_PATTERN = re.compile(r'^\s+at (.+)$', re.MULTILINE)


def analyze_log(file_path: str):
    """
    Rank the exceptions found in a log file and write the ranking next to it.
    Args:
        file_path: path to the log file; the result is written to
            specimin_exception_rank.txt in the same directory
    """
    directory = os.path.dirname(file_path)
    output_file_path = os.path.join(directory, 'specimin_exception_rank.txt')

    # Undecodable bytes are replaced rather than aborting the whole analysis.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.readlines()

    exceptions = __extract_exceptions(content)
    ranked_exceptions = __rank_exceptions(exceptions)

    __write_ranked_exceptions(ranked_exceptions, output_file_path)
    print("Write successful")


def __extract_exceptions(log_lines):
    """
    Extract exceptions from the log lines. An exception is defined as a line that starts with "Exception in thread"
    Args:
        log_lines: A list of log lines

    Returns: A list of tuples (name, message, example_line)
    """
    exceptions = []
    for i, line in enumerate(log_lines):
        match = _EXCEPTION_PATTERN.search(line)
        if not match:
            continue
        exception_name, message = match.groups()
        # only the line directly after the exception is inspected for a stack frame
        next_line = log_lines[i + 1] if i + 1 < len(log_lines) else ""
        context_match = _CONTEXT_PATTERN.search(next_line)
        example_line = context_match.group(1).strip() if context_match else "No code context available"
        # message is None when the exception line carries no ": <message>" part
        exceptions.append((exception_name.strip(), (message or "").strip(), example_line))
    return exceptions


def __rank_exceptions(exceptions):
    """
    Rank the exceptions by how frequently they occur. If the exceptions occur more often, they are ranked higher.
    Args:
        exceptions: A list of tuples (name, message, example_line)

    Returns: A sorted list of tuples (count, examples, name, message)
    """
    grouped_exceptions = defaultdict(list)
    for name, message, example in exceptions:
        grouped_exceptions[(name, simplify_message(message))].append(example)

    # convert grouped data into a list of tuples (count, examples, name, message), most frequent first;
    # the sort is stable, so groups with equal counts keep first-seen order
    ranked = [(len(examples), examples, name, message)
              for (name, message), examples in grouped_exceptions.items()]
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    return ranked


def simplify_message(message):
    """
    Simplify the exception message by removing certain patterns that are not helpful for distinguishing exceptions.
    Args:
        message: The exception message for Specimin developers to analyze

    Returns: A simplified version of the message
    """
    message = re.sub(r'\bat [\w\.$<>]+\(.*?\)', '', message)  # stack-frame locations
    message = re.sub(r'\bLine \d+\b', '', message)  # line numbers
    message = re.sub(r'\bmemory address 0x[\da-f]+\b', '', message, flags=re.I)  # memory addresses
    return message.strip()


def __write_ranked_exceptions(ranked_exceptions, output_file_path):
    """
    Write the ranked exceptions to the output file.

    Groups with the same count share a rank, and the rank after a tie skips
    ahead by the size of the tie (1, 1, 3, ...).
    Args:
        ranked_exceptions: A sorted list of tuples (count, examples, name, message)
        output_file_path: path of the text file to (over)write
    """
    current_rank = 1
    last_count = None
    rank_increment = 0  # keeps track of how many ranks we should jump after ties

    with open(output_file_path, 'w') as output_file:
        for count, examples, name, message in ranked_exceptions:
            if last_count != count:
                current_rank += rank_increment
                rank_increment = 1  # reset for next potential tie group
            else:
                rank_increment += 1  # increment to account for a tie when next different count comes

            last_count = count
            output_line = f"""
Rank: {current_rank},
Count: {count},
Exception: {name},
Message: {message},
Example: {examples[0]}

"""
            output_file.write(output_line)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python3 specimin_exception_rank.py <path_to_log_file>")
        sys.exit(1)
    analyze_log(sys.argv[1])
"""
Script for analyzing log files generated by ASHE in dryrun mode.
https://github.com/jonathan-m-phillips/ASHE_Automated-Software-Hardening-for-Entrypoints

Created by: Jonathan Phillips, https://github.com/jonathan-m-phillips
Date: April 13, 2024

Description:
This script reads a log file and computes attempted, successful, and failed Specimin minimization
and compilation statistics. These statistics come from running the Ashe.RepositoryAutomationEngine
in dryrun mode.

Output:
Summary written to a txt file in the same directory as the provided log file.

Usage:
python3 specimin_statistics.py <path_to_log_file>
"""

import sys
import os
import re


def analyze_log(file_path: str):
    """
    Compute per-repository Specimin statistics from a log file.

    Exactly one summary section is written for each repository seen in the log.
    Args:
        file_path: path to the log file; the summary is written to
            specimin_statistics.txt in the same directory
    """
    directory: str = os.path.dirname(file_path)
    output_file_path: str = os.path.join(directory, 'specimin_statistics.txt')

    # Undecodable bytes are replaced rather than aborting the whole analysis.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        lines = file.readlines()

    with open(output_file_path, 'w') as output_file:
        repo_stats = {
            'minimization_attempts': 0,
            'successful_minimization': 0,
            'failed_minimization': 0,
            'compilation_attempts': 0,
            'successful_compilation': 0,
            'failed_compilation': 0,
            'full_success': 0
        }
        repo_path: str = ""
        branch_name: str = ""
        # True while a repository has been started but its statistics have not
        # been written yet; prevents a second, all-zero section for a
        # repository that was already reported on its "Completed" line.
        pending: bool = False

        for raw_line in lines:
            line: str = raw_line.strip()

            # get the repository path and branch name from the log line
            if "Processing repository at:" in line:
                # the previous repository never logged its completion:
                # report what was collected before moving on
                if pending:
                    __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
                    repo_stats = dict.fromkeys(repo_stats, 0)

                repo_path, branch_name = __extract_repo_and_branch(line)
                pending = True

            __update_stats(line, repo_stats)

            if "Completed processing repository at:" in line:
                __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
                repo_stats = dict.fromkeys(repo_stats, 0)  # reset statistics for new repo
                pending = False

        # the log ended before the last repository logged its completion
        if pending:
            __print_and_write_stats(repo_stats, repo_path, branch_name, output_file)
    print("Write successful")


def __update_stats(line, repo_stats):
    """
    Increment, in place, every counter whose marker text occurs in the log line.
    Args:
        line: a single log line
        repo_stats: the counters of the repository currently being processed
    """
    if "Minimizing source file..." in line:
        repo_stats['minimization_attempts'] += 1
    if "BUILD SUCCESSFUL" in line:
        repo_stats['successful_minimization'] += 1
    if "BUILD FAILED" in line:
        repo_stats['failed_minimization'] += 1
    if "Compiling Java files" in line:
        repo_stats['compilation_attempts'] += 1
    if "Minimized files compiled successfully." in line:
        # a successful compilation also counts as a full success
        repo_stats['successful_compilation'] += 1
        repo_stats['full_success'] += 1
    if "Minimized files failed to compile." in line:
        repo_stats['failed_compilation'] += 1


def __percent(part, whole):
    """Return part as a percentage of whole, or 0 when whole is 0."""
    return (part / whole * 100) if whole else 0


def __print_and_write_stats(stats, repo_path, branch_name, output_file):
    """
    Write the summary section of one repository to the output file.
    Args:
        stats: the counters collected for the repository
        repo_path: the repository path as it appears in the log
        branch_name: the branch name as it appears in the log
        output_file: an open, writable text file
    """
    min_attempts = stats['minimization_attempts']
    comp_attempts = stats['compilation_attempts']

    successful_min_percent = __percent(stats['successful_minimization'], min_attempts)
    failed_min_percent = __percent(stats['failed_minimization'], min_attempts)
    successful_comp_percent = __percent(stats['successful_compilation'], comp_attempts)
    failed_comp_percent = __percent(stats['failed_compilation'], comp_attempts)
    # full success is measured against minimization attempts, not compilation attempts
    full_success_percent = __percent(stats['full_success'], min_attempts)

    output_content = f"""
Running Specimin on repository: {repo_path} for branch: {branch_name}
Attempted minimization - {stats['minimization_attempts']}:
Successfully minimized {stats['successful_minimization']} ({successful_min_percent:.2f}%) target methods.
Failed to minimize {stats['failed_minimization']} ({failed_min_percent:.2f}%) target methods.

Attempted compilation - {stats['compilation_attempts']}:
Successful: {stats['successful_compilation']} ({successful_comp_percent:.2f}%)
Failed: {stats['failed_compilation']} ({failed_comp_percent:.2f}%)

Fully successful from minimization to compilation: {stats['full_success']} ({full_success_percent:.2f}%)

"""
    output_file.write(output_content)


def __extract_repo_and_branch(log_line: str):
    """
    Extracts the repository path and branch name from a log line.

    Parameters:
    - log_line (str): A string from the log file containing repository and branch information.

    Returns:
    - tuple: A tuple containing the repository path and the branch name,
      or two empty strings when the line does not have the expected shape.
    """
    # regex pattern to find the repository path and branch name
    pattern = r"Processing repository at: (.+?) for branch: (.+)"
    match = re.search(pattern, log_line)

    if not match:
        return "", ""
    return match.group(1).strip(), match.group(2).strip()


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python3 specimin_statistics.py <path_to_log_file>")
        sys.exit(1)
    log_file_path = sys.argv[1]
    analyze_log(log_file_path)