Skip to content

Commit

Permalink
Create pie_most_buggy_componenet.py
Browse files Browse the repository at this point in the history
  • Loading branch information
yunwei37 authored Sep 22, 2024
1 parent 84ecfb0 commit ca3019b
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions analysis/bpf/pie_most_buggy_componenet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import re

import pandas as pd

# Load the commit survey CSV for analysis.
# NOTE(review): hard-coded absolute path; point it at wherever the survey export lives.
file_path = '/mnt/data/commit_survey (19).csv'
data = pd.read_csv(file_path)

# Keep the first few rows so the structure of the data can be inspected later
data_head = data.head()

# Keep only commits whose classification mentions "bug" or "fix"
# (case-insensitive; rows with a missing classification are excluded).
buggy_commits = data[data['commit_classification'].str.contains("bug|fix", case=False, na=False)]

# Components that are out of scope for this analysis (exact label matches)
excluded_components = [
    "The libbpf library. It affects the library that simplifies interaction with eBPF from user-space applications.",
    "The test cases and makefiles. It adds or modifies test cases or makefile scripts used for testing or building eBPF programs.",
    "It's not related to any above. It affects an implementation component not listed but does related to the BPF subsystem.",
    "It's not related to any above. It affects an implementation component is totally unrelated to the BPF subsystem.",
    "The bpftool utility. It modifies the bpftool utility used for introspecting and interacting with eBPF programs and maps.",
]

# Drop the buggy commits whose component is one of the excluded labels
filtered_buggy_components = buggy_commits[~buggy_commits['major_related_implementation_component'].isin(excluded_components)]

# Additionally drop any label that contains "not related" (catches variants of
# the two "not related" labels above that are not exact matches).
# .copy() makes this an independent frame, so the column assignments below
# write to it directly instead of triggering SettingWithCopyWarning on a slice.
filtered_buggy_components_cleaned = filtered_buggy_components[
    ~filtered_buggy_components['major_related_implementation_component'].str.contains("not related", case=False, na=False)
].copy()

# Mapping from the long survey labels to short summaries
label_replacements = {
    "The implementation happens in other subsystem and is related to eBPF events. e.g. probes perf events tracepoints network scheduler HID LSM etc. Note it's still related to how eBPF programs interact with these events.": "eBPF events (tracepoints, perf, etc.)",
    "The eBPF verifier. This component ensures that eBPF programs are safe to run within the kernel.": "eBPF verifier",
    "The eBPF maps. It changes how data structures shared between user-space and kernel-space (maps) are created or managed.": "eBPF maps",
    "The eBPF JIT compiler for different architectures. It changes how eBPF bytecode is translated into machine code for different hardware architectures.": "eBPF JIT compiler",
    "The helper and kfuncs. It modifies or adds helpers and kernel functions that eBPF programs can call.": "eBPF helpers and kfuncs",
    "The syscall interface. It modifies or adds code related to system calls between user-space and kernel-space.": "Syscall interface",
}

# The main components are exactly the short labels defined above; deriving the
# list from the mapping keeps the two from drifting apart.
main_components = list(label_replacements.values())

# Replace long labels with their short form, then group every remaining label
# (including missing values) under 'Others'.
short_labels = filtered_buggy_components_cleaned['major_related_implementation_component'].replace(label_replacements)
filtered_buggy_components_cleaned['short_component'] = short_labels.where(short_labels.isin(main_components), "Others")

# Extract file paths and count the occurrences
# Path-like token ending in ".c" or ".h". The negative lookahead rejects longer
# extensions, so "foo.cpp" or "Makefile.config" no longer yield "foo.c" /
# "Makefile.c" as false positives.
_C_FILE_PATH_RE = re.compile(r'[a-zA-Z0-9_/.-]+\.[ch](?![A-Za-z0-9_])')


def extract_valid_file_paths(changed_files_entry):
    """Return all C source/header file paths found in a changed-files entry.

    Args:
        changed_files_entry: Text of one ``changed_files`` cell. Any
            non-string value (e.g. NaN for a missing cell) is treated as
            containing no paths.

    Returns:
        A list of the ``.c`` / ``.h`` paths in order of appearance
        (duplicates are kept); an empty list when there are none.
    """
    if not isinstance(changed_files_entry, str):
        return []
    return _C_FILE_PATH_RE.findall(changed_files_entry)

# Extract the C source/header paths of every commit once and reuse the result
# below; rows without a changed_files value are skipped.
changed_files_per_commit = filtered_buggy_components_cleaned['changed_files'].dropna().apply(extract_valid_file_paths)

# One row per (commit, file); commits with no matching file contribute a NaN,
# which value_counts() ignores.
all_valid_changed_files = changed_files_per_commit.explode()

# Top 10 files with the most bug fixes
top_valid_buggy_files = all_valid_changed_files.value_counts().head(10)

# Count number of files changed in each commit. Rows with a missing
# changed_files value stay NaN (index alignment). assign() returns a new frame,
# so nothing is written into a potential slice of the original data.
filtered_buggy_components_cleaned = filtered_buggy_components_cleaned.assign(
    num_files_changed=changed_files_per_commit.apply(len)
)

# Distribution of commits by number of files changed
multiple_files_changed = filtered_buggy_components_cleaned['num_files_changed'].value_counts()

# Display the first few rows, top 10 files, and distribution of file changes.
# A bare expression is only echoed in a notebook/REPL; print so the results
# are also shown when this file is run as a script.
print(data_head)
print(top_valid_buggy_files)
print(multiple_files_changed)

0 comments on commit ca3019b

Please sign in to comment.