Skip to content

Commit

Permalink
update imgs
Browse files Browse the repository at this point in the history
  • Loading branch information
yunwei37 committed Sep 23, 2024
1 parent 3a0a65e commit 8eb91a3
Show file tree
Hide file tree
Showing 26 changed files with 95 additions and 57 deletions.
18 changes: 9 additions & 9 deletions analysis/bpf/distribution_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,15 @@ def has_event_related_usecase(usecase_list):
not_related_implementation_percentage = (not_related_implementation_count / total_commits) * 100
not_related_logic_percentage = (not_related_logic_count / total_commits) * 100

# Output the percentage of commits marked as "Not related to BPF subsystem"
print(f"Number of commits marked as 'Not related to BPF subsystem' in Implementation Component: {not_related_implementation_count} ({not_related_implementation_percentage:.2f}% of total commits)")
print(f"Number of commits marked as 'Not related to BPF subsystem' in Logic Component: {not_related_logic_count} ({not_related_logic_percentage:.2f}% of total commits)")

# Check if there's any significant mismatch in these percentages
if not_related_implementation_percentage > 5 or not_related_logic_percentage > 5:
print("\nThere is a significant number of commits marked as 'Not related to BPF subsystem'. This indicates potential data quality issues and suggests that data cleaning may be necessary to improve analysis correctness.")
else:
print("\nThe number of commits marked as 'Not related to BPF subsystem' is relatively low and does not significantly affect overall analysis correctness.")
# # Output the percentage of commits marked as "Not related to BPF subsystem"
# print(f"Number of commits marked as 'Not related to BPF subsystem' in Implementation Component: {not_related_implementation_count} ({not_related_implementation_percentage:.2f}% of total commits)")
# print(f"Number of commits marked as 'Not related to BPF subsystem' in Logic Component: {not_related_logic_count} ({not_related_logic_percentage:.2f}% of total commits)")

# # Check if there's any significant mismatch in these percentages
# if not_related_implementation_percentage > 5 or not_related_logic_percentage > 5:
# print("\nThere is a significant number of commits marked as 'Not related to BPF subsystem'. This indicates potential data quality issues and suggests that data cleaning may be necessary to improve analysis correctness.")
# else:
# print("\nThe number of commits marked as 'Not related to BPF subsystem' is relatively low and does not significantly affect overall analysis correctness.")

print("\nSample Commit Messages of 'Not related to BPF subsystem' in Implementation Component:")
print(not_related_implementation_commit_messages)
Expand Down
24 changes: 18 additions & 6 deletions analysis/bpf/pie_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def plot_commit_classification_pie():
# Plot the pie chart
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Commit Classification', fontsize=12)
# ax.set_title('Commit Classification', fontsize=12)

# Save the figure
plt.savefig('imgs/commit_pie_chart_commit_classification.png')
Expand Down Expand Up @@ -51,8 +51,21 @@ def plot_commit_complexity_pie():
plt.savefig('imgs/commit_pie_chart_commit_complexity.png')
plt.close()

# Mapping long labels to short summaries
label_replacements = {
"The implementation happens in other subsystem and is related to eBPF events. e.g. probes perf events tracepoints network scheduler HID LSM etc. Note it's still related to how eBPF programs interact with these events.": "eBPF events (tracepoints, perf, etc.)",
"The eBPF verifier. This component ensures that eBPF programs are safe to run within the kernel.": "eBPF verifier",
"The eBPF maps. It changes how data structures shared between user-space and kernel-space (maps) are created or managed.": "eBPF maps",
"The eBPF JIT compiler for different architectures. It changes how eBPF bytecode is translated into machine code for different hardware architectures.": "eBPF JIT compiler",
"The helper and kfuncs. It modifies or adds helpers and kernel functions that eBPF programs can call.": "eBPF helpers and kfuncs",
"The syscall interface. It changes the system calls through which user-space programs interact with eBPF.": "Syscall interface"
}

# Function to plot pie chart for major implementation component
def plot_implementation_component_pie():
# Apply the label replacements
survey_data['major_related_implementation_component'] = survey_data['major_related_implementation_component'].replace(label_replacements)

# Get the value counts for major implementation component
value_counts = survey_data['major_related_implementation_component'].value_counts()

Expand All @@ -62,12 +75,11 @@ def plot_implementation_component_pie():
value_counts = value_counts[:max_labels]._append(pd.Series([value_counts[max_labels:].sum()], index=['Other']))

# Truncate labels for better readability
truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]
truncated_labels = [label[:30] + '...' if len(label) > 10 else label for label in value_counts.index]

# Plot the pie chart
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Major Implementation Component', fontsize=12)

# Save the figure
plt.savefig('imgs/commit_pie_chart_major_implementation_component.png')
Expand All @@ -89,7 +101,7 @@ def plot_logic_component_pie():
# Plot the pie chart
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Major Logic Component', fontsize=12)
# ax.set_title('Major Logic Component', fontsize=12)

# Save the figure
plt.savefig('imgs/commit_pie_chart_major_logic_component.png')
Expand Down Expand Up @@ -127,7 +139,7 @@ def plot_usecases_or_submodule_pie():
value_counts = flattened_usecases.value_counts()

# Aggregate smaller labels into "Other" if needed
max_labels = 8
max_labels = 10
if len(value_counts) > max_labels:
value_counts = value_counts[:max_labels]._append(pd.Series([value_counts[max_labels:].sum()], index=['Other']))

Expand All @@ -137,7 +149,7 @@ def plot_usecases_or_submodule_pie():
# Plot the pie chart
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Use Cases or Submodule Events', fontsize=12)
# ax.set_title('Use Cases or Submodule Events', fontsize=12)

# Save the figure
plt.savefig('imgs/commit_pie_chart_usecases_or_submodule_events.png')
Expand Down
31 changes: 20 additions & 11 deletions analysis/bpf/pie_most_buggy_componenet.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import pandas as pd
import re
import matplotlib.pyplot as plt

# Load the CSV file for analysis
file_path = '/mnt/data/commit_survey (19).csv'
file_path = 'data/commit_survey.csv' # Replace with your file path
data = pd.read_csv(file_path)

# Display the first few rows to understand its structure
data_head = data.head()

# Filter out commits related to "bug" or "fix" in the commit classification
buggy_commits = data[data['commit_classification'].str.contains("bug|fix", case=False, na=False)]

Expand All @@ -32,7 +31,7 @@
"The eBPF maps. It changes how data structures shared between user-space and kernel-space (maps) are created or managed.": "eBPF maps",
"The eBPF JIT compiler for different architectures. It changes how eBPF bytecode is translated into machine code for different hardware architectures.": "eBPF JIT compiler",
"The helper and kfuncs. It modifies or adds helpers and kernel functions that eBPF programs can call.": "eBPF helpers and kfuncs",
"The syscall interface. It modifies or adds code related to system calls between user-space and kernel-space.": "Syscall interface"
"The syscall interface. It changes the system calls through which user-space programs interact with eBPF.": "Syscall interface"
}

# Apply these replacements to the component column
Expand All @@ -42,6 +41,9 @@
main_components = ["eBPF events (tracepoints, perf, etc.)", "eBPF verifier", "eBPF maps", "eBPF JIT compiler", "eBPF helpers and kfuncs", "Syscall interface"]
filtered_buggy_components_cleaned['short_component'] = filtered_buggy_components_cleaned['short_component'].apply(lambda x: x if x in main_components else "Others")

# Count the number of bugs per component
component_bug_counts = filtered_buggy_components_cleaned['short_component'].value_counts()

# Extract file paths and count the occurrences
def extract_valid_file_paths(changed_files_entry):
file_paths = re.findall(r'([a-zA-Z0-9_/.-]+\.[ch])', changed_files_entry)
Expand All @@ -52,11 +54,18 @@ def extract_valid_file_paths(changed_files_entry):
# Top 10 files with the most bug fixes
top_valid_buggy_files = all_valid_changed_files.value_counts().head(10)

# Count number of files changed in each commit
filtered_buggy_components_cleaned['num_files_changed'] = filtered_buggy_components_cleaned['changed_files'].dropna().apply(lambda x: len(extract_valid_file_paths(x)))
# Save the pie chart of kernel components with most bugs to a variable
fig, ax = plt.subplots(figsize=(10, 6))
component_bug_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors, ax=ax)
# plt.title('Kernel Implementation Components with the Most Bugs')
plt.ylabel('')
plt.tight_layout()

# Save the figure to a variable
v = fig

# Distribution of commits by number of files changed
multiple_files_changed = filtered_buggy_components_cleaned['num_files_changed'].value_counts()
# Print the top 10 most buggy files
print("Top 10 Files with the Most Bug Fixes:\n", top_valid_buggy_files)

# Display the first few rows, top 10 files, and distribution of file changes
data_head, top_valid_buggy_files, multiple_files_changed
# Save the pie chart (also held in variable 'v') to a PDF file
plt.savefig('imgs/kernel_components_most_buggy_pie_chart.pdf')
2 changes: 1 addition & 1 deletion analysis/bpf/timeline_commits_3m.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)

ax.set_title(title, fontsize=16)
# ax.set_title(title, fontsize=16)
ax.set_xlabel('Time (3-Month Intervals)', fontsize=14)
ax.set_ylabel('Number of Commits', fontsize=14)

Expand Down
67 changes: 42 additions & 25 deletions analysis/bpf/timeline_commits_6m.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import matplotlib.dates as mdates
import os
import warnings
import math

# Optionally suppress FutureWarnings (not recommended for production)
# warnings.simplefilter(action='ignore', category=FutureWarning)
Expand Down Expand Up @@ -60,7 +61,7 @@ def parse_usecases(usecase_str):
# Update 'flattened_usecases' based on filtered data, excluding entries that match "not relate", "merge", or "sure"
flattened_usecases = pd.Series([
usecase for sublist in filtered_data['parsed_usecases'] for usecase in sublist
if not re.search(r"not relate", usecase, re.IGNORECASE)
if not re.search(r"not relate|merge|sure", usecase, re.IGNORECASE)
])

# Debug: Check the contents of flattened_usecases
Expand Down Expand Up @@ -162,7 +163,7 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path,
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)

ax.set_title(title, fontsize=16)
# ax.set_title(title, fontsize=16)
ax.set_xlabel('Time (6-Month Intervals)', fontsize=14)
ax.set_ylabel('Number of Commits (Smoothed)', fontsize=14)

Expand All @@ -175,28 +176,27 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path,
plt.close()

print(f"Saved smoothed timeline chart to {save_path}")

# Function to plot timeline for use cases or submodule events with smoothing
def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005, smoothing_window=2):
def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005, smoothing_window=2, max_categories_per_fig=6):
"""
Plot a frequency-based timeline chart for use cases or submodule events with smoothing.
Plot a frequency-based timeline chart for use cases or submodule events with smoothing,
organizing them into subplots within a single figure if there are too many categories.
Parameters:
- title: Title of the chart.
- save_path: File path to save the chart.
- max_labels: Maximum number of labels to display (including 'Other').
- threshold: Minimum frequency proportion to consider as significant.
- smoothing_window: Window size for moving average.
- max_categories_per_fig: Maximum number of categories to display per subplot.
"""
print(f"\nGenerating timeline for: {title}")

# Explode the 'parsed_usecases' lists into separate rows
exploded_data = filtered_data.explode('parsed_usecases')

# Remove "not related" and merge-related cases
filter_pattern = re.compile(r'not relate', re.IGNORECASE)
filter_pattern = re.compile(r'not relate|merge', re.IGNORECASE)
exploded_data = exploded_data[~exploded_data['parsed_usecases'].str.contains(filter_pattern, na=False)]

# Remove NaN entries
exploded_data = exploded_data.dropna(subset=['parsed_usecases'])

Expand Down Expand Up @@ -226,32 +226,49 @@ def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005, smoo
# Apply moving average for smoothing
smoothed_counts = apply_moving_average(monthly_counts, window=smoothing_window)

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
# Determine the number of subplots needed
total_categories = len(smoothed_counts.columns)
num_subplots = math.ceil(total_categories / max_categories_per_fig)
print(f"Total categories: {total_categories}, will be split into {num_subplots} subplot(s)")

# Plot each category
for column in smoothed_counts.columns:
ax.plot(smoothed_counts.index, smoothed_counts[column], label=column)
# Create a single figure with multiple subplots
fig, axes = plt.subplots(num_subplots, 1, figsize=(10, 4 * num_subplots), sharex=True)
if num_subplots == 1:
axes = [axes] # Make it iterable

# Formatting the x-axis with date labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
for subplot_num, ax in enumerate(axes):
start_idx = subplot_num * max_categories_per_fig
end_idx = start_idx + max_categories_per_fig
categories_subset = smoothed_counts.columns[start_idx:end_idx]

ax.set_title(title, fontsize=16)
ax.set_xlabel('Time (6-Month Intervals)', fontsize=14)
ax.set_ylabel('Number of Commits (Smoothed)', fontsize=14)
# Plot each category in the subset
for column in categories_subset:
ax.plot(smoothed_counts.index, smoothed_counts[column], label=column)

# Truncate long labels for the legend
truncated_labels = [label[:20] + '...' if len(label) > 20 else label for label in smoothed_counts.columns]
# Formatting the x-axis with date labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.tick_params(axis='x', rotation=45)

# Add the legend with truncated labels
ax.legend(truncated_labels, loc='upper left', bbox_to_anchor=(1, 1)) # Place legend outside the plot
# Set titles and labels
subplot_title = f"{title} (Subplot {subplot_num + 1}/{num_subplots})"
# ax.set_title(subplot_title, fontsize=16)
ax.set_xlabel('Time (3-Month Intervals)', fontsize=14)
ax.set_ylabel('Number of Commits (Smoothed)', fontsize=14)

plt.tight_layout()
plt.savefig(save_path)
# Truncate long labels for the legend
truncated_labels = [label[:20] + '...' if len(label) > 20 else label for label in categories_subset]

# Add the legend with truncated labels
ax.legend(truncated_labels, loc='upper left', bbox_to_anchor=(1, 1)) # Place legend outside the plot

plt.tight_layout()

# Save the single figure
plt.savefig(save_path, bbox_inches='tight')
plt.close()

print(f"Saved smoothed timeline chart to {save_path}")
print("Timeline generation completed.")

# Define thresholds and max_labels per field
field_settings = {
Expand Down
2 changes: 1 addition & 1 deletion analysis/bpf/timeline_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
plt.figure(figsize=(8, 5))
df_cumulative_fixed.plot(ax=plt.gca())

plt.title('Cumulative BPF Features Commit Timeline with BPF Features')
# plt.title('Cumulative BPF Features Commit Timeline with BPF Features')
plt.xlabel('Date')
plt.ylabel('Cumulative Count of Features')
plt.legend(title='Feature Type')
Expand Down
2 changes: 1 addition & 1 deletion analysis/bpf/timeline_helper_vs_kfunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
plt.plot(df_helper_cumulative.index, df_helper_cumulative.values, label='Helper')
plt.plot(df_kfunc_cumulative.index, df_kfunc_cumulative.values, label='Kfunc', linestyle='--')

plt.title('Cumulative BPF Features: Helper vs Kfunc')
# plt.title('Cumulative BPF Features: Helper vs Kfunc')
plt.xlabel('Date')
plt.ylabel('Cumulative Count of Features')
plt.legend(title='Feature Type')
Expand Down
2 changes: 1 addition & 1 deletion analysis/bpf/timeline_link_attach.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
for feature_type in ['sock_ops', 'sock_opt_types', 'link_type', 'attach_types']:
plt.plot(df_cumulative_fixed.index, df_cumulative_fixed[feature_type], label=feature_type)

plt.title('Cumulative BPF Features: sock_ops, sock_opt_types, link_type, attach_types')
# plt.title('Cumulative BPF Features: sock_ops, sock_opt_types, link_type, attach_types')
plt.xlabel('Date')
plt.ylabel('Cumulative Count of Features')
plt.legend(title='Feature Type')
Expand Down
2 changes: 1 addition & 1 deletion analysis/bpf/timeline_verifier_related_bug_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
)

# Add titles and labels
plt.title('Verifier Instruction Modification vs. General Verifier Bugs Over Time')
# plt.title('Verifier Instruction Modification vs. General Verifier Bugs Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Commits')
plt.grid(True)
Expand Down
2 changes: 1 addition & 1 deletion analysis/bpf/timeline_without_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
plt.figure(figsize=(8, 5))
df_cumulative_fixed.plot(ax=plt.gca())

plt.title('Cumulative BPF Features Commit Timeline without Helper/Kfunc')
# plt.title('Cumulative BPF Features Commit Timeline without Helper/Kfunc')
plt.xlabel('Date')
plt.ylabel('Cumulative Count of Features')
plt.legend(title='Feature Type')
Expand Down
Binary file modified imgs/commit_pie_chart_commit_classification.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/commit_pie_chart_major_implementation_component.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/commit_pie_chart_major_logic_component.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/commit_pie_chart_usecases_or_submodule_events.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/cumulative_bpf_features_timeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/cumulative_bpf_features_timeline_no_helper_kfunc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/cumulative_helper_kfunc_timeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/cumulative_sock_link_features_timeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file modified imgs/timeline_commit_classification_smoothed.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/timeline_commit_complexity_smoothed.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/timeline_libbpf_commit_classification_smoothed.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/timeline_major_related_logic_component_smoothed.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/timeline_usecases_or_submodule_events_smoothed.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imgs/verifier_features_vs_general_bugs_over_time.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 8eb91a3

Please sign in to comment.