update imgs

eunomia-bpf · Sep 23, 2024 · 8eb91a3 · 8eb91a3
1 parent 3a0a65e
commit 8eb91a3
Show file tree

Hide file tree

Showing 26 changed files with 95 additions and 57 deletions.
diff --git a/analysis/bpf/distribution_commits.py b/analysis/bpf/distribution_commits.py
@@ -251,15 +251,15 @@ def has_event_related_usecase(usecase_list):
 not_related_implementation_percentage = (not_related_implementation_count / total_commits) * 100
 not_related_logic_percentage = (not_related_logic_count / total_commits) * 100
 
-# Output the percentage of commits marked as "Not related to BPF subsystem"
-print(f"Number of commits marked as 'Not related to BPF subsystem' in Implementation Component: {not_related_implementation_count} ({not_related_implementation_percentage:.2f}% of total commits)")
-print(f"Number of commits marked as 'Not related to BPF subsystem' in Logic Component: {not_related_logic_count} ({not_related_logic_percentage:.2f}% of total commits)")
-
-# Check if there's any significant mismatch in these percentages
-if not_related_implementation_percentage > 5 or not_related_logic_percentage > 5:
-    print("\nThere is a significant number of commits marked as 'Not related to BPF subsystem'. This indicates potential data quality issues and suggests that data cleaning may be necessary to improve analysis correctness.")
-else:
-    print("\nThe number of commits marked as 'Not related to BPF subsystem' is relatively low and does not significantly affect overall analysis correctness.")
+# # Output the percentage of commits marked as "Not related to BPF subsystem"
+# print(f"Number of commits marked as 'Not related to BPF subsystem' in Implementation Component: {not_related_implementation_count} ({not_related_implementation_percentage:.2f}% of total commits)")
+# print(f"Number of commits marked as 'Not related to BPF subsystem' in Logic Component: {not_related_logic_count} ({not_related_logic_percentage:.2f}% of total commits)")
+
+# # Check if there's any significant mismatch in these percentages
+# if not_related_implementation_percentage > 5 or not_related_logic_percentage > 5:
+#     print("\nThere is a significant number of commits marked as 'Not related to BPF subsystem'. This indicates potential data quality issues and suggests that data cleaning may be necessary to improve analysis correctness.")
+# else:
+#     print("\nThe number of commits marked as 'Not related to BPF subsystem' is relatively low and does not significantly affect overall analysis correctness.")
 
 print("\nSample Commit Messages of 'Not related to BPF subsystem' in Implementation Component:")
 print(not_related_implementation_commit_messages)

diff --git a/analysis/bpf/pie_commits.py b/analysis/bpf/pie_commits.py
@@ -23,7 +23,7 @@ def plot_commit_classification_pie():
     # Plot the pie chart
     fig, ax = plt.subplots(figsize=(7, 5))
     ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
-    ax.set_title('Commit Classification', fontsize=12)
+    # ax.set_title('Commit Classification', fontsize=12)
 
     # Save the figure
     plt.savefig('imgs/commit_pie_chart_commit_classification.png')
@@ -51,8 +51,21 @@ def plot_commit_complexity_pie():
     plt.savefig('imgs/commit_pie_chart_commit_complexity.png')
     plt.close()
 
+# Mapping long labels to short summaries
+label_replacements = {
+    "The implementation happens in other subsystem and is related to eBPF events. e.g. probes perf events tracepoints network scheduler HID LSM etc. Note it's still related to how eBPF programs interact with these events.": "eBPF events (tracepoints, perf, etc.)",
+    "The eBPF verifier. This component ensures that eBPF programs are safe to run within the kernel.": "eBPF verifier",
+    "The eBPF maps. It changes how data structures shared between user-space and kernel-space (maps) are created or managed.": "eBPF maps",
+    "The eBPF JIT compiler for different architectures. It changes how eBPF bytecode is translated into machine code for different hardware architectures.": "eBPF JIT compiler",
+    "The helper and kfuncs. It modifies or adds helpers and kernel functions that eBPF programs can call.": "eBPF helpers and kfuncs",
+    "The syscall interface. It changes the system calls through which user-space programs interact with eBPF.": "Syscall interface"
+}
+
 # Function to plot pie chart for major implementation component
 def plot_implementation_component_pie():
+    # Apply the label replacements
+    survey_data['major_related_implementation_component'] = survey_data['major_related_implementation_component'].replace(label_replacements)
+
     # Get the value counts for major implementation component
     value_counts = survey_data['major_related_implementation_component'].value_counts()
 
@@ -62,12 +75,11 @@ def plot_implementation_component_pie():
         value_counts = value_counts[:max_labels]._append(pd.Series([value_counts[max_labels:].sum()], index=['Other']))
 
     # Truncate labels for better readability
-    truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]
+    truncated_labels = [label[:30] + '...' if len(label) > 10 else label for label in value_counts.index]
 
     # Plot the pie chart
     fig, ax = plt.subplots(figsize=(7, 5))
     ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
-    ax.set_title('Major Implementation Component', fontsize=12)
 
     # Save the figure
     plt.savefig('imgs/commit_pie_chart_major_implementation_component.png')
@@ -89,7 +101,7 @@ def plot_logic_component_pie():
     # Plot the pie chart
     fig, ax = plt.subplots(figsize=(7, 5))
     ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
-    ax.set_title('Major Logic Component', fontsize=12)
+    # ax.set_title('Major Logic Component', fontsize=12)
 
     # Save the figure
     plt.savefig('imgs/commit_pie_chart_major_logic_component.png')
@@ -127,7 +139,7 @@ def plot_usecases_or_submodule_pie():
     value_counts = flattened_usecases.value_counts()
 
     # Aggregate smaller labels into "Other" if needed
-    max_labels = 8
+    max_labels = 10
     if len(value_counts) > max_labels:
         value_counts = value_counts[:max_labels]._append(pd.Series([value_counts[max_labels:].sum()], index=['Other']))
 
@@ -137,7 +149,7 @@ def plot_usecases_or_submodule_pie():
     # Plot the pie chart
     fig, ax = plt.subplots(figsize=(7, 5))
     ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
-    ax.set_title('Use Cases or Submodule Events', fontsize=12)
+    # ax.set_title('Use Cases or Submodule Events', fontsize=12)
 
     # Save the figure
     plt.savefig('imgs/commit_pie_chart_usecases_or_submodule_events.png')

diff --git a/analysis/bpf/pie_most_buggy_componenet.py b/analysis/bpf/pie_most_buggy_componenet.py
@@ -1,12 +1,11 @@
 import pandas as pd
+import re
+import matplotlib.pyplot as plt
 
 # Load the CSV file for analysis
-file_path = '/mnt/data/commit_survey (19).csv'
+file_path = 'data/commit_survey.csv'  # Replace with your file path
 data = pd.read_csv(file_path)
 
-# Display the first few rows to understand its structure
-data_head = data.head()
-
 # Filter out commits related to "bug" or "fix" in the commit classification
 buggy_commits = data[data['commit_classification'].str.contains("bug|fix", case=False, na=False)]
 
@@ -32,7 +31,7 @@
     "The eBPF maps. It changes how data structures shared between user-space and kernel-space (maps) are created or managed.": "eBPF maps",
     "The eBPF JIT compiler for different architectures. It changes how eBPF bytecode is translated into machine code for different hardware architectures.": "eBPF JIT compiler",
     "The helper and kfuncs. It modifies or adds helpers and kernel functions that eBPF programs can call.": "eBPF helpers and kfuncs",
-    "The syscall interface. It modifies or adds code related to system calls between user-space and kernel-space.": "Syscall interface"
+    "The syscall interface. It changes the system calls through which user-space programs interact with eBPF.": "Syscall interface"
 }
 
 # Apply these replacements to the component column
@@ -42,6 +41,9 @@
 main_components = ["eBPF events (tracepoints, perf, etc.)", "eBPF verifier", "eBPF maps", "eBPF JIT compiler", "eBPF helpers and kfuncs", "Syscall interface"]
 filtered_buggy_components_cleaned['short_component'] = filtered_buggy_components_cleaned['short_component'].apply(lambda x: x if x in main_components else "Others")
 
+# Count the number of bugs per component
+component_bug_counts = filtered_buggy_components_cleaned['short_component'].value_counts()
+
 # Extract file paths and count the occurrences
 def extract_valid_file_paths(changed_files_entry):
     file_paths = re.findall(r'([a-zA-Z0-9_/.-]+\.[ch])', changed_files_entry)
@@ -52,11 +54,18 @@ def extract_valid_file_paths(changed_files_entry):
 # Top 10 files with the most bug fixes
 top_valid_buggy_files = all_valid_changed_files.value_counts().head(10)
 
-# Count number of files changed in each commit
-filtered_buggy_components_cleaned['num_files_changed'] = filtered_buggy_components_cleaned['changed_files'].dropna().apply(lambda x: len(extract_valid_file_paths(x)))
+# Build the pie chart of kernel components with the most bugs
+fig, ax = plt.subplots(figsize=(10, 6))
+component_bug_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors, ax=ax)
+# plt.title('Kernel Implementation Components with the Most Bugs')
+plt.ylabel('')
+plt.tight_layout()
+
+# Save the figure to a variable
+v = fig
 
-# Distribution of commits by number of files changed
-multiple_files_changed = filtered_buggy_components_cleaned['num_files_changed'].value_counts()
+# Print the top 10 most buggy files
+print("Top 10 Files with the Most Bug Fixes:\n", top_valid_buggy_files)
 
-# Display the first few rows, top 10 files, and distribution of file changes
-data_head, top_valid_buggy_files, multiple_files_changed
+# Save the pie chart to a PDF file (the figure object is also kept in variable 'v')
+plt.savefig('imgs/kernel_components_most_buggy_pie_chart.pdf')
diff --git a/analysis/bpf/timeline_commits_3m.py b/analysis/bpf/timeline_commits_3m.py
@@ -127,7 +127,7 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path)
     ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
     plt.xticks(rotation=45)
 
-    ax.set_title(title, fontsize=16)
+    # ax.set_title(title, fontsize=16)
     ax.set_xlabel('Time (3-Month Intervals)', fontsize=14)
     ax.set_ylabel('Number of Commits', fontsize=14)
 

diff --git a/analysis/bpf/timeline_commits_6m.py b/analysis/bpf/timeline_commits_6m.py
@@ -5,6 +5,7 @@
 import matplotlib.dates as mdates
 import os
 import warnings
+import math
 
 # Optionally suppress FutureWarnings (not recommended for production)
 # warnings.simplefilter(action='ignore', category=FutureWarning)
@@ -60,7 +61,7 @@ def parse_usecases(usecase_str):
 # Update 'flattened_usecases' based on filtered data, excluding "not related" strings
 flattened_usecases = pd.Series([
     usecase for sublist in filtered_data['parsed_usecases'] for usecase in sublist
-    if not re.search(r"not relate", usecase, re.IGNORECASE)
+    if not re.search(r"not relate|merge|sure", usecase, re.IGNORECASE)
 ])
 
 # Debug: Check the contents of flattened_usecases
@@ -162,7 +163,7 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path,
     ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
     plt.xticks(rotation=45)
 
-    ax.set_title(title, fontsize=16)
+    # ax.set_title(title, fontsize=16)
     ax.set_xlabel('Time (6-Month Intervals)', fontsize=14)
     ax.set_ylabel('Number of Commits (Smoothed)', fontsize=14)
 
@@ -175,28 +176,27 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path,
     plt.close()
 
     print(f"Saved smoothed timeline chart to {save_path}")
-
-# Function to plot timeline for use cases or submodule events with smoothing
-def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005, smoothing_window=2):
+def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005, smoothing_window=2, max_categories_per_fig=6):
     """
-    Plot a frequency-based timeline chart for use cases or submodule events with smoothing.
+    Plot a frequency-based timeline chart for use cases or submodule events with smoothing,
+    organizing them into subplots within a single figure if there are too many categories.
 
     Parameters:
     - title: Title of the chart.
     - save_path: File path to save the chart.
     - max_labels: Maximum number of labels to display (including 'Other').
     - threshold: Minimum frequency proportion to consider as significant.
     - smoothing_window: Window size for moving average.
+    - max_categories_per_fig: Maximum number of categories to display per subplot.
     """
     print(f"\nGenerating timeline for: {title}")
 
     # Explode the 'parsed_usecases' lists into separate rows
     exploded_data = filtered_data.explode('parsed_usecases')
 
     # Remove "not related" cases
-    filter_pattern = re.compile(r'not relate', re.IGNORECASE)
+    filter_pattern = re.compile(r'not relate|merge', re.IGNORECASE)
     exploded_data = exploded_data[~exploded_data['parsed_usecases'].str.contains(filter_pattern, na=False)]
-
     # Remove NaN entries
     exploded_data = exploded_data.dropna(subset=['parsed_usecases'])
 
@@ -226,32 +226,49 @@ def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005, smoo
     # Apply moving average for smoothing
     smoothed_counts = apply_moving_average(monthly_counts, window=smoothing_window)
 
-    # Plotting
-    fig, ax = plt.subplots(figsize=(10, 6))
+    # Determine the number of subplots needed
+    total_categories = len(smoothed_counts.columns)
+    num_subplots = math.ceil(total_categories / max_categories_per_fig)
+    print(f"Total categories: {total_categories}, will be split into {num_subplots} subplot(s)")
 
-    # Plot each category
-    for column in smoothed_counts.columns:
-        ax.plot(smoothed_counts.index, smoothed_counts[column], label=column)
+    # Create a single figure with multiple subplots
+    fig, axes = plt.subplots(num_subplots, 1, figsize=(10, 4 * num_subplots), sharex=True)
+    if num_subplots == 1:
+        axes = [axes]  # Make it iterable
 
-    # Formatting the x-axis with date labels
-    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
-    plt.xticks(rotation=45)
+    for subplot_num, ax in enumerate(axes):
+        start_idx = subplot_num * max_categories_per_fig
+        end_idx = start_idx + max_categories_per_fig
+        categories_subset = smoothed_counts.columns[start_idx:end_idx]
 
-    ax.set_title(title, fontsize=16)
-    ax.set_xlabel('Time (6-Month Intervals)', fontsize=14)
-    ax.set_ylabel('Number of Commits (Smoothed)', fontsize=14)
+        # Plot each category in the subset
+        for column in categories_subset:
+            ax.plot(smoothed_counts.index, smoothed_counts[column], label=column)
 
-    # Truncate long labels for the legend
-    truncated_labels = [label[:20] + '...' if len(label) > 20 else label for label in smoothed_counts.columns]
+        # Formatting the x-axis with date labels
+        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
+        ax.tick_params(axis='x', rotation=45)
 
-    # Add the legend with truncated labels
-    ax.legend(truncated_labels, loc='upper left', bbox_to_anchor=(1, 1))  # Place legend outside the plot
+        # Set titles and labels
+        subplot_title = f"{title} (Subplot {subplot_num + 1}/{num_subplots})"
+        # ax.set_title(subplot_title, fontsize=16)
+        ax.set_xlabel('Time (3-Month Intervals)', fontsize=14)
+        ax.set_ylabel('Number of Commits (Smoothed)', fontsize=14)
 
-    plt.tight_layout()
-    plt.savefig(save_path)
+        # Truncate long labels for the legend
+        truncated_labels = [label[:20] + '...' if len(label) > 20 else label for label in categories_subset]
+
+        # Add the legend with truncated labels
+        ax.legend(truncated_labels, loc='upper left', bbox_to_anchor=(1, 1))  # Place legend outside the plot
+
+        plt.tight_layout()
+
+    # Save the single figure
+    plt.savefig(save_path, bbox_inches='tight')
     plt.close()
 
     print(f"Saved smoothed timeline chart to {save_path}")
+    print("Timeline generation completed.")
 
 # Define thresholds and max_labels per field
 field_settings = {

diff --git a/analysis/bpf/timeline_features.py b/analysis/bpf/timeline_features.py
@@ -39,7 +39,7 @@
 plt.figure(figsize=(8, 5))
 df_cumulative_fixed.plot(ax=plt.gca())
 
-plt.title('Cumulative BPF Features Commit Timeline with BPF Features')
+# plt.title('Cumulative BPF Features Commit Timeline with BPF Features')
 plt.xlabel('Date')
 plt.ylabel('Cumulative Count of Features')
 plt.legend(title='Feature Type')

diff --git a/analysis/bpf/timeline_helper_vs_kfunc.py b/analysis/bpf/timeline_helper_vs_kfunc.py
@@ -54,7 +54,7 @@
 plt.plot(df_helper_cumulative.index, df_helper_cumulative.values, label='Helper')
 plt.plot(df_kfunc_cumulative.index, df_kfunc_cumulative.values, label='Kfunc', linestyle='--')
 
-plt.title('Cumulative BPF Features: Helper vs Kfunc')
+# plt.title('Cumulative BPF Features: Helper vs Kfunc')
 plt.xlabel('Date')
 plt.ylabel('Cumulative Count of Features')
 plt.legend(title='Feature Type')

diff --git a/analysis/bpf/timeline_link_attach.py b/analysis/bpf/timeline_link_attach.py
@@ -42,7 +42,7 @@
 for feature_type in ['sock_ops', 'sock_opt_types', 'link_type', 'attach_types']:
     plt.plot(df_cumulative_fixed.index, df_cumulative_fixed[feature_type], label=feature_type)
 
-plt.title('Cumulative BPF Features: sock_ops, sock_opt_types, link_type, attach_types')
+# plt.title('Cumulative BPF Features: sock_ops, sock_opt_types, link_type, attach_types')
 plt.xlabel('Date')
 plt.ylabel('Cumulative Count of Features')
 plt.legend(title='Feature Type')

diff --git a/analysis/bpf/timeline_verifier_related_bug_feature.py b/analysis/bpf/timeline_verifier_related_bug_feature.py
@@ -70,7 +70,7 @@
 )
 
 # Add titles and labels
-plt.title('Verifier Instruction Modification vs. General Verifier Bugs Over Time')
+# plt.title('Verifier Instruction Modification vs. General Verifier Bugs Over Time')
 plt.xlabel('Date')
 plt.ylabel('Number of Commits')
 plt.grid(True)

diff --git a/analysis/bpf/timeline_without_helper.py b/analysis/bpf/timeline_without_helper.py
@@ -37,7 +37,7 @@
 plt.figure(figsize=(8, 5))
 df_cumulative_fixed.plot(ax=plt.gca())
 
-plt.title('Cumulative BPF Features Commit Timeline without Helper/Kfunc')
+# plt.title('Cumulative BPF Features Commit Timeline without Helper/Kfunc')
 plt.xlabel('Date')
 plt.ylabel('Cumulative Count of Features')
 plt.legend(title='Feature Type')

diff --git a/imgs/commit_pie_chart_commit_classification.png b/imgs/commit_pie_chart_commit_classification.png
diff --git a/imgs/commit_pie_chart_major_implementation_component.png b/imgs/commit_pie_chart_major_implementation_component.png
diff --git a/imgs/commit_pie_chart_major_logic_component.png b/imgs/commit_pie_chart_major_logic_component.png
diff --git a/imgs/commit_pie_chart_usecases_or_submodule_events.png b/imgs/commit_pie_chart_usecases_or_submodule_events.png
diff --git a/imgs/cumulative_bpf_features_timeline.png b/imgs/cumulative_bpf_features_timeline.png
diff --git a/imgs/cumulative_bpf_features_timeline_no_helper_kfunc.png b/imgs/cumulative_bpf_features_timeline_no_helper_kfunc.png
diff --git a/imgs/cumulative_helper_kfunc_timeline.png b/imgs/cumulative_helper_kfunc_timeline.png
diff --git a/imgs/cumulative_sock_link_features_timeline.png b/imgs/cumulative_sock_link_features_timeline.png
diff --git a/imgs/kernel_components_most_buggy_pie_chart.pdf b/imgs/kernel_components_most_buggy_pie_chart.pdf
diff --git a/imgs/timeline_commit_classification_smoothed.png b/imgs/timeline_commit_classification_smoothed.png
diff --git a/imgs/timeline_commit_complexity_smoothed.png b/imgs/timeline_commit_complexity_smoothed.png
diff --git a/imgs/timeline_libbpf_commit_classification_smoothed.png b/imgs/timeline_libbpf_commit_classification_smoothed.png
diff --git a/imgs/timeline_major_related_implementation_component_smoothed.png b/imgs/timeline_major_related_implementation_component_smoothed.png
diff --git a/imgs/timeline_major_related_logic_component_smoothed.png b/imgs/timeline_major_related_logic_component_smoothed.png
diff --git a/imgs/timeline_usecases_or_submodule_events_smoothed.png b/imgs/timeline_usecases_or_submodule_events_smoothed.png
diff --git a/imgs/verifier_features_vs_general_bugs_over_time.png b/imgs/verifier_features_vs_general_bugs_over_time.png