Skip to content

Commit

Permalink
new imgs
Browse files Browse the repository at this point in the history
  • Loading branch information
yunwei37 committed Sep 17, 2024
1 parent ed9f0c1 commit bb901f9
Show file tree
Hide file tree
Showing 32 changed files with 310,017 additions and 163 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ Code-Survey helps you `explore` and `analyze`. the world's largest and most intr

With the power of AI and Large Language Models (LLMs), you can ask questions, run queries, and gain a deeper understanding of how systems evolve over time. AI agents can also help you with that analysis. Whether you're a developer, researcher, or enthusiast, Code-Survey bridges the gap between `design`, `implementation`, `maintenance`, `reliability` and `security`, making complex systems more accessible.


Unlike other approaches:

- **No human could do this before, but AI can.**
Expand Down
10 changes: 5 additions & 5 deletions analysis/bpf/pie_commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def plot_commit_classification_pie():
truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]

# Plot the pie chart
fig, ax = plt.subplots(figsize=(8, 8))
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Commit Classification', fontsize=12)

Expand All @@ -43,7 +43,7 @@ def plot_commit_complexity_pie():
truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]

# Plot the pie chart
fig, ax = plt.subplots(figsize=(8, 8))
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Commit Complexity', fontsize=12)

Expand All @@ -65,7 +65,7 @@ def plot_implementation_component_pie():
truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]

# Plot the pie chart
fig, ax = plt.subplots(figsize=(8, 8))
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Major Implementation Component', fontsize=12)

Expand All @@ -87,7 +87,7 @@ def plot_logic_component_pie():
truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]

# Plot the pie chart
fig, ax = plt.subplots(figsize=(8, 8))
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Major Logic Component', fontsize=12)

Expand Down Expand Up @@ -135,7 +135,7 @@ def plot_usecases_or_submodule_pie():
truncated_labels = [label[:20] + '...' if len(label) > 10 else label for label in value_counts.index]

# Plot the pie chart
fig, ax = plt.subplots(figsize=(8, 8))
fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(value_counts, labels=truncated_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Use Cases or Submodule Events', fontsize=12)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,14 @@ def get_significant_categories(series, max_labels, threshold=0.01):
"""
# Calculate frequency proportion
freq = series.value_counts(normalize=True)

# Determine categories above the threshold
significant = freq[freq >= threshold].index.tolist()

# If significant categories exceed max_labels, take the top max_labels -1 and add 'Other'
if len(significant) > (max_labels - 1):
significant = freq.nlargest(max_labels - 1).index.tolist()

return significant

# Function to plot frequency-based timeline charts for categorical fields
Expand All @@ -93,17 +93,18 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path)
- save_path: File path to save the chart.
"""
print(f"\nGenerating timeline for: {title}")

# Group by month and category, count commits
monthly_counts = filtered_data.resample('M')[field_name].value_counts().unstack(fill_value=0)


# Group by 3-month intervals and category, count commits
# '3M' stands for 3-month frequency; alternatively, you can use 'Q' for quarters
monthly_counts = filtered_data.resample('3M')[field_name].value_counts().unstack(fill_value=0)

# Determine significant categories
if field_name == 'usecases_or_submodule_events':
data_series = flattened_usecases
else:
data_series = filtered_data[field_name]
significant_categories = get_significant_categories(data_series, max_labels, threshold)

# If 'Other' needs to be added, sum the non-significant categories
if len(significant_categories) < len(monthly_counts.columns):
# Ensure 'Other' is not already a category
Expand All @@ -114,28 +115,32 @@ def plot_frequency_timeline(field_name, title, max_labels, threshold, save_path)
monthly_counts['Other'] += monthly_counts.drop(significant_categories, axis=1).sum(axis=1)
# Keep only significant categories and 'Other'
monthly_counts = monthly_counts[significant_categories + ['Other']]

# Plotting
fig, ax = plt.subplots(figsize=(14, 14))
fig, ax = plt.subplots(figsize=(14, 8))

# Plot each category
for column in monthly_counts.columns:
ax.plot(monthly_counts.index, monthly_counts[column], label=column)

# Formatting the x-axis with date labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)

ax.set_title(title, fontsize=16)
ax.set_xlabel('Time (Monthly)', fontsize=14)
ax.set_xlabel('Time (3-Month Intervals)', fontsize=14)
ax.set_ylabel('Number of Commits', fontsize=14)
ax.legend(loc='upper left', bbox_to_anchor=(1,1)) # Place legend outside the plot

# plt.tight_layout()

truncated_labels = [label[:20] + '...' if len(label) > 20 else label for label in monthly_counts.columns]

ax.legend(truncated_labels, loc='upper left', bbox_to_anchor=(1,1)) # Place legend outside the plot

plt.tight_layout()
plt.savefig(save_path)
plt.close()

print(f"Saved timeline chart to {save_path}")

# Function to plot timeline for use cases or submodule events
def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005):
"""
Expand All @@ -148,7 +153,7 @@ def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005):
- threshold: Minimum frequency proportion to consider as significant.
"""
print(f"\nGenerating timeline for: {title}")

# Explode the 'parsed_usecases' lists into separate rows
exploded_data = filtered_data.explode('parsed_usecases')

Expand All @@ -159,12 +164,12 @@ def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005):
# Remove NaN entries
exploded_data = exploded_data.dropna(subset=['parsed_usecases'])

# Group by month and use case, count commits
monthly_counts = exploded_data.resample('ME')['parsed_usecases'].value_counts().unstack(fill_value=0)
# Group by 3-month intervals and use case, count commits
monthly_counts = exploded_data.resample('3M')['parsed_usecases'].value_counts().unstack(fill_value=0)

# Determine significant categories
significant_categories = get_significant_categories(flattened_usecases, max_labels, threshold)

# Only drop categories that exist in the DataFrame
columns_to_drop = [col for col in monthly_counts.columns if col not in significant_categories]

Expand All @@ -173,32 +178,32 @@ def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005):
# Add 'Other' column as the sum of non-significant categories
monthly_counts['Other'] = monthly_counts[columns_to_drop].sum(axis=1)
monthly_counts = monthly_counts.drop(columns=columns_to_drop, axis=1)

# Plotting
fig, ax = plt.subplots(figsize=(14, 14))
fig, ax = plt.subplots(figsize=(14, 8))

# Plot each category
for column in monthly_counts.columns:
ax.plot(monthly_counts.index, monthly_counts[column], label=column)

# Formatting the x-axis with date labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)

ax.set_title(title, fontsize=16)
ax.set_xlabel('Time (Monthly)', fontsize=14)
ax.set_xlabel('Time (3-Month Intervals)', fontsize=14)
ax.set_ylabel('Number of Commits', fontsize=14)

# Truncate long labels for the legend
truncated_labels = [label[:20] + '...' if len(label) > 20 else label for label in monthly_counts.columns]

# Add the legend with truncated labels
ax.legend(truncated_labels, loc='upper left', bbox_to_anchor=(1, 1)) # Place legend outside the plot

plt.tight_layout()
plt.savefig(save_path)
plt.close()

print(f"Saved timeline chart to {save_path}")

# Define thresholds and max_labels per field
Expand All @@ -224,7 +229,7 @@ def plot_usecases_timeline(title, save_path, max_labels=8, threshold=0.005):
plot_usecases_timeline(
title='Use Cases or Submodule Events Over Time',
save_path='imgs/timeline_usecases_or_submodule_events.png',
max_labels=10,
max_labels=12,
threshold=0.005 # Adjusted threshold for more use cases
)

Expand Down
Loading

0 comments on commit bb901f9

Please sign in to comment.