Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add and update commit frequency metrics #173

Merged
merged 10 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ root = "."
[tool.ruff]
target-version = "py311"
fix = true

lint.select = [
# mccabe
"C90",
Expand Down
43 changes: 31 additions & 12 deletions src/almanack/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pathlib
import tempfile
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

import pygit2
from charset_normalizer import from_bytes
Expand Down Expand Up @@ -42,9 +42,8 @@ def get_commits(repo: pygit2.Repository) -> List[pygit2.Commit]:
# Get the latest commit (HEAD) from the repository
head = repo.revparse_single("HEAD")
# Create a walker to iterate over commits starting from the HEAD
walker = repo.walk(
head.id, pygit2.enums.SortMode.NONE
) # SortMode.NONE traverses commits in natural order; no sorting applied.
# Walk the history sorted by commit time.
walker = repo.walk(head.id, pygit2.GIT_SORT_TIME)
# Collect all commits from the walker into a list
commits = list(walker)
return commits
Expand Down Expand Up @@ -147,14 +146,6 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]:
return str(source_commit.id), str(target_commit.id)


"""
Module for handling various tasks with git repo blobs.
"""


import pygit2


def detect_encoding(blob_data: bytes) -> str:
"""
Detect the encoding of the given blob data using charset-normalizer.
Expand Down Expand Up @@ -227,6 +218,34 @@ def find_file(
return found_entry


def count_files(tree: Union[pygit2.Tree, pygit2.Blob]) -> int:
    """
    Counts all files (Blobs) within a Git tree, including files
    in subdirectories.

    This function recursively traverses the provided `tree`
    object to count each file, represented as a `pygit2.Blob`,
    within the tree and any nested subdirectories.

    Args:
        tree (Union[pygit2.Tree, pygit2.Blob]):
            The Git tree object (of type `pygit2.Tree`)
            to traverse and count files. The initial call
            should be made with the root tree of a commit.

    Returns:
        int:
            The total count of files (Blobs) within the tree,
            including nested files in subdirectories. Objects that
            are neither a Blob nor a Tree contribute 0.
    """
    if isinstance(tree, pygit2.Blob):
        # A single file counts as one
        return 1
    if isinstance(tree, pygit2.Tree):
        # Recursively count files for every entry of the Tree
        return sum(count_files(entry) for entry in tree)
    # Any other entry type is not a file. Returning 0 instead of the
    # implicit None keeps the recursive `sum` above from raising TypeError.
    # NOTE(review): submodule entries presumably land here - confirm.
    return 0


def read_file(
repo: pygit2.Repository,
entry: Optional[pygit2.Object] = None,
Expand Down
181 changes: 112 additions & 69 deletions src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
import pygit2
import yaml

from ..git import clone_repository, find_file, get_commits, get_edited_files, read_file
from ..git import (
clone_repository,
count_files,
find_file,
get_commits,
get_edited_files,
read_file,
)
from .entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down Expand Up @@ -195,6 +202,41 @@ def default_branch_is_not_master(repo: pygit2.Repository) -> bool:
return repo.head.shorthand != "master"


def days_of_development(repo: pygit2.Repository) -> int:
    """
    Count the calendar days spanned by the repository's commit history.

    The span is inclusive: a repository whose commits all fall on the
    same date yields 1.

    Args:
        repo (pygit2.Repository): The repository to inspect.

    Returns:
        int: Number of days from the date of the earliest commit to the
            date of the latest commit reachable from HEAD (both ends
            included), or 0 when the repository has no commits.
    """
    # Imported locally so this function does not depend on the module
    # header still importing `timezone`.
    from datetime import timezone

    try:
        # Resolving HEAD fails with KeyError when there are no commits.
        head = repo.revparse_single("HEAD")
    except KeyError:
        # If HEAD doesn't exist (repo is empty), there are 0 days.
        return 0

    # Collect the raw commit timestamps (seconds since the epoch)
    commit_times = [
        commit.commit_time for commit in repo.walk(head.id, pygit2.GIT_SORT_TIME)
    ]

    # If no commits, return 0
    if not commit_times:
        return 0

    # Convert in UTC so the result does not depend on the timezone of the
    # machine running the analysis.
    first_day = datetime.fromtimestamp(min(commit_times), tz=timezone.utc).date()
    last_day = datetime.fromtimestamp(max(commit_times), tz=timezone.utc).date()

    # +1 to include the first day
    return (last_day - first_day).days + 1


def includes_common_docs(repo: pygit2.Repository) -> bool:
"""
Check whether the repo includes common documentation files and directories
Expand Down Expand Up @@ -244,74 +286,75 @@ def compute_repo_data(repo_path: str) -> None:
Returns:
dict: A dictionary containing data key-pairs.
"""
try:
# Convert repo_path to an absolute path and initialize the repository
repo_path = pathlib.Path(repo_path).resolve()
repo = pygit2.Repository(str(repo_path))

# Retrieve the list of commits from the repository
commits = get_commits(repo)
most_recent_commit = commits[0]
first_commit = commits[-1]

# Get a list of files that have been edited between the first and most recent commit
file_names = get_edited_files(repo, first_commit, most_recent_commit)

# Calculate the normalized total entropy for the repository
normalized_total_entropy = calculate_aggregate_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
file_names,
)

# Calculate the normalized entropy for the changes between the first and most recent commits
file_entropy = calculate_normalized_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
file_names,
)
# Convert commit times to UTC datetime objects, then format as date strings.
first_commit_date, most_recent_commit_date = (
datetime.fromtimestamp(commit.commit_time, tz=timezone.utc)
.date()
.isoformat()
for commit in (first_commit, most_recent_commit)
)

# Return the data structure
return {
"repo-path": str(repo_path),
"repo-commits": len(commits),
"repo-file-count": len(file_names),
"repo-commit-time-range": (first_commit_date, most_recent_commit_date),
"repo-includes-readme": file_exists_in_repo(
repo=repo,
expected_file_name="readme",
),
"repo-includes-contributing": file_exists_in_repo(
repo=repo,
expected_file_name="contributing",
),
"repo-includes-code-of-conduct": file_exists_in_repo(
repo=repo,
expected_file_name="code_of_conduct",
),
"repo-includes-license": file_exists_in_repo(
repo=repo,
expected_file_name="license",
),
"repo-is-citable": is_citable(repo=repo),
"repo-default-branch-not-master": default_branch_is_not_master(repo=repo),
"repo-includes-common-docs": includes_common_docs(repo=repo),
"repo-agg-info-entropy": normalized_total_entropy,
"repo-file-info-entropy": file_entropy,
}

except Exception as e:
# If processing fails, return an error dictionary
return {"repo_path": str(repo_path), "error": str(e)}
# Convert repo_path to an absolute path and initialize the repository
repo_path = pathlib.Path(repo_path).resolve()
repo = pygit2.Repository(str(repo_path))

# Retrieve the list of commits from the repository
commits = get_commits(repo)
most_recent_commit = commits[0]
first_commit = commits[-1]

# Get a list of files that have been edited between the first and most recent commit
edited_file_names = get_edited_files(repo, first_commit, most_recent_commit)

# Calculate the normalized total entropy for the repository
normalized_total_entropy = calculate_aggregate_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
edited_file_names,
)

# Calculate the normalized entropy for the changes between the first and most recent commits
file_entropy = calculate_normalized_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
edited_file_names,
)
# Convert commit times to UTC datetime objects, then format as date strings.
first_commit_date, most_recent_commit_date = (
datetime.fromtimestamp(commit.commit_time).date()
for commit in (first_commit, most_recent_commit)
)

# Return the data structure
return {
"repo-path": str(repo_path),
"repo-commits": (commits_count := len(commits)),
"repo-file-count": count_files(tree=most_recent_commit.tree),
"repo-commit-time-range": (
first_commit_date.isoformat(),
most_recent_commit_date.isoformat(),
),
"repo-days-of-development": (
days_of_development := (most_recent_commit_date - first_commit_date).days
+ 1
),
"repo-commits-per-day": commits_count / days_of_development,
"repo-includes-readme": file_exists_in_repo(
repo=repo,
expected_file_name="readme",
),
"repo-includes-contributing": file_exists_in_repo(
repo=repo,
expected_file_name="contributing",
),
"repo-includes-code-of-conduct": file_exists_in_repo(
repo=repo,
expected_file_name="code_of_conduct",
),
"repo-includes-license": file_exists_in_repo(
repo=repo,
expected_file_name="license",
),
"repo-is-citable": is_citable(repo=repo),
"repo-default-branch-not-master": default_branch_is_not_master(repo=repo),
"repo-includes-common-docs": includes_common_docs(repo=repo),
"repo-agg-info-entropy": normalized_total_entropy,
"repo-file-info-entropy": file_entropy,
}


def compute_pr_data(repo_path: str, pr_branch: str, main_branch: str) -> Dict[str, Any]:
Expand Down
12 changes: 12 additions & 0 deletions src/almanack/metrics/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ metrics:
result-type: "tuple"
description: >-
Starting commit and most recent commit for the repository.
- name: "repo-days-of-development"
id: "SGA-META-0005"
result-type: "int"
description: >-
Integer representing the number of days of development
between most recent commit and first commit.
- name: "repo-commits-per-day"
id: "SGA-META-0006"
result-type: "float"
description: >-
Floating point number which represents the number of commits
per day (using days of development).
- name: "repo-includes-readme"
id: "SGA-GL-0001"
result-type: "bool"
Expand Down
16 changes: 10 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,16 @@ def community_health_repository_path(tmp_path_factory):

yield repo_setup(
repo_path=pathlib.Path(temp_dir),
files={
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
},
files=[
{
"files": {
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
}
}
],
)


Expand Down
Loading