From cf4b269660b351e09f090b9e2062f7e04d523f07 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 12 Nov 2024 14:50:32 -0700 Subject: [PATCH 1/8] add and update commit frequency metrics --- pyproject.toml | 1 - src/almanack/git.py | 5 +- src/almanack/metrics/data.py | 166 +++++++++++------- src/almanack/metrics/metrics.yml | 12 ++ tests/conftest.py | 14 +- tests/data/almanack/repo_setup/create_repo.py | 110 ++++++++---- tests/metrics/test_community_health.py | 3 - tests/metrics/test_data.py | 107 ++++++++++- 8 files changed, 300 insertions(+), 118 deletions(-) delete mode 100644 tests/metrics/test_community_health.py diff --git a/pyproject.toml b/pyproject.toml index deda89f7..27924b39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,6 @@ root = "." [tool.ruff] target-version = "py311" fix = true - lint.select = [ # mccabe "C90", diff --git a/src/almanack/git.py b/src/almanack/git.py index 76db560e..6a17a00a 100644 --- a/src/almanack/git.py +++ b/src/almanack/git.py @@ -42,9 +42,8 @@ def get_commits(repo: pygit2.Repository) -> List[pygit2.Commit]: # Get the latest commit (HEAD) from the repository head = repo.revparse_single("HEAD") # Create a walker to iterate over commits starting from the HEAD - walker = repo.walk( - head.id, pygit2.enums.SortMode.NONE - ) # SortMode.NONE traverses commits in natural order; no sorting applied. + # sorting by time. + walker = repo.walk(head.id, pygit2.GIT_SORT_TIME) # Collect all commits from the walker into a list commits = list(walker) return commits diff --git a/src/almanack/metrics/data.py b/src/almanack/metrics/data.py index 29bd17df..1d28a20c 100644 --- a/src/almanack/metrics/data.py +++ b/src/almanack/metrics/data.py @@ -193,83 +193,121 @@ def default_branch_is_not_master(repo: pygit2.Repository) -> bool: return repo.head.shorthand != "master" -def compute_repo_data(repo_path: str) -> None: +def days_of_development(repo: pygit2.Repository) -> float: """ - Computes comprehensive data for a GitHub repository. + Args: - repo_path (str): The local path to the Git repository. + repo (pygit2.Repository): Path to the git repository. Returns: - dict: A dictionary containing data key-pairs. + float: The average number of commits per day over the period of time. """ try: - # Convert repo_path to an absolute path and initialize the repository - repo_path = pathlib.Path(repo_path).resolve() - repo = pygit2.Repository(str(repo_path)) + # Try to get the HEAD commit. If it raises an error, there are no commits. + repo.revparse_single("HEAD") + except KeyError: + # If HEAD doesn't exist (repo is empty), return 0 commits. + return 0 + + # Traverse the commit history and collect commit dates + commit_dates = [ + datetime.fromtimestamp(commit.commit_time).date() + for commit in repo.walk(repo.head.target, pygit2.GIT_SORT_TIME) + ] - # Retrieve the list of commits from the repository - commits = get_commits(repo) - most_recent_commit = commits[0] - first_commit = commits[-1] + # If no commits, return 0 + if not commit_dates: + return 0 - # Get a list of files that have been edited between the first and most recent commit - file_names = get_edited_files(repo, first_commit, most_recent_commit) + # Calculate the number of days between the first and last commit + # +1 to include the first day + total_days = (max(commit_dates) - min(commit_dates)).days + 1 - # Calculate the normalized total entropy for the repository - normalized_total_entropy = calculate_aggregate_entropy( - repo_path, - str(first_commit.id), - str(most_recent_commit.id), - file_names, - ) + # Return the average commits per day + return total_days - # Calculate the normalized entropy for the changes between the first and most recent commits - file_entropy = calculate_normalized_entropy( - repo_path, - str(first_commit.id), - str(most_recent_commit.id), - file_names, - ) - # Convert commit times to UTC datetime objects, then format as date strings. - first_commit_date, most_recent_commit_date = ( - datetime.fromtimestamp(commit.commit_time, tz=timezone.utc) - .date() - .isoformat() - for commit in (first_commit, most_recent_commit) - ) - # Return the data structure - return { - "repo-path": str(repo_path), - "repo-commits": len(commits), - "repo-file-count": len(file_names), - "repo-commit-time-range": (first_commit_date, most_recent_commit_date), - "repo-includes-readme": file_exists_in_repo( - repo=repo, - expected_file_name="readme", - ), - "repo-includes-contributing": file_exists_in_repo( - repo=repo, - expected_file_name="contributing", - ), - "repo-includes-code-of-conduct": file_exists_in_repo( - repo=repo, - expected_file_name="code_of_conduct", - ), - "repo-includes-license": file_exists_in_repo( - repo=repo, - expected_file_name="license", - ), - "repo-is-citable": is_citable(repo=repo), - "repo-default-branch-not-master": default_branch_is_not_master(repo=repo), - "repo-agg-info-entropy": normalized_total_entropy, - "repo-file-info-entropy": file_entropy, - } +def compute_repo_data(repo_path: str) -> None: + """ + Computes comprehensive data for a GitHub repository. - except Exception as e: - # If processing fails, return an error dictionary - return {"repo_path": str(repo_path), "error": str(e)} + Args: + repo_path (str): The local path to the Git repository. + + Returns: + dict: A dictionary containing data key-pairs. + """ + # Convert repo_path to an absolute path and initialize the repository + repo_path = pathlib.Path(repo_path).resolve() + repo = pygit2.Repository(str(repo_path)) + + # Retrieve the list of commits from the repository + commits = get_commits(repo) + most_recent_commit = commits[0] + first_commit = commits[-1] + + # Get a list of files that have been edited between the first and most recent commit + edited_file_names = get_edited_files(repo, first_commit, most_recent_commit) + + # Calculate the normalized total entropy for the repository + normalized_total_entropy = calculate_aggregate_entropy( + repo_path, + str(first_commit.id), + str(most_recent_commit.id), + edited_file_names, + ) + + # Calculate the normalized entropy for the changes between the first and most recent commits + file_entropy = calculate_normalized_entropy( + repo_path, + str(first_commit.id), + str(most_recent_commit.id), + edited_file_names, + ) + # Convert commit times to UTC datetime objects, then format as date strings. + first_commit_date, most_recent_commit_date = ( + datetime.fromtimestamp(commit.commit_time).date() + for commit in (first_commit, most_recent_commit) + ) + + # Return the data structure + return { + "repo-path": str(repo_path), + "repo-commits": (commits_count := len(commits)), + "repo-file-count": sum( + 1 for entry in most_recent_commit.tree if isinstance(entry, pygit2.Blob) + ), + "repo-commit-time-range": ( + first_commit_date.isoformat(), + most_recent_commit_date.isoformat(), + ), + "repo-days-of-development": ( + days_of_development := (most_recent_commit_date - first_commit_date).days + + 1 + ), + "repo-commits-per-day": commits_count / days_of_development, + "repo-includes-readme": file_exists_in_repo( + repo=repo, + expected_file_name="readme", + ), + "repo-includes-contributing": file_exists_in_repo( + repo=repo, + expected_file_name="contributing", + ), + "repo-includes-code-of-conduct": file_exists_in_repo( + repo=repo, + expected_file_name="code_of_conduct", + ), + "repo-includes-license": file_exists_in_repo( + repo=repo, + expected_file_name="license", + ), + "repo-is-citable": is_citable(repo=repo), + "repo-default-branch-not-master": default_branch_is_not_master(repo=repo), + "repo-agg-info-entropy": normalized_total_entropy, + "repo-file-info-entropy": file_entropy, + } def compute_pr_data(repo_path: str, pr_branch: str, main_branch: str) -> Dict[str, Any]: diff --git a/src/almanack/metrics/metrics.yml b/src/almanack/metrics/metrics.yml index 63df079b..356642e3 100644 --- a/src/almanack/metrics/metrics.yml +++ b/src/almanack/metrics/metrics.yml @@ -20,6 +20,18 @@ metrics: result-type: "tuple" description: >- Starting commit and most recent commit for the repository. + - name: "repo-days-of-development" + id: "SGA-META-0005" + result-type: "int" + description: >- + Integer representing the number of days of development + between most recent commit and first commit. + - name: "repo-commits-per-day" + id: "SGA-META-0006" + result-type: "float" + description: >- + Floating point number which represents the number of commits + per day (using days of development). - name: "repo-includes-readme" id: "SGA-GL-0001" result-type: "bool" diff --git a/tests/conftest.py b/tests/conftest.py index 5989f2d2..399a593a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -118,12 +118,14 @@ def community_health_repository_path(tmp_path_factory): yield repo_setup( repo_path=pathlib.Path(temp_dir), - files={ - "README.md": "# This is an example readme\n\nWelcome to our repo!", - "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md", - "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md", - "LICENSE.txt": "This is an example LICENSE file.", - }, + files=[ + { + "README.md": "# This is an example readme\n\nWelcome to our repo!", + "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md", + "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md", + "LICENSE.txt": "This is an example LICENSE file.", + } + ], ) diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py index b652abd4..2a06bf19 100644 --- a/tests/data/almanack/repo_setup/create_repo.py +++ b/tests/data/almanack/repo_setup/create_repo.py @@ -4,6 +4,8 @@ """ import pathlib +from datetime import datetime +from typing import Optional import pygit2 @@ -156,51 +158,91 @@ def create_entropy_repositories(base_path: pathlib.Path) -> None: def repo_setup( - repo_path: pathlib.Path, files: dict, branch_name: str = "main" + repo_path: pathlib.Path, + files: list[dict], + branch_name: str = "main", + dates: Optional[list[datetime]] = None, ) -> pygit2.Repository: """ - Set up a temporary repository with specified files. + Set up a temporary repository with specified files and commit dates. Args: - tmp_path (Path): - The temporary directory where the repo will be created. - files (dict): - A dictionary where keys are filenames and values are their content. - branch_name (str): - A string with the name of the branch which will be used for - committing changes. Defaults to "main". + repo_path (Path): The temporary directory where the repo will be created. + files (list[dict]): A list of dictionaries where each dictionary represents a commit + and contains filenames as keys and file content as values. + branch_name (str): The name of the branch to use for commits. Defaults to "main". + dates (list[datetime], optional): A list of commit dates corresponding to each commit. + If None, all commits will use the current date. Returns: - pygit2.Repository: The initialized repository with files. + pygit2.Repository: The initialized repository with the specified commits. """ - # Create a new repository in the temporary path + # Initialize the repository repo = pygit2.init_repository(repo_path, bare=False) # Set user.name and user.email in the config set_repo_user_config(repo) - # Create files in the repository - for filename, content in files.items(): - (repo_path / filename).write_text(content) - - # Stage and commit the files - index = repo.index - index.add_all() - index.write() - - author = repo.default_signature - tree = repo.index.write_tree() - - repo.create_commit( - f"refs/heads/{branch_name}", - author, - author, - "Initial commit with setup files", - tree, - [], - ) - - # Set the head to the main branch - repo.set_head(f"refs/heads/{branch_name}") + # Use current date if no specific dates are provided + if dates is None: + dates = [datetime.now()] * len(files) + + # Ensure dates match the number of commits + assert len(dates) == len( + files + ), "Length of dates must match the number of commit dictionaries in files" + + branch_ref = f"refs/heads/{branch_name}" + parent_commit = None + + # Loop through each set of files and commit them + for i, (commit_files, commit_date) in enumerate(zip(files, dates)): + + # Create or update each file in the current commit + for filename, content in commit_files.items(): + file_path = repo_path / filename + file_path.write_text(content) + + # Stage all changes in the index + index = repo.index + index.add_all() + index.write() + + # Set the author and committer signatures with the specific commit date + author = pygit2.Signature( + repo.default_signature.name, + repo.default_signature.email, + int(commit_date.timestamp()), + ) + committer = author # Assuming the committer is the same as the author + + # Write the index to a tree + tree = index.write_tree() + + # Create the commit + commit_message = f"Commit #{i + 1} with files: {', '.join(commit_files.keys())}" + commit_id = repo.create_commit( + ( + branch_ref if i == 0 else None + ), # Set branch reference only for the first commit + author, + committer, + commit_message, + tree, + ( + [parent_commit.id] if parent_commit else [] + ), # Use the .id attribute to get the commit ID + ) + + # Update the parent_commit to the latest commit for chaining + parent_commit = repo.get( + commit_id + ) # Explicitly get the Commit object by its ID + + # Set the HEAD to the main branch after all commits + repo.set_head(branch_ref) + + # Ensure the HEAD is pointing to the last commit + repo.head.set_target(parent_commit.id) return repo diff --git a/tests/metrics/test_community_health.py b/tests/metrics/test_community_health.py deleted file mode 100644 index 928b94df..00000000 --- a/tests/metrics/test_community_health.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Tests various community health metric functionality. -""" diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index aa80a360..af34d903 100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -3,7 +3,8 @@ """ import pathlib -from typing import List +from datetime import datetime, timedelta +from typing import Dict, List, Optional import jsonschema import pandas as pd @@ -218,7 +219,7 @@ def test_is_citable(tmp_path, files, expected): """ if files is not None: - repo = repo_setup(repo_path=tmp_path, files=files) + repo = repo_setup(repo_path=tmp_path, files=[files]) else: # test the almanack itself repo_path = pathlib.Path(".").resolve() @@ -242,21 +243,21 @@ def test_default_branch_is_not_master(tmp_path): # test with a master branch repo = repo_setup( - repo_path=example1, files={"example.txt": "example"}, branch_name="master" + repo_path=example1, files=[{"example.txt": "example"}], branch_name="master" ) assert not default_branch_is_not_master(repo) # test with a main branch repo = repo_setup( - repo_path=example2, files={"example.txt": "example"}, branch_name="main" + repo_path=example2, files=[{"example.txt": "example"}], branch_name="main" ) assert default_branch_is_not_master(repo) # test with a simulated remote head pointed at remote master repo = repo_setup( - repo_path=example3, files={"example.txt": "example"}, branch_name="main" + repo_path=example3, files=[{"example.txt": "example"}], branch_name="main" ) # simulate having a remote head pointed at a branch named master @@ -275,7 +276,7 @@ def test_default_branch_is_not_master(tmp_path): # test with a simulated remote head pointed at remote main repo = repo_setup( - repo_path=example4, files={"example.txt": "example"}, branch_name="main" + repo_path=example4, files=[{"example.txt": "example"}], branch_name="main" ) # simulate having a remote head pointed at a branch named master @@ -294,7 +295,7 @@ def test_default_branch_is_not_master(tmp_path): # test with a simulated remote head pointed at remote main but with local branch master repo = repo_setup( - repo_path=example5, files={"example.txt": "example"}, branch_name="master" + repo_path=example5, files=[{"example.txt": "example"}], branch_name="master" ) # simulate having a remote head pointed at a branch named master @@ -306,3 +307,95 @@ def test_default_branch_is_not_master(tmp_path): ) assert not default_branch_is_not_master(repo) + + +@pytest.mark.parametrize( + "files, dates, expected_commits, expected_file_count, expected_days, expected_commits_per_day", + [ + # Single commit on a single day with one file + ([{"file1.txt": "content"}], None, 1, 1, 1, 1.0), + # Two commits on the same day with two files + ([{"file1.txt": "content"}, {"file2.txt": "content"}], None, 2, 2, 1, 2.0), + # Multiple commits over multiple days + ( + [ + {"file1.txt": "content"}, + {"file2.txt": "content"}, + {"file3.txt": "content"}, + ], + [ + datetime.now() - timedelta(days=2), + datetime.now() - timedelta(days=1), + datetime.now(), + ], + 3, + 3, + 3, + 1.0, + ), + # Multiple commits on the same day with multiple files + ( + [ + {"file1.txt": "content"}, + {"file2.txt": "new content"}, + {"file3.txt": "another content"}, + ], + [datetime.now()] * 3, + 3, + 3, + 1, + 3.0, + ), + ], +) +# add noqa rule below to avoid warnings about too many parameters +def test_commit_frequency_data( # noqa: PLR0913 + tmp_path: pathlib.Path, + files: List[Dict[str, str]], + dates: Optional[List[datetime]], + expected_commits: int, + expected_file_count: int, + expected_days: int, + expected_commits_per_day: float, +): + """ + Tests to ensure metric keys surrounding commits and commit frequency are + working as expected. + """ + # Setup the repository with the provided file structure and dates + repo_setup(tmp_path, files, dates=dates) + + # Run the function to compute repo data + repo_data = compute_repo_data(str(tmp_path)) + + # Assertions for repo-commits + assert ( + repo_data["repo-commits"] == expected_commits + ), f"Expected {expected_commits} commits, got {repo_data['repo-commits']}" + + # Assertions for repo-file-count + assert ( + repo_data["repo-file-count"] == expected_file_count + ), f"Expected {expected_file_count} files, got {repo_data['repo-file-count']}" + + # Assertions for repo-commit-time-range + if dates: + first_date = dates[0].date().isoformat() + last_date = dates[-1].date().isoformat() + else: + today = datetime.now().date().isoformat() + first_date = last_date = today + assert repo_data["repo-commit-time-range"] == ( + first_date, + last_date, + ), f"Expected commit time range ({first_date}, {last_date}), got {repo_data['repo-commit-time-range']}" + + # Assertions for repo-days-of-development + assert ( + repo_data["repo-days-of-development"] == expected_days + ), f"Expected {expected_days} days of development, got {repo_data['repo-days-of-development']}" + + # Assertions for repo-commits-per-day + assert ( + repo_data["repo-commits-per-day"] == expected_commits_per_day + ), f"Expected {expected_commits_per_day} commits per day, got {repo_data['repo-commits-per-day']}" From a63d4c24f576470033f2b116bf52dc86cae495c0 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 12 Nov 2024 14:56:31 -0700 Subject: [PATCH 2/8] docs formatting --- tests/data/almanack/repo_setup/create_repo.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py index 2a06bf19..c5be542d 100644 --- a/tests/data/almanack/repo_setup/create_repo.py +++ b/tests/data/almanack/repo_setup/create_repo.py @@ -167,15 +167,20 @@ def repo_setup( Set up a temporary repository with specified files and commit dates. Args: - repo_path (Path): The temporary directory where the repo will be created. - files (list[dict]): A list of dictionaries where each dictionary represents a commit - and contains filenames as keys and file content as values. - branch_name (str): The name of the branch to use for commits. Defaults to "main". - dates (list[datetime], optional): A list of commit dates corresponding to each commit. - If None, all commits will use the current date. + repo_path (Path): + The temporary directory where the repo will be created. + files (list[dict]): + A list of dictionaries where each dictionary represents a commit + and contains filenames as keys and file content as values. + branch_name (str): + The name of the branch to use for commits. Defaults to "main". + dates (list[datetime], optional): + A list of commit dates corresponding to each commit. + If None, all commits will use the current date. Returns: - pygit2.Repository: The initialized repository with the specified commits. + pygit2.Repository: + The initialized repository with the specified commits. """ # Initialize the repository repo = pygit2.init_repository(repo_path, bare=False) From 44c7f3bf9622346e5ec9d7a722cf2e75af40eece Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 12 Nov 2024 17:03:02 -0700 Subject: [PATCH 3/8] correct the counting of files --- src/almanack/git.py | 39 ++++++++++---- src/almanack/metrics/data.py | 12 +++-- tests/data/almanack/repo_setup/create_repo.py | 4 +- tests/test_git.py | 54 ++++++++++++++++++- 4 files changed, 94 insertions(+), 15 deletions(-) diff --git a/src/almanack/git.py b/src/almanack/git.py index 6a17a00a..1284a014 100644 --- a/src/almanack/git.py +++ b/src/almanack/git.py @@ -4,7 +4,7 @@ import pathlib import tempfile -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import pygit2 from charset_normalizer import from_bytes @@ -146,14 +146,6 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]: return str(source_commit.id), str(target_commit.id) -""" -Module for handling various tasks with git repo blobs. -""" - - -import pygit2 - - def detect_encoding(blob_data: bytes) -> str: """ Detect the encoding of the given blob data using charset-normalizer. @@ -212,3 +204,32 @@ def find_and_read_file(repo: pygit2.Repository, filename: str) -> Optional[str]: # Decode and return content as a string return blob_data.decode(detect_encoding(blob_data)) + + +def count_files(tree: Union[pygit2.Tree, pygit2.Blob]) -> int: + """ + Counts all files (Blobs) within a Git tree, including files + in subdirectories. + + This function recursively traverses the provided `tree` + object to count each file, represented as a `pygit2.Blob`, + within the tree and any nested subdirectories. + + Args: + tree (Union[pygit2.Tree, pygit2.Blob]): + The Git tree object (of type `pygit2.Tree`) + to traverse and count files. The initial call + should be made with the root tree of a commit. + + Returns: + int: + The total count of files (Blobs) within the tree, + including nested files in subdirectories. + """ + file_count = 0 + for entry in tree: + if isinstance(entry, pygit2.Blob): + file_count += 1 + elif isinstance(entry, pygit2.Tree): + file_count += count_files(entry) # Recurse into subdirectory + return file_count diff --git a/src/almanack/metrics/data.py b/src/almanack/metrics/data.py index 1d28a20c..35e9fb70 100644 --- a/src/almanack/metrics/data.py +++ b/src/almanack/metrics/data.py @@ -11,7 +11,13 @@ import pygit2 import yaml -from ..git import clone_repository, find_and_read_file, get_commits, get_edited_files +from ..git import ( + clone_repository, + count_files, + find_and_read_file, + get_commits, + get_edited_files, +) from .entropy.calculate_entropy import ( calculate_aggregate_entropy, calculate_normalized_entropy, @@ -275,9 +281,7 @@ def compute_repo_data(repo_path: str) -> None: return { "repo-path": str(repo_path), "repo-commits": (commits_count := len(commits)), - "repo-file-count": sum( - 1 for entry in most_recent_commit.tree if isinstance(entry, pygit2.Blob) - ), + "repo-file-count": count_files(tree=most_recent_commit.tree), "repo-commit-time-range": ( first_commit_date.isoformat(), most_recent_commit_date.isoformat(), diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py index c5be542d..26c4eda4 100644 --- a/tests/data/almanack/repo_setup/create_repo.py +++ b/tests/data/almanack/repo_setup/create_repo.py @@ -202,10 +202,12 @@ def repo_setup( # Loop through each set of files and commit them for i, (commit_files, commit_date) in enumerate(zip(files, dates)): - # Create or update each file in the current commit for filename, content in commit_files.items(): file_path = repo_path / filename + file_path.parent.mkdir( + parents=True, exist_ok=True + ) # Ensure parent directories exist file_path.write_text(content) # Stage all changes in the index diff --git a/tests/test_git.py b/tests/test_git.py index 654cb7a5..87174289 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -3,13 +3,14 @@ """ import pathlib -from typing import Any +from typing import Any, Dict, List import pygit2 import pytest from almanack.git import ( clone_repository, + count_files, detect_encoding, find_and_read_file, get_commits, @@ -17,6 +18,7 @@ get_loc_changed, get_most_recent_commits, ) +from tests.data.almanack.repo_setup.create_repo import repo_setup def test_clone_repository(entropy_repository_paths: dict[str, Any]): @@ -147,3 +149,53 @@ def test_find_and_read_file(repo_with_citation_in_readme, filename, expected_con assert ( result == expected_content ) # Expecting the actual content for found files + + +@pytest.mark.parametrize( + "files, expected_count", + [ + # Test case: Single file at root + ([{"file1.txt": "content"}], 1), + # Test case: Multiple files at root + ([{"file1.txt": "content", "file2.txt": "content"}], 2), + # Test case: Files in nested directories + ([{"dir1/file1.txt": "content", "dir1/dir2/file2.txt": "content"}], 2), + # Test case: Empty repository (no files) + ([{}], 0), + # Test case: Mixed root and nested files + ( + [ + { + "file1.txt": "content", + "dir1/file2.txt": "content", + "dir1/dir2/file3.txt": "content", + } + ], + 3, + ), + ], +) +def test_count_files( + files: List[Dict[str, str]], expected_count: int, tmp_path: pathlib.Path +): + """ + Test the count_files function on various repository structures. + + Args: + files (List[Dict[str, str]]): A list of dictionaries where each dictionary represents a commit + and contains filenames as keys and file content as values. + expected_count (int): The expected number of files in the most recent commit tree. + tmp_path (pathlib.Path): Temporary directory path provided by pytest for testing. + """ + # Set up the test repository + repo_path = tmp_path / "test_repo" + repo = repo_setup(repo_path, files=files) + + # Get the most recent commit and its tree + most_recent_commit = next(repo.walk(repo.head.target, pygit2.GIT_SORT_TIME)) + most_recent_tree = most_recent_commit.tree + + # Run the count_files function and assert the file count matches the expected count + assert ( + count_files(most_recent_tree) == expected_count + ), f"Expected {expected_count} files, got {count_files(most_recent_tree)}" From 377290544ddf5eb8990b9e18a0376e7d592bb6ed Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 12 Nov 2024 21:46:45 -0700 Subject: [PATCH 4/8] fix cli test --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index cf06992f..9b64f615 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -28,7 +28,7 @@ def test_cli_almanack(tmp_path): """ # create a repo with a single file and commit - repo = repo_setup(repo_path=tmp_path, files={"example.txt": "example"}) + repo = repo_setup(repo_path=tmp_path, files=[{"example.txt": "example"}]) # gather output and return code from running a CLI command stdout, _, returncode = run_cli_command(command=["almanack", repo.path]) From 8b885d53d58d645e1612a43d237d9c07869af570 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 21 Nov 2024 11:19:11 -0700 Subject: [PATCH 5/8] update count_files Co-Authored-By: Faisal Alquaddoomi --- src/almanack/git.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/almanack/git.py b/src/almanack/git.py index 1284a014..f966aac6 100644 --- a/src/almanack/git.py +++ b/src/almanack/git.py @@ -226,10 +226,9 @@ def count_files(tree: Union[pygit2.Tree, pygit2.Blob]) -> int: The total count of files (Blobs) within the tree, including nested files in subdirectories. """ - file_count = 0 - for entry in tree: - if isinstance(entry, pygit2.Blob): - file_count += 1 - elif isinstance(entry, pygit2.Tree): - file_count += count_files(entry) # Recurse into subdirectory - return file_count + if isinstance(tree, pygit2.Blob): + # Directly return 1 if the input is a Blob + return 1 + elif isinstance(tree, pygit2.Tree): + # Recursively count files for Tree + return sum(count_files(entry) for entry in tree) From f0724fe1d35acbeb0d406dd6216a0fdc64e05655 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 21 Nov 2024 11:44:35 -0700 Subject: [PATCH 6/8] include commit date unified w/ files in test repos Co-Authored-By: Faisal Alquaddoomi --- tests/conftest.py | 10 +- tests/data/almanack/repo_setup/create_repo.py | 30 ++--- tests/metrics/test_data.py | 108 +++++++++++------- tests/test_cli.py | 2 +- tests/test_git.py | 26 +++-- 5 files changed, 103 insertions(+), 73 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 399a593a..666fe808 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,10 +120,12 @@ def community_health_repository_path(tmp_path_factory): repo_path=pathlib.Path(temp_dir), files=[ { - "README.md": "# This is an example readme\n\nWelcome to our repo!", - "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md", - "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md", - "LICENSE.txt": "This is an example LICENSE file.", + "files": { + "README.md": "# This is an example readme\n\nWelcome to our repo!", + "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md", + "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md", + "LICENSE.txt": "This is an example LICENSE file.", + } } ], ) diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py index 26c4eda4..93dee53e 100644 --- a/tests/data/almanack/repo_setup/create_repo.py +++ b/tests/data/almanack/repo_setup/create_repo.py @@ -5,7 +5,6 @@ import pathlib from datetime import datetime -from typing import Optional import pygit2 @@ -161,7 +160,6 @@ def repo_setup( repo_path: pathlib.Path, files: list[dict], branch_name: str = "main", - dates: Optional[list[datetime]] = None, ) -> pygit2.Repository: """ Set up a temporary repository with specified files and commit dates. @@ -170,13 +168,14 @@ def repo_setup( repo_path (Path): The temporary directory where the repo will be created. files (list[dict]): - A list of dictionaries where each dictionary represents a commit - and contains filenames as keys and file content as values. + A list of dictionaries where each dictionary represents a commit. + Each dictionary must have: + - "files": A dictionary of filenames as keys and file content as values. + - "commit-date" (optional): The datetime of the commit. + If "commit-date" is not provided, the current date is used. + branch_name (str): The name of the branch to use for commits. Defaults to "main". - dates (list[datetime], optional): - A list of commit dates corresponding to each commit. - If None, all commits will use the current date. Returns: pygit2.Repository: @@ -188,20 +187,15 @@ def repo_setup( # Set user.name and user.email in the config set_repo_user_config(repo) - # Use current date if no specific dates are provided - if dates is None: - dates = [datetime.now()] * len(files) - - # Ensure dates match the number of commits - assert len(dates) == len( - files - ), "Length of dates must match the number of commit dictionaries in files" - branch_ref = f"refs/heads/{branch_name}" parent_commit = None - # Loop through each set of files and commit them - for i, (commit_files, commit_date) in enumerate(zip(files, dates)): + # Loop through each commit dictionary in `files` + for i, commit_data in enumerate(files): + # Extract commit files and commit date + commit_files = commit_data.get("files", {}) + commit_date = commit_data.get("commit-date", datetime.now()) + # Create or update each file in the current commit for filename, content in commit_files.items(): file_path = repo_path / filename diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index af34d903..3269566d 100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -4,7 +4,7 @@ import pathlib from datetime import datetime, timedelta -from typing import Dict, List, Optional +from typing import Dict, List import jsonschema import pandas as pd @@ -22,6 +22,8 @@ ) from tests.data.almanack.repo_setup.create_repo import repo_setup +DATETIME_NOW = datetime.now() + def test_generate_repo_data(entropy_repository_paths: dict[str, pathlib.Path]) -> None: """ @@ -173,42 +175,44 @@ def test_file_exists_in_repo( "files, expected", [ # Test with CITATION.cff - ({"CITATION.cff": "CITATION content."}, True), + ({"files": {"CITATION.cff": "CITATION content."}}, True), # Test with CITATION.bib - ({"CITATION.bib": "CITATION content."}, True), + ({"files": {"CITATION.bib": "CITATION content."}}, True), # Test citation sections in markdown format ( - {"readme.md": "## Citation\nThis is a citation."}, + {"files": {"readme.md": "## Citation\nThis is a citation."}}, True, ), ( - {"readme.md": "## Citing us\n\nHere's our awesome citation."}, + {"files": {"readme.md": "## Citing us\n\nHere's our awesome citation."}}, True, ), # RST scenarios - ({"README.md": "Citation\n--------"}, True), - ({"README.md": "Citing\n------"}, True), - ({"README.md": "Cite\n----"}, True), - ({"README.md": "How to cite\n-----------"}, True), + ({"files": {"README.md": "Citation\n--------"}}, True), + ({"files": {"README.md": "Citing\n------"}}, True), + ({"files": {"README.md": "Cite\n----"}}, True), + ({"files": {"README.md": "How to cite\n-----------"}}, True), # DOI badge ( { - "README.md": ( - "# Awesome project\n\n" - "[![DOI](https://img.shields.io/badge/DOI-10.48550/arXiv.2311.13417-blue)]" - "(https://doi.org/10.48550/arXiv.2311.13417)" - ), + "files": { + "README.md": ( + "# Awesome project\n\n" + "[![DOI](https://img.shields.io/badge/DOI-10.48550/arXiv.2311.13417-blue)]" + "(https://doi.org/10.48550/arXiv.2311.13417)" + ), + } }, True, ), - ({"README.md": "## How to cite"}, True), + ({"files": {"README.md": "## How to cite"}}, True), # Test with README without citation ( - {"readme.md": "This is a readme."}, + {"files": {"readme.md": "This is a readme."}}, False, ), # Test with no citation files - ({"random.txt": "Some random text."}, False), + ({"files": {"random.txt": "Some random text."}}, False), # test the almanack itseft as a special case (None, True), ], @@ -243,21 +247,27 @@ def test_default_branch_is_not_master(tmp_path): # test with a master branch repo = repo_setup( - repo_path=example1, files=[{"example.txt": "example"}], branch_name="master" + repo_path=example1, + files=[{"files": {"example.txt": "example"}}], + branch_name="master", ) assert not default_branch_is_not_master(repo) # test with a main branch repo = repo_setup( - repo_path=example2, files=[{"example.txt": "example"}], branch_name="main" + repo_path=example2, + files=[{"files": {"example.txt": "example"}}], + branch_name="main", ) assert default_branch_is_not_master(repo) # test with a simulated remote head pointed at remote master repo = repo_setup( - repo_path=example3, files=[{"example.txt": "example"}], branch_name="main" + repo_path=example3, + files=[{"files": {"example.txt": "example"}}], + branch_name="main", ) # simulate having a remote head pointed at a branch named master @@ -276,7 +286,9 @@ def test_default_branch_is_not_master(tmp_path): # test with a simulated remote head pointed at remote main repo = repo_setup( - repo_path=example4, files=[{"example.txt": "example"}], branch_name="main" + repo_path=example4, + files=[{"files": {"example.txt": "example"}}], + branch_name="main", ) # simulate having a remote head pointed at a branch named master @@ -295,7 +307,9 @@ def test_default_branch_is_not_master(tmp_path): # test with a simulated remote head pointed at remote main but with local branch master repo = repo_setup( - repo_path=example5, files=[{"example.txt": "example"}], branch_name="master" + repo_path=example5, + files=[{"files": {"example.txt": "example"}}], + branch_name="master", ) # simulate having a remote head pointed at a branch named master @@ -310,23 +324,30 @@ def test_default_branch_is_not_master(tmp_path): @pytest.mark.parametrize( - "files, dates, expected_commits, expected_file_count, expected_days, expected_commits_per_day", + "files, expected_commits, expected_file_count, expected_days, expected_commits_per_day", [ # Single commit on a single day with one file - ([{"file1.txt": "content"}], None, 1, 1, 1, 1.0), + ([{"files": {"file1.txt": "content"}}], 1, 1, 1, 1.0), # Two commits on the same day with two files - ([{"file1.txt": "content"}, {"file2.txt": "content"}], None, 2, 2, 1, 2.0), + ( + [{"files": {"file1.txt": "content"}}, {"files": {"file2.txt": "content"}}], + 2, + 2, + 1, + 2.0, + ), # Multiple commits over multiple days ( [ - {"file1.txt": "content"}, - {"file2.txt": "content"}, - {"file3.txt": "content"}, - ], - [ - datetime.now() - timedelta(days=2), - datetime.now() - timedelta(days=1), - datetime.now(), + { + "commit-date": DATETIME_NOW - timedelta(days=2), + "files": {"file1.txt": "content"}, + }, + { + "commit-date": DATETIME_NOW - timedelta(days=1), + "files": {"file2.txt": "content"}, + }, + {"commit-date": DATETIME_NOW, "files": {"file3.txt": "content"}}, ], 3, 3, @@ -336,11 +357,13 @@ def test_default_branch_is_not_master(tmp_path): # Multiple commits on the same day with multiple files ( [ - {"file1.txt": "content"}, - {"file2.txt": "new content"}, - {"file3.txt": "another content"}, + {"commit-date": DATETIME_NOW, "files": {"file1.txt": "content"}}, + {"commit-date": DATETIME_NOW, "files": {"file2.txt": "new content"}}, + { + "commit-date": DATETIME_NOW, + "files": {"file3.txt": "another content"}, + }, ], - [datetime.now()] * 3, 3, 3, 1, @@ -352,7 +375,6 @@ def test_default_branch_is_not_master(tmp_path): def test_commit_frequency_data( # noqa: PLR0913 tmp_path: pathlib.Path, files: List[Dict[str, str]], - dates: Optional[List[datetime]], expected_commits: int, expected_file_count: int, expected_days: int, @@ -363,7 +385,7 @@ def test_commit_frequency_data( # noqa: PLR0913 working as expected. """ # Setup the repository with the provided file structure and dates - repo_setup(tmp_path, files, dates=dates) + repo_setup(repo_path=tmp_path, files=files) # Run the function to compute repo data repo_data = compute_repo_data(str(tmp_path)) @@ -379,11 +401,11 @@ def test_commit_frequency_data( # noqa: PLR0913 ), f"Expected {expected_file_count} files, got {repo_data['repo-file-count']}" # Assertions for repo-commit-time-range - if dates: - first_date = dates[0].date().isoformat() - last_date = dates[-1].date().isoformat() + if "commit-date" in files[0].keys(): + first_date = files[0]["commit-date"].date().isoformat() + last_date = files[-1]["commit-date"].date().isoformat() else: - today = datetime.now().date().isoformat() + today = DATETIME_NOW.date().isoformat() first_date = last_date = today assert repo_data["repo-commit-time-range"] == ( first_date, diff --git a/tests/test_cli.py b/tests/test_cli.py index 9b64f615..41404607 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -28,7 +28,7 @@ def test_cli_almanack(tmp_path): """ # create a repo with a single file and commit - repo = repo_setup(repo_path=tmp_path, files=[{"example.txt": "example"}]) + repo = repo_setup(repo_path=tmp_path, files=[{"files": {"example.txt": "example"}}]) # gather output and return code from running a CLI command stdout, _, returncode = run_cli_command(command=["almanack", repo.path]) diff --git a/tests/test_git.py b/tests/test_git.py index 87174289..27d454f2 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -155,20 +155,32 @@ def test_find_and_read_file(repo_with_citation_in_readme, filename, expected_con "files, expected_count", [ # Test case: Single file at root - ([{"file1.txt": "content"}], 1), + ([{"files": {"file1.txt": "content"}}], 1), # Test case: Multiple files at root - ([{"file1.txt": "content", "file2.txt": "content"}], 2), + ([{"files": {"file1.txt": "content", "file2.txt": "content"}}], 2), # Test case: Files in nested directories - ([{"dir1/file1.txt": "content", "dir1/dir2/file2.txt": "content"}], 2), + ( + [ + { + "files": { + "dir1/file1.txt": "content", + "dir1/dir2/file2.txt": "content", + } + } + ], + 2, + ), # Test case: Empty repository (no files) - ([{}], 0), + ([{"files": {}}], 0), # Test case: Mixed root and nested files ( [ { - "file1.txt": "content", - "dir1/file2.txt": "content", - "dir1/dir2/file3.txt": "content", + "files": { + "file1.txt": "content", + "dir1/file2.txt": "content", + "dir1/dir2/file3.txt": "content", + } } ], 3, From 7decad43d37346caa5f5e3db418af9ccf1c5a877 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 21 Nov 2024 11:59:27 -0700 Subject: [PATCH 7/8] fix docs test --- tests/metrics/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index d3c48e84..155d131b 100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -483,7 +483,7 @@ def test_includes_common_docs(tmp_path, files, expected_result): Tests includes_common_docs """ if files is not None: - repo = repo_setup(repo_path=tmp_path, files=files) + repo = repo_setup(repo_path=tmp_path, files=[files]) else: # test the almanack itself repo_path = pathlib.Path(".").resolve() From 2cbee8206157ddbacff423af68fc1cd50fb13a63 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 21 Nov 2024 11:59:43 -0700 Subject: [PATCH 8/8] linting --- src/almanack/git.py | 1 + src/almanack/metrics/data.py | 11 +++++- tests/metrics/test_data.py | 73 ++++++++++++++++++++++-------------- tests/test_git.py | 1 + 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/src/almanack/git.py b/src/almanack/git.py index 98ffb35c..3c39f7b6 100644 --- a/src/almanack/git.py +++ b/src/almanack/git.py @@ -245,6 +245,7 @@ def count_files(tree: Union[pygit2.Tree, pygit2.Blob]) -> int: # Recursively count files for Tree return sum(count_files(entry) for entry in tree) + def read_file( repo: pygit2.Repository, entry: Optional[pygit2.Object] = None, diff --git a/src/almanack/metrics/data.py b/src/almanack/metrics/data.py index ce660571..0986546a 100644 --- a/src/almanack/metrics/data.py +++ b/src/almanack/metrics/data.py @@ -11,7 +11,14 @@ import pygit2 import yaml -from ..git import clone_repository, find_file, get_commits, get_edited_files, read_file, count_files +from ..git import ( + clone_repository, + count_files, + find_file, + get_commits, + get_edited_files, + read_file, +) from .entropy.calculate_entropy import ( calculate_aggregate_entropy, calculate_normalized_entropy, @@ -229,6 +236,7 @@ def days_of_development(repo: pygit2.Repository) -> float: # Return the average commits per day return total_days + def includes_common_docs(repo: pygit2.Repository) -> bool: """ Check whether the repo includes common documentation files and directories @@ -267,6 +275,7 @@ def includes_common_docs(repo: pygit2.Repository) -> bool: # otherwise return false as we didn't find documentation return False + def compute_repo_data(repo_path: str) -> None: """ Computes comprehensive data for a GitHub repository. diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index 155d131b..73738a0e 100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -416,62 +416,77 @@ def test_commit_frequency_data( # noqa: PLR0913 assert ( repo_data["repo-commits-per-day"] == expected_commits_per_day ), f"Expected {expected_commits_per_day} commits per day, got {repo_data['repo-commits-per-day']}" - - -@pytest.mark.parametrize("files, expected_result", + + +@pytest.mark.parametrize( + "files, expected_result", [ # Scenario 1: `docs` directory with common documentation files ( - {"files":{ - "docs/mkdocs.yml": "site_name: Test Docs", - "docs/index.md": "# Welcome to the documentation", - "README.md": "# Project Overview", - }}, + { + "files": { + "docs/mkdocs.yml": "site_name: Test Docs", + "docs/index.md": "# Welcome to the documentation", + "README.md": "# Project Overview", + } + }, True, ), # Scenario 2: `docs` directory without common documentation files ( - {"files":{ - "docs/random_file.txt": "This is just a random file", - "README.md": "# Project Overview", - }}, + { + "files": { + "docs/random_file.txt": "This is just a random file", + "README.md": "# Project Overview", + } + }, False, ), # Scenario 3: No `docs` directory ( - {"files":{ - "README.md": "# Project Overview", - "src/main.py": "# Main script", - }}, + { + "files": { + "README.md": "# Project Overview", + "src/main.py": "# Main script", + } + }, False, ), # Scenario 4: `docs` directory with misleading names ( - {"files":{ - "docs/mkdoc.yml": "Not a valid mkdocs file", - "docs/INDEX.md": "# Not a documentation index", - }}, + { + "files": { + "docs/mkdoc.yml": "Not a valid mkdocs file", + "docs/INDEX.md": "# Not a documentation index", + } + }, False, ), # Scenario 5: `docs` directory with sphinx-like structure ( - {"files":{ - "docs/source/index.rst": "An rst index", - }}, + { + "files": { + "docs/source/index.rst": "An rst index", + } + }, True, ), # Scenario 6: `docs` directory with sphinx-like structure ( - {"files":{ - "docs/source/index.md": "An md index", - }}, + { + "files": { + "docs/source/index.md": "An md index", + } + }, True, ), # Scenario 6: `docs` directory with a readme under source dir ( - {"files":{ - "docs/source/readme.md": "A readme for nested docs", - }}, + { + "files": { + "docs/source/readme.md": "A readme for nested docs", + } + }, True, ), # test the almanack itseft as a special case diff --git a/tests/test_git.py b/tests/test_git.py index 3628b18b..24321062 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -166,6 +166,7 @@ def test_find_file_and_read_file( assert read_file_result_filepath == expected_content assert read_file_result_filepath == read_file_result_pygit_obj + @pytest.mark.parametrize( "files, expected_count", [