Skip to content

Commit

Permalink
Add metric for checking common documentation paths (#164)
Browse files Browse the repository at this point in the history
* add docs metric

* add docs readme link

* update test case for docs

* add nested dir tests and capability

* update docstrings

* Update git.py

Co-Authored-By: Faisal Alquaddoomi <[email protected]>

* move to read_file and find_file

Co-Authored-By: Faisal Alquaddoomi <[email protected]>

* revert to old docstring for test function

* Update tests/metrics/test_data.py

Co-authored-by: Faisal Alquaddoomi <[email protected]>

* linting

---------

Co-authored-by: Faisal Alquaddoomi <[email protected]>
Co-authored-by: Faisal Alquaddoomi <[email protected]>
  • Loading branch information
3 people authored Nov 13, 2024
1 parent 906da2b commit 2685c7d
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 67 deletions.
3 changes: 3 additions & 0 deletions docs/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Documentation

Please find our documentation within the [`src/book`](https://github.com/software-gardening/almanack/tree/main/src/book) directory for the source or via [https://software-gardening.github.io/almanack](https://software-gardening.github.io/almanack).
106 changes: 79 additions & 27 deletions src/almanack/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,38 +178,90 @@ def detect_encoding(blob_data: bytes) -> str:
raise ValueError("Encoding could not be detected.")


def find_and_read_file(repo: pygit2.Repository, filename: str) -> Optional[str]:
def find_file(
    repo: pygit2.Repository, filepath: str, case_insensitive: bool = False
) -> Optional[pygit2.Object]:
    """
    Locate a file in the repository by its path.

    The lookup is performed against the tree of the commit that HEAD
    currently points to.

    Args:
        repo (pygit2.Repository):
            The repository object.
        filepath (str):
            The slash-separated path to the file within the repository.
        case_insensitive (bool):
            If True, every path component is compared case-insensitively.

    Returns:
        Optional[pygit2.Object]:
            The entry of the found file,
            or None if no matching file is found.
    """

    # Get the tree associated with the latest commit
    tree = repo.head.peel().tree

    if not case_insensitive:
        # The tree resolves the full slash-separated path in one lookup.
        # NOTE(review): the result is returned without a type check, so a
        # path naming a directory presumably yields its tree object here,
        # unlike the case-insensitive mode — confirm callers expect this.
        try:
            return tree[filepath]
        except KeyError:
            return None

    return _find_blob_case_insensitive(repo, tree, filepath.lower().split("/"))


def _find_blob_case_insensitive(
    repo: pygit2.Repository, tree: pygit2.Tree, path_parts: list[str]
) -> Optional[pygit2.Object]:
    """
    Walk `tree` looking for a blob whose path matches `path_parts`,
    comparing each component case-insensitively.

    Several entries at one level can share a name when case is ignored
    (e.g. a file `DOCS` next to a directory `docs`), so every matching
    entry is tried in tree order instead of committing to the first one.

    Args:
        repo (pygit2.Repository):
            The repository object, used to load sub-trees by id.
        tree (pygit2.Tree):
            The tree to search at the current level.
        path_parts (list[str]):
            Lower-cased path components still to be matched.

    Returns:
        Optional[pygit2.Object]:
            The first matching blob entry, or None if there is none.
    """
    part, remaining = path_parts[0], path_parts[1:]

    for entry in tree:
        if entry.name.lower() != part:
            continue

        if entry.type == pygit2.GIT_OBJECT_BLOB:
            # a blob only counts when it is the final path component
            if not remaining:
                return entry
        elif entry.type == pygit2.GIT_OBJECT_TREE and remaining:
            # descend into the directory; fall through to the next
            # candidate at this level if nothing is found below it
            found = _find_blob_case_insensitive(repo, repo[entry.id], remaining)
            if found is not None:
                return found

    return None


def read_file(
    repo: pygit2.Repository,
    entry: Optional[pygit2.Object] = None,
    filepath: Optional[str] = None,
    case_insensitive: bool = False,
) -> Optional[str]:
    """
    Read the content of a file from the repository.

    Args:
        repo (pygit2.Repository):
            The repository object.
        entry (Optional[pygit2.Object]):
            The entry of the file to read. If not provided, filepath must be specified.
        filepath (Optional[str]):
            The path to the file within the repository. Used if entry is not provided.
        case_insensitive (bool):
            If True, perform case-insensitive comparison when using filepath.

    Returns:
        Optional[str]:
            The content of the file as a string,
            or None if the file is not found or reading fails.

    Raises:
        ValueError:
            If neither entry nor filepath is provided.
    """
    if entry is None:
        if filepath is None:
            raise ValueError("Either entry or filepath must be provided.")
        entry = find_file(repo, filepath, case_insensitive)
        if entry is None:
            return None

    try:
        blob_data: bytes = repo[entry.id].data
    except AttributeError:
        # the object has no `.data`, i.e. it is not a readable blob
        return None

    try:
        return blob_data.decode(detect_encoding(blob_data))
    except (AttributeError, ValueError, LookupError):
        # ValueError covers both detect_encoding failing to detect an
        # encoding and UnicodeDecodeError (a ValueError subclass);
        # LookupError covers a detected encoding name that Python's codec
        # registry does not know. All of these count as "reading fails".
        return None
46 changes: 44 additions & 2 deletions src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pygit2
import yaml

from ..git import clone_repository, find_and_read_file, get_commits, get_edited_files
from ..git import clone_repository, find_file, get_commits, get_edited_files, read_file
from .entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down Expand Up @@ -141,7 +141,9 @@ def is_citable(repo: pygit2.Repository) -> bool:

# Look for a README.md file and read its content
if (
file_content := find_and_read_file(repo=repo, filename="readme.md")
file_content := read_file(
repo=repo, filepath="readme.md", case_insensitive=True
)
) is not None:
# Check for an H2 heading indicating a citation section
if any(
Expand Down Expand Up @@ -193,6 +195,45 @@ def default_branch_is_not_master(repo: pygit2.Repository) -> bool:
return repo.head.shorthand != "master"


def includes_common_docs(repo: pygit2.Repository) -> bool:
    """
    Determine whether the repo contains at least one of the well-known
    documentation files associated with building docsites.

    Args:
        repo (pygit2.Repository):
            The repository object.

    Returns:
        bool:
            True if any of the common documentation paths
            is found, False otherwise.
    """
    # Candidate documentation paths, probed in this order
    candidate_paths = (
        "docs/mkdocs.yml",
        "docs/conf.py",
        "docs/index.md",
        "docs/index.rst",
        "docs/index.html",
        "docs/readme.md",
        "docs/source/readme.md",
        "docs/source/index.rst",
        "docs/source/index.md",
        "docs/src/readme.md",
        "docs/src/index.rst",
        "docs/src/index.md",
    )

    # any() stops at the first path that find_file resolves to an entry
    return any(
        find_file(repo=repo, filepath=candidate) is not None
        for candidate in candidate_paths
    )


def compute_repo_data(repo_path: str) -> None:
"""
Computes comprehensive data for a GitHub repository.
Expand Down Expand Up @@ -263,6 +304,7 @@ def compute_repo_data(repo_path: str) -> None:
),
"repo-is-citable": is_citable(repo=repo),
"repo-default-branch-not-master": default_branch_is_not_master(repo=repo),
"repo-includes-common-docs": includes_common_docs(repo=repo),
"repo-agg-info-entropy": normalized_total_entropy,
"repo-file-info-entropy": file_entropy,
}
Expand Down
7 changes: 7 additions & 0 deletions src/almanack/metrics/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ metrics:
description: >-
Boolean value indicating that the repo uses a
default branch name besides 'master'.
- name: "repo-includes-common-docs"
id: "SGA-GL-0007"
result-type: "bool"
description: >-
Boolean value indicating whether the repo includes
common documentation directory and files associated
with building docsites.
- name: "repo-agg-info-entropy"
id: "SGA-VS-0001"
result-type: "float"
Expand Down
21 changes: 12 additions & 9 deletions tests/data/almanack/repo_setup/create_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,28 +160,30 @@ def repo_setup(
) -> pygit2.Repository:
"""
Set up a temporary repository with specified files.
Args:
tmp_path (Path):
The temporary directory where the repo will be created.
repo_path (Path):
The directory where the repo will be created.
files (dict):
A dictionary where keys are filenames and values are their content.
branch_name (str):
A string with the name of the branch which will be used for
committing changes. Defaults to "main".
Returns:
pygit2.Repository: The initialized repository with files.
"""
# Create a new repository in the temporary path
# Create a new repository in the specified path
repo = pygit2.init_repository(repo_path, bare=False)

# Set user.name and user.email in the config
set_repo_user_config(repo)

# Create files in the repository
for filename, content in files.items():
(repo_path / filename).write_text(content)
# Create nested files in the repository
for file_path, content in files.items():
full_path = repo_path / file_path # Construct full path
full_path.parent.mkdir(
parents=True, exist_ok=True
) # Create any parent directories
full_path.write_text(content) # Write the file content

# Stage and commit the files
index = repo.index
Expand All @@ -191,6 +193,7 @@ def repo_setup(
author = repo.default_signature
tree = repo.index.write_tree()

# Commit the files
repo.create_commit(
f"refs/heads/{branch_name}",
author,
Expand All @@ -200,7 +203,7 @@ def repo_setup(
[],
)

# Set the head to the main branch
# Set the HEAD to point to the new branch
repo.set_head(f"refs/heads/{branch_name}")

return repo
101 changes: 86 additions & 15 deletions tests/metrics/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
default_branch_is_not_master,
file_exists_in_repo,
get_table,
includes_common_docs,
is_citable,
)
from tests.data.almanack.repo_setup.create_repo import repo_setup
Expand All @@ -34,21 +35,15 @@ def test_generate_repo_data(entropy_repository_paths: dict[str, pathlib.Path]) -
assert data is not None
assert isinstance(data, dict)

# Check for expected keys
expected_keys = [
"repo-path",
"repo-commits",
"repo-file-count",
"repo-commit-time-range",
"repo-includes-readme",
"repo-includes-contributing",
"repo-includes-code-of-conduct",
"repo-includes-license",
"repo-is-citable",
"repo-agg-info-entropy",
"repo-file-info-entropy",
]
assert all(key in data for key in expected_keys)
# open the metrics table
with open(METRICS_TABLE, "r") as f:
metrics_table = yaml.safe_load(f)

# check that all keys exist in the output from metrics
# table to received dict
assert sorted(data.keys()) == sorted(
[metric["name"] for metric in metrics_table["metrics"]]
)

# Check that repo_path in the output is the same as the input
assert data["repo-path"] == str(repo_path)
Expand Down Expand Up @@ -306,3 +301,79 @@ def test_default_branch_is_not_master(tmp_path):
)

assert not default_branch_is_not_master(repo)


@pytest.mark.parametrize(
    "files, expected_result",
    [
        # Scenario 1: `docs` directory with common documentation files
        (
            {
                "docs/mkdocs.yml": "site_name: Test Docs",
                "docs/index.md": "# Welcome to the documentation",
                "README.md": "# Project Overview",
            },
            True,
        ),
        # Scenario 2: `docs` directory without common documentation files
        (
            {
                "docs/random_file.txt": "This is just a random file",
                "README.md": "# Project Overview",
            },
            False,
        ),
        # Scenario 3: No `docs` directory
        (
            {
                "README.md": "# Project Overview",
                "src/main.py": "# Main script",
            },
            False,
        ),
        # Scenario 4: `docs` directory with misleading names
        # (near-miss filename and wrong letter case must not match)
        (
            {
                "docs/mkdoc.yml": "Not a valid mkdocs file",
                "docs/INDEX.md": "# Not a documentation index",
            },
            False,
        ),
        # Scenario 5: `docs` directory with sphinx-like structure (rst index)
        (
            {
                "docs/source/index.rst": "An rst index",
            },
            True,
        ),
        # Scenario 6: `docs` directory with sphinx-like structure (md index)
        (
            {
                "docs/source/index.md": "An md index",
            },
            True,
        ),
        # Scenario 7: `docs` directory with a readme under source dir
        (
            {
                "docs/source/readme.md": "A readme for nested docs",
            },
            True,
        ),
        # Scenario 8: test the almanack itself as a special case
        (None, True),
    ],
)
def test_includes_common_docs(tmp_path, files, expected_result):
    """
    Tests includes_common_docs
    """
    if files is not None:
        repo = repo_setup(repo_path=tmp_path, files=files)
    else:
        # test the almanack itself; discover the checkout from this test
        # file's location so the result does not depend on the directory
        # pytest happens to be invoked from
        git_dir = pygit2.discover_repository(
            str(pathlib.Path(__file__).resolve().parent)
        )
        repo = pygit2.Repository(git_dir)

    # Assert that the function returns the expected result
    assert includes_common_docs(repo) == expected_result
Loading

0 comments on commit 2685c7d

Please sign in to comment.