From 2685c7dce42d36bd366916a3580e4f02ed21f15c Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Wed, 13 Nov 2024 16:55:24 -0700 Subject: [PATCH] Add metric for checking common documentation paths (#164) * add docs metric * add docs readme link * update test case for docs * add nested dir tests and capability * update docstrings * Update git.py Co-Authored-By: Faisal Alquaddoomi * move to read_file and find_file Co-Authored-By: Faisal Alquaddoomi * revert to old docstring for test function * Update tests/metrics/test_data.py Co-authored-by: Faisal Alquaddoomi * linting --------- Co-authored-by: Faisal Alquaddoomi Co-authored-by: Faisal Alquaddoomi --- docs/readme.md | 3 + src/almanack/git.py | 106 +++++++++++++----- src/almanack/metrics/data.py | 46 +++++++- src/almanack/metrics/metrics.yml | 7 ++ tests/data/almanack/repo_setup/create_repo.py | 21 ++-- tests/metrics/test_data.py | 101 ++++++++++++++--- tests/test_git.py | 44 +++++--- 7 files changed, 261 insertions(+), 67 deletions(-) create mode 100644 docs/readme.md diff --git a/docs/readme.md b/docs/readme.md new file mode 100644 index 00000000..5d53da3f --- /dev/null +++ b/docs/readme.md @@ -0,0 +1,3 @@ +# Documentation + +Please find our documentation within the [`src/book`](https://github.com/software-gardening/almanack/tree/main/src/book) directory for the source or via [https://software-gardening.github.io/almanack](https://software-gardening.github.io/almanack). diff --git a/src/almanack/git.py b/src/almanack/git.py index 76db560e..72c916dc 100644 --- a/src/almanack/git.py +++ b/src/almanack/git.py @@ -178,38 +178,90 @@ def detect_encoding(blob_data: bytes) -> str: raise ValueError("Encoding could not be detected.") -def find_and_read_file(repo: pygit2.Repository, filename: str) -> Optional[str]: +def find_file( + repo: pygit2.Repository, filepath: str, case_insensitive: bool = False +) -> Optional[pygit2.Object]: """ - Find and read the content of a file in the repository that matches the filename pattern.
+ Locate a file in the repository by its path. Args: - repo (str): The path to the repository. - filename (str): The pattern to match against filenames. + repo (pygit2.Repository): + The repository object. + filepath (str): + The path to the file within the repository. + case_insensitive (bool): + If True, perform case-insensitive comparison. Returns: - Optional[str]: The content of the found file, or None if no matching files are found. + Optional[pygit2.Object]: + The entry of the found file, + or None if no matching file is found. """ - - # Get the tree associated with the latest commit tree = repo.head.peel().tree + found_entry = None + + if not case_insensitive: + try: + found_entry = tree[filepath] + except KeyError: + return None + else: + path_parts = filepath.lower().split("/") + for i, part in enumerate(path_parts): + try: + entry = next(e for e in tree if e.name.lower() == part) + except StopIteration: + return None + + if entry.type == pygit2.GIT_OBJECT_TREE: + tree = repo[entry.id] + elif entry.type == pygit2.GIT_OBJECT_BLOB: + if i == len(path_parts) - 1: + found_entry = entry + break + else: + return None + else: + return None + + return found_entry + + +def read_file( + repo: pygit2.Repository, + entry: Optional[pygit2.Object] = None, + filepath: Optional[str] = None, + case_insensitive: bool = False, +) -> Optional[str]: + """ + Read the content of a file from the repository. 
- # find the first occurrence of a matching file - found_file: Optional[pygit2.Blob] = next( - ( - entry - for entry in tree - if entry.type == pygit2.GIT_OBJECT_BLOB - and filename.lower() == entry.name.lower() - ), - None, - ) - - # if we have none, return it early to avoid trying to read nothing - if found_file is None: - return found_file - - # Read the content of the first found blob - blob_data: bytes = found_file.data - - # Decode and return content as a string - return blob_data.decode(detect_encoding(blob_data)) + Args: + repo (pygit2.Repository): + The repository object. + entry (Optional[pygit2.Object]): + The entry of the file to read. If not provided, filepath must be specified. + filepath (Optional[str]): + The path to the file within the repository. Used if entry is not provided. + case_insensitive (bool): + If True, perform case-insensitive comparison when using filepath. + + Returns: + Optional[str]: + The content of the file as a string, + or None if the file is not found or reading fails. 
+ """ + if entry is None: + if filepath is None: + raise ValueError("Either entry or filepath must be provided.") + entry = find_file(repo, filepath, case_insensitive) + if entry is None: + return None + + try: + blob = repo[entry.id] + blob_data: bytes = blob.data + decoded_data = blob_data.decode(detect_encoding(blob_data)) + return decoded_data + except (AttributeError, UnicodeDecodeError): + return None diff --git a/src/almanack/metrics/data.py b/src/almanack/metrics/data.py index 98fd09e4..f62bf332 100644 --- a/src/almanack/metrics/data.py +++ b/src/almanack/metrics/data.py @@ -11,7 +11,7 @@ import pygit2 import yaml -from ..git import clone_repository, find_and_read_file, get_commits, get_edited_files +from ..git import clone_repository, find_file, get_commits, get_edited_files, read_file from .entropy.calculate_entropy import ( calculate_aggregate_entropy, calculate_normalized_entropy, @@ -141,7 +141,9 @@ def is_citable(repo: pygit2.Repository) -> bool: # Look for a README.md file and read its content if ( - file_content := find_and_read_file(repo=repo, filename="readme.md") + file_content := read_file( + repo=repo, filepath="readme.md", case_insensitive=True + ) ) is not None: # Check for an H2 heading indicating a citation section if any( @@ -193,6 +195,45 @@ def default_branch_is_not_master(repo: pygit2.Repository) -> bool: return repo.head.shorthand != "master" +def includes_common_docs(repo: pygit2.Repository) -> bool: + """ + Check whether the repo includes common documentation files and directories + associated with building docsites. + + Args: + repo (pygit2.Repository): + The repository object. + + Returns: + bool: + True if any common documentation files + are found, False otherwise. 
+ """ + # List of common documentation file paths to check for + common_docs_paths = [ + "docs/mkdocs.yml", + "docs/conf.py", + "docs/index.md", + "docs/index.rst", + "docs/index.html", + "docs/readme.md", + "docs/source/readme.md", + "docs/source/index.rst", + "docs/source/index.md", + "docs/src/readme.md", + "docs/src/index.rst", + "docs/src/index.md", + ] + + # Check each documentation path using the find_file function + for doc_path in common_docs_paths: + if find_file(repo=repo, filepath=doc_path) is not None: + return True # Return True as soon as we find any of the files + + # otherwise return false as we didn't find documentation + return False + + def compute_repo_data(repo_path: str) -> None: """ Computes comprehensive data for a GitHub repository. @@ -263,6 +304,7 @@ def compute_repo_data(repo_path: str) -> None: ), "repo-is-citable": is_citable(repo=repo), "repo-default-branch-not-master": default_branch_is_not_master(repo=repo), + "repo-includes-common-docs": includes_common_docs(repo=repo), "repo-agg-info-entropy": normalized_total_entropy, "repo-file-info-entropy": file_entropy, } diff --git a/src/almanack/metrics/metrics.yml b/src/almanack/metrics/metrics.yml index 63df079b..51e54d40 100644 --- a/src/almanack/metrics/metrics.yml +++ b/src/almanack/metrics/metrics.yml @@ -56,6 +56,13 @@ metrics: description: >- Boolean value indicating that the repo uses a default branch name besides 'master'. + - name: "repo-includes-common-docs" + id: "SGA-GL-0007" + result-type: "bool" + description: >- + Boolean value indicating whether the repo includes + common documentation directory and files associated + with building docsites. 
- name: "repo-agg-info-entropy" id: "SGA-VS-0001" result-type: "float" diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py index b652abd4..ae7845b9 100644 --- a/tests/data/almanack/repo_setup/create_repo.py +++ b/tests/data/almanack/repo_setup/create_repo.py @@ -160,28 +160,30 @@ def repo_setup( ) -> pygit2.Repository: """ Set up a temporary repository with specified files. - Args: - tmp_path (Path): - The temporary directory where the repo will be created. + repo_path (Path): + The directory where the repo will be created. files (dict): A dictionary where keys are filenames and values are their content. branch_name (str): A string with the name of the branch which will be used for committing changes. Defaults to "main". - Returns: pygit2.Repository: The initialized repository with files. """ - # Create a new repository in the temporary path + # Create a new repository in the specified path repo = pygit2.init_repository(repo_path, bare=False) # Set user.name and user.email in the config set_repo_user_config(repo) - # Create files in the repository - for filename, content in files.items(): - (repo_path / filename).write_text(content) + # Create nested files in the repository + for file_path, content in files.items(): + full_path = repo_path / file_path # Construct full path + full_path.parent.mkdir( + parents=True, exist_ok=True + ) # Create any parent directories + full_path.write_text(content) # Write the file content # Stage and commit the files index = repo.index @@ -191,6 +193,7 @@ def repo_setup( author = repo.default_signature tree = repo.index.write_tree() + # Commit the files repo.create_commit( f"refs/heads/{branch_name}", author, @@ -200,7 +203,7 @@ def repo_setup( [], ) - # Set the head to the main branch + # Set the HEAD to point to the new branch repo.set_head(f"refs/heads/{branch_name}") return repo diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index aa80a360..57e69011 
100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -17,6 +17,7 @@ default_branch_is_not_master, file_exists_in_repo, get_table, + includes_common_docs, is_citable, ) from tests.data.almanack.repo_setup.create_repo import repo_setup @@ -34,21 +35,15 @@ def test_generate_repo_data(entropy_repository_paths: dict[str, pathlib.Path]) - assert data is not None assert isinstance(data, dict) - # Check for expected keys - expected_keys = [ - "repo-path", - "repo-commits", - "repo-file-count", - "repo-commit-time-range", - "repo-includes-readme", - "repo-includes-contributing", - "repo-includes-code-of-conduct", - "repo-includes-license", - "repo-is-citable", - "repo-agg-info-entropy", - "repo-file-info-entropy", - ] - assert all(key in data for key in expected_keys) + # open the metrics table + with open(METRICS_TABLE, "r") as f: + metrics_table = yaml.safe_load(f) + + # check that all keys exist in the output from metrics + # table to received dict + assert sorted(data.keys()) == sorted( + [metric["name"] for metric in metrics_table["metrics"]] + ) # Check that repo_path in the output is the same as the input assert data["repo-path"] == str(repo_path) @@ -306,3 +301,79 @@ def test_default_branch_is_not_master(tmp_path): ) assert not default_branch_is_not_master(repo) + + +@pytest.mark.parametrize( + "files, expected_result", + [ + # Scenario 1: `docs` directory with common documentation files + ( + { + "docs/mkdocs.yml": "site_name: Test Docs", + "docs/index.md": "# Welcome to the documentation", + "README.md": "# Project Overview", + }, + True, + ), + # Scenario 2: `docs` directory without common documentation files + ( + { + "docs/random_file.txt": "This is just a random file", + "README.md": "# Project Overview", + }, + False, + ), + # Scenario 3: No `docs` directory + ( + { + "README.md": "# Project Overview", + "src/main.py": "# Main script", + }, + False, + ), + # Scenario 4: `docs` directory with misleading names + ( + { + "docs/mkdoc.yml": 
"Not a valid mkdocs file", + "docs/INDEX.md": "# Not a documentation index", + }, + False, + ), + # Scenario 5: `docs` directory with sphinx-like structure + ( + { + "docs/source/index.rst": "An rst index", + }, + True, + ), + # Scenario 6: `docs` directory with sphinx-like structure + ( + { + "docs/source/index.md": "An md index", + }, + True, + ), + # Scenario 7: `docs` directory with a readme under source dir + ( + { + "docs/source/readme.md": "A readme for nested docs", + }, + True, + ), + # test the almanack itself as a special case + (None, True), + ], +) +def test_includes_common_docs(tmp_path, files, expected_result): + """ + Tests includes_common_docs + """ + if files is not None: + repo = repo_setup(repo_path=tmp_path, files=files) + else: + # test the almanack itself + repo_path = pathlib.Path(".").resolve() + repo = pygit2.Repository(str(repo_path)) + + # Assert that the function returns the expected result + assert includes_common_docs(repo) == expected_result diff --git a/tests/test_git.py b/tests/test_git.py index 654cb7a5..adb6fd61 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -11,11 +11,12 @@ from almanack.git import ( clone_repository, detect_encoding, - find_and_read_file, + find_file, get_commits, get_edited_files, get_loc_changed, get_most_recent_commits, + read_file, ) @@ -129,21 +130,36 @@ def test_detect_encoding(byte_data, expected_encoding, should_raise): @pytest.mark.parametrize( "filename, expected_content", [ - ("README.md", "## Citation"), # Exact match for file1.txt - ("readme", None), # Partial match, should return content of file1.txt - ("nonexistent.txt", None), # Non-existent file + ("README.md", "## Citation"), + ("readme", None), + ("nonexistent.txt", None), ], ) -def test_find_and_read_file(repo_with_citation_in_readme, filename, expected_content): - """Test finding and reading files in the repository with various filename patterns.""" - - # Call the function under test - result =
find_and_read_file(repo_with_citation_in_readme, filename) +def test_find_file_and_read_file( + repo_with_citation_in_readme, filename, expected_content +): + """ + Test finding and reading files in the repository with various filename patterns. + """ - # Assert the result based on the expected content + find_file_result = find_file(repo=repo_with_citation_in_readme, filepath=filename) + # test for none returns if expected_content is None: - assert result is None # Expecting None for non-existent files - else: + assert find_file_result is expected_content assert ( - result == expected_content - ) # Expecting the actual content for found files + read_file(repo=repo_with_citation_in_readme, filepath=filename) + is expected_content + ) + + # test for content + else: + assert isinstance(find_file_result, pygit2.Object) + read_file_result_filepath = read_file( + repo=repo_with_citation_in_readme, filepath=filename + ) + read_file_result_pygit_obj = read_file( + repo=repo_with_citation_in_readme, entry=find_file_result + ) + + assert read_file_result_filepath == expected_content + assert read_file_result_filepath == read_file_result_pygit_obj