switch to strict type checking, fix type errors
reinvantveer committed Sep 9, 2024
1 parent f1e15c8 commit 09c1f7a
Showing 14 changed files with 70 additions and 42 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -5,4 +5,4 @@ tests:
 
 check:
 	pipenv run flake8 .
-	pipenv run mypy .
+	pipenv run mypy . --strict
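
For context: --strict is shorthand for a bundle of stricter mypy flags, among them --disallow-untyped-defs and --disallow-any-generics, which is what drives most of the changes below. A minimal sketch, not from this repo, of code that plain "mypy ." accepts but "mypy . --strict" rejects:

def parse(raw):  # --strict: error, function is missing a type annotation
    return raw.split(',')

counts: dict = {}  # --strict: error, missing type parameters for generic type "dict"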
13 changes: 7 additions & 6 deletions analysis/common_crawl.py
@@ -12,9 +12,10 @@
 import numpy as np
 from matplotlib import pyplot as plt
 from numpy.lib.stride_tricks import sliding_window_view
+from ruamel.yaml import CommentedMap
 from sklearn.linear_model import LinearRegression
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 from analysis.shared_parsers import extract_year_ticks
 from models.bass_diffusion import BassDiffusionModel
 
@@ -30,7 +31,7 @@
 ModelStats = List[Dict[str, Union[str, float]]]
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     crawl_cfg = config['data']['common_crawl']
@@ -82,15 +83,15 @@ def parse_csv(stats: List[Dict[str, str]]) -> StatsDictTable:
     return stats_dict
 
 
-def filter_declining(typed_stats: StatsDictTable) -> MimeDict:
+def filter_declining(typed_stats: StatsDictTable) -> dict[str, list[dict[str, int]]]:
     """
     Filters the list of statistics for MIME types that decline over the last year
     :param typed_stats: a list of dictionaries with typed values
     :return: a dictionary of mime types with declining counts, with usage
     """
-    declining_mime_types: dict = {}
+    declining_mime_types: dict[str, list[dict[str, int]]] = {}
 
     # First: "de-normalize" the table into a nested dictionary of mime types with page counts per crawl
     # This is easier to handle: we want to analyse statistics per mime type, over the years
@@ -107,7 +108,7 @@ def filter_declining(typed_stats: StatsDictTable) -> MimeDict:
     )
 
     mime_types = list(declining_mime_types.keys())
-    mime_declines = []
+    mime_declines: list[dict[str, str | int]] = []
 
     for mime_type in mime_types:
         crawl_stats = declining_mime_types[mime_type]
@@ -138,7 +139,7 @@ def filter_declining(typed_stats: StatsDictTable) -> MimeDict:
     return declining_mime_types
 
 
-def analyse(stats: MimeDict, collection_metadata: List[Dict[str, str]], config: Config) -> ModelStats:
+def analyse(stats: MimeDict, collection_metadata: List[Dict[str, str]], config: CommentedMap) -> ModelStats:
     error_stats: ModelStats = []
     # Extract out shorthand for long dict value
     cc_cfg = config['data']['common_crawl']
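
A note on the new annotation style: dict[str, list[dict[str, int]]] uses the built-in generics of PEP 585 and str | int uses the PEP 604 union syntax, which need Python 3.9 and 3.10 respectively wherever annotations are evaluated at runtime. A sketch of the equivalence with the older typing spellings still used elsewhere in this file:

from typing import Dict, List, Union

# Pre-3.9 spelling, as in the Union[str, float] alias above
declines_old: List[Dict[str, Union[str, int]]] = []

# Spelling this commit introduces (3.9+ builtins, 3.10+ unions)
declines_new: list[dict[str, str | int]] = []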
7 changes: 2 additions & 5 deletions analysis/config.py
@@ -5,10 +5,7 @@
 from os.path import expanduser
 from typing import Optional
 
-from ruamel.yaml import YAML
-
-# Type alias for config type
-Config = dict
+from ruamel.yaml import YAML, CommentedMap
 
 
 # Configure logging
@@ -18,7 +15,7 @@
 def load_config(
         path: str = 'config.yaml',
         run_id: Optional[str] = None,
-        artifact_folder: Optional[str] = None) -> Config:
+        artifact_folder: Optional[str] = None) -> CommentedMap:
     """
     Loads 'config.yaml' from the current working directory, or somewhere else if specified
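
With the Config = dict alias gone, the return annotation now names what ruamel.yaml actually produces: its round-trip loader yields a CommentedMap, a dict subclass that preserves comments and key order. A minimal sketch of the relationship, assuming a config.yaml in the working directory:

from ruamel.yaml import YAML, CommentedMap

yaml = YAML()  # default 'rt' (round-trip) mode
with open('config.yaml') as f:
    config: CommentedMap = yaml.load(f)

# A CommentedMap supports plain dict-style access
print(config['data'])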
22 changes: 16 additions & 6 deletions analysis/dans_aggregate.py
@@ -11,15 +11,16 @@
 from typing import Dict
 
 from jsonpath_ng.ext import parse
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 
 # Unordered key/val
 FileTimeStats = Dict[str, Dict[str, int]]
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     """
     Iterates over all pages in the datasets index of the Archaeology Datastation (~120k results in ~12k pages)
@@ -70,7 +71,10 @@ def main(config: Config) -> int:
     return 0
 
 
-def explain_valid_dataset(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
+def explain_valid_dataset(
+        ds_metadata: dict,  # type: ignore[type-arg]
+        dans_cfg: Dict[str, str]
+) -> str:
     """
     Analyses a metadata record from the archaeology datastation REST API to validate it for usage in this analysis
@@ -108,7 +112,10 @@ def explain_valid_dataset(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
     return 'Valid'
 
 
-def extract_content_type_counts(ds_metadata: dict, dans_cfg: dict) -> Dict[str, int]:
+def extract_content_type_counts(
+        ds_metadata: dict,  # type: ignore[type-arg]
+        dans_cfg: CommentedMap
+) -> Dict[str, int]:
     """
     Collects the filenames of the first version of the dataset.
@@ -137,7 +144,10 @@ def extract_content_type_counts(ds_metadata: dict, dans_cfg: dict) -> Dict[str,
     return content_types
 
 
-def extract_year_month(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
+def extract_year_month(
+        ds_metadata: dict,  # type: ignore[type-arg]
+        dans_cfg: dict[str, str]
+) -> str:
     """
     Collects the correct date for the first version files
     It aggregates the file metadata into a counter per file type, per month
@@ -155,7 +165,7 @@ def extract_year_month(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
 
     queried_date = matches[0].value
 
-    return queried_date[:7]
+    return str(queried_date[:7])
 
 
 if __name__ == '__main__':
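
A side note on the # type: ignore[type-arg] markers: --disallow-any-generics, part of --strict, rejects a bare dict annotation, and these REST API payloads have no fixed shape, so the commit suppresses exactly that error code on the offending parameter. A hedged sketch of the two available options (function names here are illustrative, not from the repo):

from typing import Any, Dict

# Option 1, as used in this commit: keep the bare annotation, silence the error
def summarize(payload: dict) -> str:  # type: ignore[type-arg]
    return str(len(payload))

# Option 2: parameterize with Any, which --strict also accepts
def summarize_typed(payload: Dict[str, Any]) -> str:
    return str(len(payload))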
7 changes: 4 additions & 3 deletions analysis/dans_analysis.py
@@ -7,13 +7,14 @@
 
 import numpy as np
 from matplotlib import pyplot as plt
+from ruamel.yaml import CommentedMap
 
-from analysis.config import Config, load_config
+from analysis.config import load_config
 from analysis.shared_parsers import PeriodicFiletypeCount, plot_counts, to_sorted_yearly, SortedFileCount, \
     all_filetype_counts, extract_year_ticks, add_cumulative_counts
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     dans_cfg = config['data']['dans']
@@ -78,7 +79,7 @@ def main(config: Config) -> int:
     return 0
 
 
-def filter_stats(yearly_stats: SortedFileCount, dans_cfg: dict) -> List[str]:
+def filter_stats(yearly_stats: SortedFileCount, dans_cfg: CommentedMap) -> List[str]:
     keep_filetypes: List[str] = []
     for filetype, yearly_counts in yearly_stats.items():
         # We do the exercise below because the mime types included in the "mime_plots" list was decided based on the
15 changes: 10 additions & 5 deletions analysis/dans_scrape.py
@@ -26,13 +26,14 @@
 from typing import List, Dict, Optional
 
 from bs4 import BeautifulSoup
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import Config, load_config
+from analysis.config import load_config
 from analysis.loaders_dumpers import get
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     """
     Iterates over all pages in the datasets index of the Archaeology Datastation (~120k results in ~12k pages)
@@ -89,7 +90,7 @@ def main(config: Config) -> int:
     return 0
 
 
-def dois_from_results(page_num: int, conn: HTTPSConnection, dans_cfg: dict) -> List[str]:
+def dois_from_results(page_num: int, conn: HTTPSConnection, dans_cfg: CommentedMap) -> List[str]:
     """
     Processes a specific results page indicated by `page_num` from the main Archaeology Datastation datasets index
@@ -128,7 +129,11 @@ def extract_dois(res_text: str) -> List[str]:
     return dois
 
 
-def scrape_version_metadata(doi: str, conn: HTTPSConnection, dans_cfg: dict) -> Optional[dict]:
+def scrape_version_metadata(
+        doi: str,
+        conn: HTTPSConnection,
+        dans_cfg: CommentedMap
+) -> Optional[dict[str, list[dict[str, int]]]]:
     """
     Extracts a list of original filenames and a deposit date for a dataset designated by `doi`
@@ -158,7 +163,7 @@ def scrape_version_metadata(doi: str, conn: HTTPSConnection, dans_cfg: dict) ->
     url = root_url + versions_subpath.format(doi=doi)
     versions = json.loads(get(url, conn))
 
-    return versions
+    return dict(versions)
 
 
 if __name__ == '__main__':
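
The dict(versions) wrapper above, like str(queried_date[:7]) earlier, looks redundant at runtime, but json.loads and jsonpath matches are typed as Any, and --strict enables --warn-return-any, which rejects returning Any from a function with a declared return type. Passing the value through the constructor gives mypy a concrete type. A minimal sketch with illustrative names:

import json

def fetch_mapping(raw: str) -> dict[str, int]:
    payload = json.loads(raw)  # inferred as Any
    # 'return payload' would fail --strict: returning Any from a typed function
    return dict(payload)  # the dict() call narrows the result for mypy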
7 changes: 4 additions & 3 deletions analysis/kb_aggregate.py
@@ -9,16 +9,17 @@
 from argparse import ArgumentParser
 from typing import List
 
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     kb_cfg = config['data']['kb']
-    file_temp_stats: dict = {}
+    file_temp_stats: dict[str, dict[str, int]] = {}
     skipped_records = 0
 
     with open(kb_cfg['raw_csv_path'], 'rt') as f:
6 changes: 4 additions & 2 deletions analysis/kb_analysis.py
@@ -8,7 +8,9 @@
 from argparse import ArgumentParser
 from typing import Dict, List, TypedDict
 
-from analysis.config import load_config, Config
+from ruamel.yaml import CommentedMap
+
+from analysis.config import load_config
 from analysis.shared_parsers import next_year_quarter, plot_counts
 
 Filetype = str
@@ -23,7 +25,7 @@ class PeriodCount(TypedDict):
 SortedFileCount = Dict[Filetype, List[PeriodCount]]
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
     kb_cfg = config['data']['kb']
2 changes: 1 addition & 1 deletion analysis/loaders_dumpers.py
@@ -3,7 +3,7 @@
 
 from retry import retry
 
 
-@retry(tries=3, delay=1, backoff=2)
+@retry(tries=3, delay=1, backoff=2)  # type: ignore[misc]
 def get(url: str, conn: HTTPSConnection) -> str:
     """
     Simple helper function to get the text as utf-8 from a url
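
The retry package ships no type information, so under --strict mypy reports that the untyped decorator makes get untyped (error code misc); the ignore is scoped to that one code. If one wanted to avoid the ignore at the call site, a typed wrapper is a common alternative; a sketch under that assumption, with typed_retry as a hypothetical helper not present in this repo:

from typing import Callable, TypeVar, cast

from retry import retry  # type: ignore[import-untyped]

F = TypeVar('F', bound=Callable[..., object])

def typed_retry(tries: int, delay: int, backoff: int) -> Callable[[F], F]:
    # Cast the untyped decorator to one that preserves the wrapped signature
    return cast(Callable[[F], F], retry(tries=tries, delay=delay, backoff=backoff))

@typed_retry(tries=3, delay=1, backoff=2)
def get_with_types(url: str) -> str:
    ...  # keeps its annotations under --strict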
10 changes: 5 additions & 5 deletions analysis/nibg_aggregate.py
@@ -7,18 +7,18 @@
 import logging
 import os.path
 from argparse import ArgumentParser
-from typing import List
 
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     nibg_cfg = config['data']['nibg']
-    file_temp_stats: dict = {}
+    file_temp_stats: dict[str, dict[str, int]] = {}
    skipped_records = 0
 
     with open(nibg_cfg['raw_csv_path'], 'rt') as f:
@@ -54,7 +54,7 @@ def main(config: Config) -> int:
 
     # Prune stats for formats that have at least 10 entries
     formats = list(file_temp_stats.keys())
-    dropped_formats: List[str] = []
+    dropped_formats: list[str] = []
     min_measurements = nibg_cfg['minimum_time_periods']
 
     for extension in formats:
6 changes: 4 additions & 2 deletions analysis/nibg_analysis.py
@@ -4,11 +4,13 @@
 import os
 from argparse import ArgumentParser
 
-from analysis.config import load_config, Config
+from ruamel.yaml import CommentedMap
+
+from analysis.config import load_config
 from analysis.shared_parsers import to_pruned_sorted_quarterly, plot_counts
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
     nibg_cfg = config['data']['nibg']
11 changes: 10 additions & 1 deletion analysis/shared_parsers.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 from matplotlib import pyplot as plt
+from ruamel.yaml import CommentedMap
 from sklearn.linear_model import LinearRegression
 
 from models import BassDiffusionModel
@@ -226,7 +227,15 @@ def add_cumulative_counts(counts: SortedFileCount, format: str) -> SortedFileCou
     return counts
 
 
-def plot_counts(counts: SortedFileCount, cfg: dict) -> None:
+def plot_counts(counts: SortedFileCount, cfg: CommentedMap) -> None:
+    """
+    Plot `counts` following the configuration passed in `cfg`
+    :param counts: A dict containing time intervals and file counts for the intervals
+    :param cfg: A configuration section, one under "data" in config.yaml
+    :return: None
+    """
     output_dir = cfg['img_output_dir']
     num_tests = cfg['num_test_measurements']
2 changes: 1 addition & 1 deletion tests/test_common_crawler_metadata_parser.py
@@ -1,4 +1,4 @@
-from analysis.common_crawl import extract_year_ticks
+from analysis.shared_parsers import extract_year_ticks
 
 
 def test_year_ticks() -> None:
2 changes: 1 addition & 1 deletion tests/test_quarterly_counts.py
@@ -2,7 +2,7 @@
 import os
 
 from analysis.config import load_config
-from analysis.nibg_analysis import to_pruned_sorted_quarterly
+from analysis.shared_parsers import to_pruned_sorted_quarterly
 
 
 def test_quarterly_counts() -> None:
