switch to strict type checking, fix type errors
reinvantveer committed Sep 9, 2024
1 parent f1e15c8 commit 09c1f7a
Showing 14 changed files with 70 additions and 42 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -5,4 +5,4 @@ tests:
 
 check:
 	pipenv run flake8 .
-	pipenv run mypy .
+	pipenv run mypy . --strict
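
For context: --strict is shorthand for a bundle of stricter mypy flags, among them --disallow-untyped-defs and --disallow-any-generics, which is what drives most of the changes below. A minimal sketch, not from this repo, of code that plain "mypy ." accepts but "mypy . --strict" rejects:

def parse(raw):  # --strict: error, function is missing a type annotation
    return raw.split(',')

counts: dict = {}  # --strict: error, missing type parameters for generic type "dict"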
13 changes: 7 additions & 6 deletions analysis/common_crawl.py
@@ -12,9 +12,10 @@
 import numpy as np
 from matplotlib import pyplot as plt
 from numpy.lib.stride_tricks import sliding_window_view
+from ruamel.yaml import CommentedMap
 from sklearn.linear_model import LinearRegression
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 from analysis.shared_parsers import extract_year_ticks
 from models.bass_diffusion import BassDiffusionModel
 
@@ -30,7 +31,7 @@
 ModelStats = List[Dict[str, Union[str, float]]]
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     crawl_cfg = config['data']['common_crawl']
@@ -82,15 +83,15 @@ def parse_csv(stats: List[Dict[str, str]]) -> StatsDictTable:
     return stats_dict
 
 
-def filter_declining(typed_stats: StatsDictTable) -> MimeDict:
+def filter_declining(typed_stats: StatsDictTable) -> dict[str, list[dict[str, int]]]:
     """
     Filters the list of statistics for MIME types that decline over the last year
     :param typed_stats: a list of dictionaries with typed values
     :return: a dictionary of mime types with declining counts, with usage
     """
-    declining_mime_types: dict = {}
+    declining_mime_types: dict[str, list[dict[str, int]]] = {}
 
     # First: "de-normalize" the table into a nested dictionary of mime types with page counts per crawl
     # This is easier to handle: we want to analyse statistics per mime type, over the years
@@ -107,7 +108,7 @@ def filter_declining(typed_stats: StatsDictTable) -> MimeDict:
     )
 
     mime_types = list(declining_mime_types.keys())
-    mime_declines = []
+    mime_declines: list[dict[str, str | int]] = []
 
     for mime_type in mime_types:
         crawl_stats = declining_mime_types[mime_type]
@@ -138,7 +139,7 @@ def filter_declining(typed_stats: StatsDictTable) -> MimeDict:
     return declining_mime_types
 
 
-def analyse(stats: MimeDict, collection_metadata: List[Dict[str, str]], config: Config) -> ModelStats:
+def analyse(stats: MimeDict, collection_metadata: List[Dict[str, str]], config: CommentedMap) -> ModelStats:
     error_stats: ModelStats = []
     # Extract out shorthand for long dict value
     cc_cfg = config['data']['common_crawl']
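
A note on the new annotation style: dict[str, list[dict[str, int]]] uses the built-in generics of PEP 585 and str | int uses the PEP 604 union syntax, which need Python 3.9 and 3.10 respectively wherever annotations are evaluated at runtime. A sketch of the equivalence with the older typing spellings still used elsewhere in this file:

from typing import Dict, List, Union

# Pre-3.9 spelling, as in the Union[str, float] alias above
declines_old: List[Dict[str, Union[str, int]]] = []

# Spelling this commit introduces (3.9+ builtins, 3.10+ unions)
declines_new: list[dict[str, str | int]] = []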
7 changes: 2 additions & 5 deletions analysis/config.py
@@ -5,10 +5,7 @@
 from os.path import expanduser
 from typing import Optional
 
-from ruamel.yaml import YAML
-
-# Type alias for config type
-Config = dict
+from ruamel.yaml import YAML, CommentedMap
 
 
 # Configure logging
@@ -18,7 +15,7 @@
 def load_config(
         path: str = 'config.yaml',
         run_id: Optional[str] = None,
-        artifact_folder: Optional[str] = None) -> Config:
+        artifact_folder: Optional[str] = None) -> CommentedMap:
     """
     Loads 'config.yaml' from the current working directory, or somewhere else if specified
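
With the Config = dict alias gone, the return annotation now names what ruamel.yaml actually produces: its round-trip loader yields a CommentedMap, a dict subclass that preserves comments and key order. A minimal sketch of the relationship, assuming a config.yaml in the working directory:

from ruamel.yaml import YAML, CommentedMap

yaml = YAML()  # default 'rt' (round-trip) mode
with open('config.yaml') as f:
    config: CommentedMap = yaml.load(f)

# A CommentedMap supports plain dict-style access
print(config['data'])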
22 changes: 16 additions & 6 deletions analysis/dans_aggregate.py
@@ -11,15 +11,16 @@
 from typing import Dict
 
 from jsonpath_ng.ext import parse
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 
 # Unordered key/val
 FileTimeStats = Dict[str, Dict[str, int]]
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     """
     Iterates over all pages in the datasets index of the Archaeology Datastation (~120k results in ~12k pages)
@@ -70,7 +71,10 @@ def main(config: Config) -> int:
     return 0
 
 
-def explain_valid_dataset(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
+def explain_valid_dataset(
+        ds_metadata: dict,  # type: ignore[type-arg]
+        dans_cfg: Dict[str, str]
+) -> str:
     """
     Analyses a metadata record from the archaeology datastation REST API to validate it for usage in this analysis
@@ -108,7 +112,10 @@ def explain_valid_dataset(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
     return 'Valid'
 
 
-def extract_content_type_counts(ds_metadata: dict, dans_cfg: dict) -> Dict[str, int]:
+def extract_content_type_counts(
+        ds_metadata: dict,  # type: ignore[type-arg]
+        dans_cfg: CommentedMap
+) -> Dict[str, int]:
     """
     Collects the filenames of the first version of the dataset.
@@ -137,7 +144,10 @@ def extract_content_type_counts(ds_metadata: dict, dans_cfg: dict) -> Dict[str,
     return content_types
 
 
-def extract_year_month(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
+def extract_year_month(
+        ds_metadata: dict,  # type: ignore[type-arg]
+        dans_cfg: dict[str, str]
+) -> str:
     """
     Collects the correct date for the first version files
     It aggregates the file metadata into a counter per file type, per month
@@ -155,7 +165,7 @@ def extract_year_month(ds_metadata: dict, dans_cfg: Dict[str, str]) -> str:
 
     queried_date = matches[0].value
 
-    return queried_date[:7]
+    return str(queried_date[:7])
 
 
 if __name__ == '__main__':
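
A side note on the # type: ignore[type-arg] markers: --disallow-any-generics, part of --strict, rejects a bare dict annotation, and these REST API payloads have no fixed shape, so the commit suppresses exactly that error code on the offending parameter. A hedged sketch of the two available options (function names here are illustrative, not from the repo):

from typing import Any, Dict

# Option 1, as used in this commit: keep the bare annotation, silence the error
def summarize(payload: dict) -> str:  # type: ignore[type-arg]
    return str(len(payload))

# Option 2: parameterize with Any, which --strict also accepts
def summarize_typed(payload: Dict[str, Any]) -> str:
    return str(len(payload))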
7 changes: 4 additions & 3 deletions analysis/dans_analysis.py
@@ -7,13 +7,14 @@
 
 import numpy as np
 from matplotlib import pyplot as plt
+from ruamel.yaml import CommentedMap
 
-from analysis.config import Config, load_config
+from analysis.config import load_config
 from analysis.shared_parsers import PeriodicFiletypeCount, plot_counts, to_sorted_yearly, SortedFileCount, \
     all_filetype_counts, extract_year_ticks, add_cumulative_counts
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     dans_cfg = config['data']['dans']
@@ -78,7 +79,7 @@ def main(config: Config) -> int:
     return 0
 
 
-def filter_stats(yearly_stats: SortedFileCount, dans_cfg: dict) -> List[str]:
+def filter_stats(yearly_stats: SortedFileCount, dans_cfg: CommentedMap) -> List[str]:
     keep_filetypes: List[str] = []
     for filetype, yearly_counts in yearly_stats.items():
         # We do the exercise below because the mime types included in the "mime_plots" list was decided based on the
15 changes: 10 additions & 5 deletions analysis/dans_scrape.py
@@ -26,13 +26,14 @@
 from typing import List, Dict, Optional
 
 from bs4 import BeautifulSoup
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import Config, load_config
+from analysis.config import load_config
 from analysis.loaders_dumpers import get
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     """
     Iterates over all pages in the datasets index of the Archaeology Datastation (~120k results in ~12k pages)
@@ -89,7 +90,7 @@ def main(config: Config) -> int:
     return 0
 
 
-def dois_from_results(page_num: int, conn: HTTPSConnection, dans_cfg: dict) -> List[str]:
+def dois_from_results(page_num: int, conn: HTTPSConnection, dans_cfg: CommentedMap) -> List[str]:
     """
     Processes a specific results page indicated by `page_num` from the main Archaeology Datastation datasets index
@@ -128,7 +129,11 @@ def extract_dois(res_text: str) -> List[str]:
     return dois
 
 
-def scrape_version_metadata(doi: str, conn: HTTPSConnection, dans_cfg: dict) -> Optional[dict]:
+def scrape_version_metadata(
+        doi: str,
+        conn: HTTPSConnection,
+        dans_cfg: CommentedMap
+) -> Optional[dict[str, list[dict[str, int]]]]:
     """
     Extracts a list of original filenames and a deposit date for a dataset designated by `doi`
@@ -158,7 +163,7 @@ def scrape_version_metadata(doi: str, conn: HTTPSConnection, dans_cfg: dict) ->
     url = root_url + versions_subpath.format(doi=doi)
     versions = json.loads(get(url, conn))
 
-    return versions
+    return dict(versions)
 
 
 if __name__ == '__main__':
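
The dict(versions) wrapper above, like str(queried_date[:7]) earlier, looks redundant at runtime, but json.loads and jsonpath matches are typed as Any, and --strict enables --warn-return-any, which rejects returning Any from a function with a declared return type. Passing the value through the constructor gives mypy a concrete type. A minimal sketch with illustrative names:

import json

def fetch_mapping(raw: str) -> dict[str, int]:
    payload = json.loads(raw)  # inferred as Any
    # 'return payload' would fail --strict: returning Any from a typed function
    return dict(payload)  # the dict() call narrows the result for mypy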
7 changes: 4 additions & 3 deletions analysis/kb_aggregate.py
@@ -9,16 +9,17 @@
 from argparse import ArgumentParser
 from typing import List
 
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     kb_cfg = config['data']['kb']
-    file_temp_stats: dict = {}
+    file_temp_stats: dict[str, dict[str, int]] = {}
     skipped_records = 0
 
     with open(kb_cfg['raw_csv_path'], 'rt') as f:
6 changes: 4 additions & 2 deletions analysis/kb_analysis.py
@@ -8,7 +8,9 @@
 from argparse import ArgumentParser
 from typing import Dict, List, TypedDict
 
-from analysis.config import load_config, Config
+from ruamel.yaml import CommentedMap
+
+from analysis.config import load_config
 from analysis.shared_parsers import next_year_quarter, plot_counts
 
 Filetype = str
@@ -23,7 +25,7 @@ class PeriodCount(TypedDict):
 SortedFileCount = Dict[Filetype, List[PeriodCount]]
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
     kb_cfg = config['data']['kb']
2 changes: 1 addition & 1 deletion analysis/loaders_dumpers.py
@@ -3,7 +3,7 @@
 
 from retry import retry
 
 
-@retry(tries=3, delay=1, backoff=2)
+@retry(tries=3, delay=1, backoff=2)  # type: ignore[misc]
 def get(url: str, conn: HTTPSConnection) -> str:
     """
     Simple helper function to get the text as utf-8 from a url
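
The retry package ships no type information, so under --strict mypy reports that the untyped decorator makes get untyped (error code misc); the ignore is scoped to that one code. If one wanted to avoid the ignore at the call site, a typed wrapper is a common alternative; a sketch under that assumption, with typed_retry as a hypothetical helper not present in this repo:

from typing import Callable, TypeVar, cast

from retry import retry  # type: ignore[import-untyped]

F = TypeVar('F', bound=Callable[..., object])

def typed_retry(tries: int, delay: int, backoff: int) -> Callable[[F], F]:
    # Cast the untyped decorator to one that preserves the wrapped signature
    return cast(Callable[[F], F], retry(tries=tries, delay=delay, backoff=backoff))

@typed_retry(tries=3, delay=1, backoff=2)
def get_with_types(url: str) -> str:
    ...  # keeps its annotations under --strict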
10 changes: 5 additions & 5 deletions analysis/nibg_aggregate.py
@@ -7,18 +7,18 @@
 import logging
 import os.path
 from argparse import ArgumentParser
-from typing import List
 
+from ruamel.yaml import CommentedMap
 from tqdm import tqdm
 
-from analysis.config import load_config, Config
+from analysis.config import load_config
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
 
     nibg_cfg = config['data']['nibg']
-    file_temp_stats: dict = {}
+    file_temp_stats: dict[str, dict[str, int]] = {}
    skipped_records = 0
 
     with open(nibg_cfg['raw_csv_path'], 'rt') as f:
@@ -54,7 +54,7 @@ def main(config: Config) -> int:
 
     # Prune stats for formats that have at least 10 entries
     formats = list(file_temp_stats.keys())
-    dropped_formats: List[str] = []
+    dropped_formats: list[str] = []
     min_measurements = nibg_cfg['minimum_time_periods']
 
     for extension in formats:
6 changes: 4 additions & 2 deletions analysis/nibg_analysis.py
@@ -4,11 +4,13 @@
 import os
 from argparse import ArgumentParser
 
-from analysis.config import load_config, Config
+from ruamel.yaml import CommentedMap
+
+from analysis.config import load_config
 from analysis.shared_parsers import to_pruned_sorted_quarterly, plot_counts
 
 
-def main(config: Config) -> int:
+def main(config: CommentedMap) -> int:
     start = datetime.datetime.now()
     nibg_cfg = config['data']['nibg']
11 changes: 10 additions & 1 deletion analysis/shared_parsers.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 from matplotlib import pyplot as plt
+from ruamel.yaml import CommentedMap
 from sklearn.linear_model import LinearRegression
 
 from models import BassDiffusionModel
@@ -226,7 +227,15 @@ def add_cumulative_counts(counts: SortedFileCount, format: str) -> SortedFileCou
     return counts
 
 
-def plot_counts(counts: SortedFileCount, cfg: dict) -> None:
+def plot_counts(counts: SortedFileCount, cfg: CommentedMap) -> None:
+    """
+    Plot `counts` following the configuration passed in `cfg`
+    :param counts: A dict containing time intervals and file counts for the intervals
+    :param cfg: A configuration section, one under "data" in config.yaml
+    :return: None
+    """
     output_dir = cfg['img_output_dir']
     num_tests = cfg['num_test_measurements']
2 changes: 1 addition & 1 deletion tests/test_common_crawler_metadata_parser.py
@@ -1,4 +1,4 @@
-from analysis.common_crawl import extract_year_ticks
+from analysis.shared_parsers import extract_year_ticks
 
 
 def test_year_ticks() -> None:
2 changes: 1 addition & 1 deletion tests/test_quarterly_counts.py
@@ -2,7 +2,7 @@
 import os
 
 from analysis.config import load_config
-from analysis.nibg_analysis import to_pruned_sorted_quarterly
+from analysis.shared_parsers import to_pruned_sorted_quarterly
 
 
 def test_quarterly_counts() -> None:
