From 924113c6802a0ee805edb0355b5235854fd3b138 Mon Sep 17 00:00:00 2001 From: Philip May Date: Sun, 10 Dec 2023 10:45:49 +0100 Subject: [PATCH] Improve doc and typing. (#108) * Update module names * Update doc in fasttext.py * Update data source URLs * improve get_and_create_mltb2_data_dir doc * improve doc of chunk_md * improve doc of plot module --- mltb2/data.py | 17 ++++++----------- mltb2/fasttext.py | 10 +++++++--- mltb2/files.py | 8 ++++++-- mltb2/md.py | 25 +++++++++++++++++++++---- mltb2/openai.py | 2 +- mltb2/optuna.py | 2 +- mltb2/plot.py | 36 +++++++++++++++++++++++++++++++++--- mltb2/somajo.py | 2 +- mltb2/somajo_transformers.py | 2 +- mltb2/text.py | 2 +- mltb2/transformers.py | 2 +- 11 files changed, 79 insertions(+), 29 deletions(-) diff --git a/mltb2/data.py b/mltb2/data.py index d616c91..de7a277 100644 --- a/mltb2/data.py +++ b/mltb2/data.py @@ -4,7 +4,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""Data loading functionality. +"""Data loading module. Use pip to install the necessary dependencies for this module: ``pip install mltb2[data]`` @@ -28,8 +28,7 @@ def _load_colon_data() -> pd.DataFrame: """Load colon data (not the labels). The data is loaded and parsed from the internet. - Also see `colon tissues probed by oligonucleotide arrays - `_. + Also see ``_. Returns: data as pandas DataFrame @@ -60,8 +59,7 @@ def _load_colon_label() -> pd.Series: """Load colon label (not the data). The data is loaded and parsed from the internet. - Also see `colon tissues probed by oligonucleotide arrays - `_. + Also see ``_. Returns: labels as pandas Series @@ -96,8 +94,7 @@ def load_colon() -> Tuple[pd.Series, pd.DataFrame]: """Load colon data. The data is loaded and parsed from the internet. - Also see `colon tissues probed by oligonucleotide arrays - `_. + Also see ``_. Returns: Tuple containing labels and data. @@ -119,8 +116,7 @@ def load_colon() -> Tuple[pd.Series, pd.DataFrame]: def load_prostate() -> Tuple[pd.Series, pd.DataFrame]: """Load prostate data. - The data is loaded and parsed from `prostate data - `_. + The data is loaded and parsed from ``_. Returns: Tuple containing labels and data. @@ -166,8 +162,7 @@ def load_leukemia_big() -> Tuple[pd.Series, pd.DataFrame]: """Load leukemia (big) data. The data is loaded and parsed from the internet. - Also see `leukemia data - `_. + Also see ``_. Returns: Tuple containing labels and data. diff --git a/mltb2/fasttext.py b/mltb2/fasttext.py index 7b5cbbf..851ad8b 100644 --- a/mltb2/fasttext.py +++ b/mltb2/fasttext.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""fastText specific functionality. +"""fastText specific module. This module is based on `fastText `_. Use pip to install the necessary dependencies for this module: @@ -30,7 +30,11 @@ def __post_init__(self): @staticmethod def get_model_path_and_download() -> str: - """Get the model path and download it if needed.""" + """Get the model path and download it if needed. + + Returns: + The full path to the downloaded model file. + """ model_filename = "lid.176.bin" mltb2_data_home = get_and_create_mltb2_data_dir() model_full_path = os.path.join(mltb2_data_home, model_filename) @@ -47,7 +51,7 @@ def get_model_path_and_download() -> str: return model_full_path - def __call__(self, text, num_lang: int = 10): + def __call__(self, text: str, num_lang: int = 10): """Identify languages of a given text. Args: diff --git a/mltb2/files.py b/mltb2/files.py index 22e628a..468fbac 100644 --- a/mltb2/files.py +++ b/mltb2/files.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""File utils. +"""File utils module. Use pip to install the necessary dependencies for this module: ``pip install mltb2[files]`` @@ -20,6 +20,10 @@ def get_and_create_mltb2_data_dir(mltb2_base_data_dir: Optional[str] = None) -> str: """Return and create mltb data dir. + Args: + mltb2_base_data_dir: The base data directory. If ``None`` the default + user data directory is used. + Returns: The directory path. """ @@ -32,7 +36,7 @@ def get_and_create_mltb2_data_dir(mltb2_base_data_dir: Optional[str] = None) -> return mltb2_data_dir -def fetch_remote_file(dirname, filename, url, sha256_checksum) -> str: +def fetch_remote_file(dirname, filename, url: str, sha256_checksum: str) -> str: """Fetch a file from a remote URL. Args: diff --git a/mltb2/md.py b/mltb2/md.py index 5969c0c..5d21e5b 100644 --- a/mltb2/md.py +++ b/mltb2/md.py @@ -3,7 +3,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""Markdown specific functionality. +"""Markdown specific module. Use pip to install the necessary dependencies for this module: ``pip install mltb2[md]`` @@ -21,7 +21,14 @@ def _chunk_md_by_headline(md_text: str) -> List[str]: - """Chunk Markdown by headlines.""" + """Chunk Markdown by headlines. + + Args: + md_text: The Markdown text to be chunked. + + Returns: + The list of Markdown chunks. + """ positions: List[int] = [m.start() for m in re.finditer(_HEADLINE_REGEX, md_text)] # extend positions @@ -34,14 +41,24 @@ def _chunk_md_by_headline(md_text: str) -> List[str]: def chunk_md(md_text: str) -> List[str]: - """Chunk Markdown by headlines and merge isolated headlines.""" + """Chunk Markdown by headlines and merge isolated headlines. + + Merges isolated headlines with their corresponding subsequent paragraphs. + Headings isolated at the end of ``md_text`` (headings without content) are removed in this process. + + Args: + md_text: The Markdown text to be chunked. + + Returns: + The list of Markdown chunks. + """ md_chunks = _chunk_md_by_headline(md_text) merged_chunks = [] temp_merged_chunk = [] for chunk in md_chunks: temp_merged_chunk.append(chunk) - if "\n" in chunk: # content found + if "\n" in chunk: # content chunk found joined_content = "\n\n".join(temp_merged_chunk) merged_chunks.append(joined_content) temp_merged_chunk = [] diff --git a/mltb2/openai.py b/mltb2/openai.py index 293cde4..bdc3eb3 100644 --- a/mltb2/openai.py +++ b/mltb2/openai.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""OpenAI specific functionality. +"""OpenAI specific module. Use pip to install the necessary dependencies for this module: ``pip install mltb2[openai]`` diff --git a/mltb2/optuna.py b/mltb2/optuna.py index 7eed28c..75689c3 100644 --- a/mltb2/optuna.py +++ b/mltb2/optuna.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""Optuna specific functionality. +"""Optuna specific module. This module is based on `Optuna `_. Use pip to install the necessary dependencies for this module: diff --git a/mltb2/plot.py b/mltb2/plot.py index 7d83d67..875ace8 100644 --- a/mltb2/plot.py +++ b/mltb2/plot.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""A collection of plot tools. +"""Plot tools module. This module is based on `Matplotlib `_. Use pip to install the necessary dependencies for this module: @@ -78,10 +78,26 @@ def twin_axes_timeseries_plot( fig.tight_layout() -def boxplot(values, labels=None, title=None, xlabel=None, ylabel=None, vert=True): +def boxplot( + values, + labels=None, + title: Optional[str] = None, + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, + vert: bool = True, +): """Prints one or more boxplots in a single diagram. This function does not call `matplotlib.pyplot.plot()`. + + Args: + values: Values for the boxplot(s). + labels: Labels for the boxplot(s). + title: Title of the plot. + xlabel: Label for the x-axis. + ylabel: Label for the y-axis. + vert: If ``True`` (default), makes the boxes vertical. + If ``False``, makes horizontal boxes. """ _, ax = plt.subplots() @@ -106,10 +122,24 @@ def boxplot(values, labels=None, title=None, xlabel=None, ylabel=None, vert=True plt.xticks(rotation=90) -def boxplot_dict(values_dict, title=None, xlabel=None, ylabel=None, vert=True): +def boxplot_dict( + values_dict, + title: Optional[str] = None, + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, + vert: bool = True, +): """Create boxplot form dictionary. This function does not call `matplotlib.pyplot.plot()`. + + Args: + values_dict: Dictionary with values for the boxplot(s). + title: Title of the plot. + xlabel: Label for the x-axis. + ylabel: Label for the y-axis. + vert: If ``True`` (default), makes the boxes vertical. + If ``False``, makes horizontal boxes. """ values = [] labels = [] diff --git a/mltb2/somajo.py b/mltb2/somajo.py index 60d7d57..5b97c3a 100644 --- a/mltb2/somajo.py +++ b/mltb2/somajo.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""SoMaJo specific functionality. +"""SoMaJo specific module. This module is based on `SoMaJo `_. Use pip to install the necessary dependencies for this module: diff --git a/mltb2/somajo_transformers.py b/mltb2/somajo_transformers.py index 63a5d5a..18b76ef 100644 --- a/mltb2/somajo_transformers.py +++ b/mltb2/somajo_transformers.py @@ -3,7 +3,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""Hugging Face Transformers and SoMaJo specific functionality. +"""Hugging Face Transformers and SoMaJo specific module. This module is based on `Hugging Face Transformers `_ and diff --git a/mltb2/text.py b/mltb2/text.py index b4ef242..ed61399 100644 --- a/mltb2/text.py +++ b/mltb2/text.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""Text specific functionality.""" +"""Text specific module.""" from typing import Dict, Final, Tuple diff --git a/mltb2/transformers.py b/mltb2/transformers.py index 42a96f1..fcea39f 100644 --- a/mltb2/transformers.py +++ b/mltb2/transformers.py @@ -2,7 +2,7 @@ # This software is distributed under the terms of the MIT license # which is available at https://opensource.org/licenses/MIT -"""Hugging Face Transformers specific functionality. +"""Hugging Face Transformers specific module. This module is based on `Hugging Face Transformers `_.