Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data tool. #100

Merged
merged 4 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,8 @@ test:
sphinx:
poetry run make -C docs clean html

open-sphinx:
open docs/build/html/index.html

install:
poetry lock && poetry install --all-extras
6 changes: 6 additions & 0 deletions docs/source/api-reference/data.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.. _data_code_doc:

:mod:`~mltb2.data`
==================

.. automodule:: mltb2.data
111 changes: 111 additions & 0 deletions mltb2/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (c) 2020 - 2023 Philip May
# Copyright (c) 2021 Sigrun May, Helmholtz-Zentrum für Infektionsforschung GmbH (HZI)
# Copyright (c) 2021 Sigrun May, Ostfalia Hochschule für angewandte Wissenschaften
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Data loading functionality."""

import os
from hashlib import sha256
from typing import Tuple

import joblib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from mltb2.files import get_and_create_mltb2_data_dir


def _load_colon_data() -> pd.DataFrame:
    """Load colon data (not the labels).

    The data is downloaded from the internet, verified against a pinned
    SHA-256 checksum and then parsed from the HTML page text.
    Also see `colon tissues probed by oligonucleotide arrays
    <http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.

    Returns:
        Data as pandas DataFrame with 62 rows and 2000 columns
        (the table parsed from the page, transposed).

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the checksum of the downloaded page or the shape of
            the parsed table is not the expected one.
    """
    # download data file
    url = "http://genomics-pubs.princeton.edu/oncology/affydata/I2000.html"
    page = requests.get(url, timeout=10)
    # surface the real HTTP error instead of a misleading checksum mismatch
    page.raise_for_status()

    # check checksum of data file
    # explicit raise instead of ``assert`` so the check survives ``python -O``
    expected_hash = "74cc7b47d40a0fbca8dde05f42bcb799b7babad29ea634139a221bb4386b1c3d"
    page_hash = sha256(page.content).hexdigest()
    if page_hash != expected_hash:
        raise ValueError(f"Unexpected checksum for '{url}': {page_hash}")

    soup = BeautifulSoup(page.content, "html.parser")
    page_text_lines = soup.get_text().splitlines()

    # only the long lines carry numeric rows; short lines are skipped
    rows = [[float(s) for s in line.split()] for line in page_text_lines if len(line) > 20]
    if len(rows) != 2000:
        raise ValueError(f"Expected 2000 data rows but found {len(rows)}.")
    if any(len(row) != 62 for row in rows):
        raise ValueError("Expected 62 values in every data row.")

    # transpose: the page stores 2000 rows of 62 values, the result is 62 x 2000
    data = np.array(rows).T
    return pd.DataFrame(data)


def _load_colon_label() -> pd.Series:
    """Load colon label (not the data).

    The labels are downloaded from the internet, verified against a pinned
    SHA-256 checksum and then parsed from the HTML page text.
    Also see `colon tissues probed by oligonucleotide arrays
    <http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.

    Every line of the page that is a plain integer yields one label:
    ``0`` if the integer is positive, ``1`` otherwise.
    NOTE(review): presumably ``1`` marks tumor tissue and ``0`` normal
    tissue - confirm against the dataset description.

    Returns:
        Labels as pandas Series with 62 entries.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the checksum of the downloaded page or the number of
            parsed labels is not the expected one.
    """
    # download data file
    url = "http://genomics-pubs.princeton.edu/oncology/affydata/tissues.html"
    page = requests.get(url, timeout=10)
    # surface the real HTTP error instead of a misleading checksum mismatch
    page.raise_for_status()

    # check checksum of data file
    # explicit raise instead of ``assert`` so the check survives ``python -O``
    expected_hash = "0c5b377c5dd5544d015bff479a4260d5ccf0bcf98657f600a1d37e34193e0f52"
    page_hash = sha256(page.content).hexdigest()
    if page_hash != expected_hash:
        raise ValueError(f"Unexpected checksum for '{url}': {page_hash}")

    soup = BeautifulSoup(page.content, "html.parser")
    page_text_lines = soup.get_text().splitlines()

    label = []
    for line in page_text_lines:
        try:
            value = int(line)
        except ValueError:
            continue  # not an integer line (headings, blank lines, ...) - ignored
        label.append(0 if value > 0 else 1)

    if len(label) != 62:
        raise ValueError(f"Expected 62 labels but found {len(label)}.")
    return pd.Series(label)


def load_colon() -> Tuple[pd.Series, pd.DataFrame]:
    """Load colon data.

    On the first call the data is loaded and parsed from the internet and
    cached as ``colon.pkl.gz`` in the mltb2 data directory. Later calls
    read the cached file instead of downloading again.
    Also see `colon tissues probed by oligonucleotide arrays
    <http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_.

    Returns:
        Tuple containing labels and data.
    """
    filename = "colon.pkl.gz"
    mltb2_data_home = get_and_create_mltb2_data_dir()
    full_path = os.path.join(mltb2_data_home, filename)

    if os.path.exists(full_path):
        # NOTE: joblib.load unpickles the file and can therefore execute
        # arbitrary code - the cache directory must only contain trusted files.
        return joblib.load(full_path)

    data_df = _load_colon_data()
    label_series = _load_colon_label()
    result = (label_series, data_df)

    # Write to a temporary file first and move it into place afterwards, so an
    # interrupted dump never leaves a truncated cache file that would break
    # every following call.
    tmp_path = f"{full_path}.{os.getpid()}.tmp"
    try:
        joblib.dump(result, tmp_path, compress=("gzip", 3))
        os.replace(tmp_path, full_path)
    finally:
        # only still present if dump or replace failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return result
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,14 @@ tiktoken = {version = "*", optional = true}
safetensors = {version = "!=0.3.2", optional = true} # version 0.3.2 has poetry issues
openai = {version = "^0", optional = true}
pyyaml = {version = "*", optional = true}
pandas = {version = "*", optional = true}
beautifulsoup4 = {version = "*", optional = true}
joblib = {version = "*", optional = true}

[tool.poetry.extras]
files = ["platformdirs", "scikit-learn"]
fasttext = ["fasttext-wheel"]
data = ["platformdirs", "scikit-learn", "pandas", "beautifulsoup4", "joblib"]
optuna = ["optuna"]
plot = ["matplotlib"]
somajo = ["SoMaJo"]
Expand Down Expand Up @@ -131,6 +135,8 @@ ignore = [
"PLR0913", # Too many arguments to function call ({c_args} > {max_args})
"S106", # Possible hardcoded password assigned to argument: "{}"
"COM812", # Trailing comma missing
"S101", # Use of `assert` detected
"PLR2004", # Magic value used in comparison,
]

[tool.ruff.per-file-ignores]
Expand Down
32 changes: 32 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2023 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

import pandas as pd

from mltb2.data import _load_colon_data, _load_colon_label, load_colon


def test_load_colon_data():
    """The raw data loader returns a DataFrame of shape 62 x 2000."""
    data_df = _load_colon_data()
    assert data_df is not None
    assert isinstance(data_df, pd.DataFrame)
    n_rows, n_columns = data_df.shape
    assert (n_rows, n_columns) == (62, 2000)


def test_load_colon_label():
    """The raw label loader returns a Series with 62 entries."""
    label_series = _load_colon_label()
    assert label_series is not None
    assert isinstance(label_series, pd.Series)
    number_of_labels = len(label_series)
    assert number_of_labels == 62


def test_load_colon():
    """The public loader returns a (labels, data) tuple with the expected types and shapes."""
    loaded = load_colon()
    assert loaded is not None
    assert isinstance(loaded, tuple)
    assert len(loaded) == 2

    # unpack only after the length check above has passed
    labels, data = loaded
    assert isinstance(labels, pd.Series)
    assert isinstance(data, pd.DataFrame)
    assert labels.shape == (62,)
    assert data.shape == (62, 2000)