Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement dandi:files extractor #363

Merged
merged 11 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datalad_registry/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class BaseConfig(OperationConfig):
"bids_dataset",
# === DANDI related extractors ===
"dandi",
# "dandi:files", # Let's not activate this yet by default
"dandi:files",
]

# === worker, Celery, related configuration ===
Expand Down
33 changes: 32 additions & 1 deletion datalad_registry/tasks/utils/builtin_meta_extractors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# This file specifies custom metadata extractors, for datalad_registry, and related
# definitions.
from collections.abc import Callable
import json

from datalad.distribution.dataset import require_dataset
from yaml import load as yaml_load
Expand Down Expand Up @@ -81,8 +82,38 @@ def dlreg_dandi_files_meta_extract(url: RepoUrl) -> URLMetadata:
:return: A `URLMetadata` object containing the extracted metadata ready
:raises FileNotFoundError: If the `.dandi/assets.json` file is not found
at the dataset

Note: This function is meant to be called inside a Celery task for it requires
an active application context of the Flask app
Note: This function must be called with a RepoUrl object with a cache path, i.e.,
one that must have been processed already.
"""
raise NotImplementedError
name = "dandi:files" # Name of this extractor
version = "0.0.1" # Version of this extractor

assert url.cache_path_abs is not None, (
f"Encountered a RepoUrl with no cache path, "
f"with a processed flag set to {url.processed}"
)

with open(url.cache_path_abs / ".dandi/assets.json") as f:
extracted_metadata = json.load(f)

ds = require_dataset(
url.cache_path_abs,
check_installed=True,
purpose="dandiset files metadata extraction",
)

return URLMetadata(
dataset_describe=get_head_describe(ds),
dataset_version=ds.repo.get_hexsha(),
extractor_name=name,
extractor_version=version,
extraction_parameter={},
extracted_metadata=extracted_metadata,
url=url,
)


# A mapping from the names of the supported extractors to the functions
Expand Down
10 changes: 3 additions & 7 deletions datalad_registry/tests/test__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,9 @@ def test_configuration(
},
}

default_metadata_extractors = [
"metalad_core",
"metalad_studyminimeta",
"datacite_gin",
"bids_dataset",
"dandi",
]
default_metadata_extractors = BaseConfig.__fields__[
"DATALAD_REGISTRY_METADATA_EXTRACTORS"
].default

def mock_compile_config_from_env(*_args, **_kwargs):
# noinspection PyTypeChecker
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,31 @@ def test_no_document(self, dandi_repo_url_with_up_to_date_clone, flask_app):
dlreg_dandi_meta_extract(repo_url)


class TestDlregDandiFilesMetaExtract:
    def test_valid_input(self, dandi_repo_url_with_up_to_date_clone, flask_app):
        """
        Verify extraction succeeds when the given `RepoUrl` object has a
        valid corresponding dandi dataset clone in the local cache
        """
        from datalad_registry.tasks.utils.builtin_meta_extractors import (
            dlreg_dandi_files_meta_extract,
        )

        # Unpack the pieces of the fixture that this test relies on
        fixture = dandi_repo_url_with_up_to_date_clone
        repo_url = fixture[0]
        ds_clone = fixture[2]

        with flask_app.app_context():
            url_metadata = dlreg_dandi_files_meta_extract(repo_url)

        # The returned metadata must identify the dataset state at extraction time
        assert url_metadata.dataset_describe == get_head_describe(ds_clone)
        assert url_metadata.dataset_version == ds_clone.repo.get_hexsha()

        # ... and carry the extractor identity plus the extracted payload
        assert url_metadata.extractor_name == "dandi:files"
        assert url_metadata.extractor_version == "0.0.1"
        assert url_metadata.extraction_parameter == {}
        assert url_metadata.extracted_metadata == [{"asset_id": "123"}]
        assert url_metadata.url == repo_url


class TestDlregMetaExtract:
def test_unsupported_extractor(
self, dandi_repo_url_with_up_to_date_clone, flask_app
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.read-only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ services:
read-only-db:
condition: service_healthy
ports:
- "${READ_ONLY_WEB_PUBLISH_PORT}:5000"
- "${WEB_PORT_AT_HOST}:5000"
environment:
FLASK_APP: "datalad_registry:create_app"

Expand Down
8 changes: 4 additions & 4 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ services:
RABBITMQ_DEFAULT_USER: "${RABBITMQ_DEFAULT_USER}"
RABBITMQ_DEFAULT_PASS: "${RABBITMQ_DEFAULT_PASS}"
ports:
- "127.0.0.1:35672:5672"
- "127.0.0.1:45672:15672"
- "127.0.0.1:${BROKER_PORT_AT_HOST}:5672"
- "127.0.0.1:${BROKER_MANAGEMENT_PORT_AT_HOST}:15672"
userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly
# See https://github.com/containers/podman-compose/issues/166
# for details.
Expand All @@ -23,7 +23,7 @@ services:
backend:
image: docker.io/redis:7
ports:
- "127.0.0.1:36379:6379"
- "127.0.0.1:${BACKEND_PORT_AT_HOST}:6379"

db:
image: docker.io/postgres:15
Expand All @@ -33,7 +33,7 @@ services:
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
POSTGRES_INITDB_ARGS: --encoding utf8 --locale C
ports:
- "127.0.0.1:35432:5432"
- "127.0.0.1:${DB_PORT_AT_HOST}:5432"
healthcheck:
test: [ "CMD", "pg_isready", "-U", "${POSTGRES_USER}", "-d", "${POSTGRES_DB}", "-q" ]
interval: 30s
Expand Down
12 changes: 6 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ services:
db:
condition: service_healthy
ports:
- "5000:5000"
- "${WEB_PORT_AT_HOST}:5000"
environment: &env
FLASK_APP: "datalad_registry:create_app"

Expand Down Expand Up @@ -89,7 +89,7 @@ services:
FLOWER_NATURAL_TIME: "True"
FLOWER_BASIC_AUTH: "$FLOWER_BASIC_AUTH"
ports:
- "127.0.0.1:5555:5555"
- "127.0.0.1:${MONITOR_PORT_AT_HOST}:5555"
command: [ "/sbin/my_init", "--", "celery", "-A", "datalad_registry.make_celery:celery_app", "flower" ]
volumes:
- ${MONITOR_PATH_AT_HOST}/data:/data
Expand All @@ -108,8 +108,8 @@ services:
RABBITMQ_DEFAULT_PASS: "${RABBITMQ_DEFAULT_PASS}"
RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: "-rabbit consumer_timeout 43200000" # 12 hours in milliseconds
ports:
- "127.0.0.1:5672:5672"
- "127.0.0.1:15672:15672"
- "127.0.0.1:${BROKER_PORT_AT_HOST}:5672"
- "127.0.0.1:${BROKER_MANAGEMENT_PORT_AT_HOST}:15672"
userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly
# See https://github.com/containers/podman-compose/issues/166
# for details.
Expand All @@ -128,7 +128,7 @@ services:
backend:
image: docker.io/redis:7
ports:
- "127.0.0.1:6379:6379"
- "127.0.0.1:${BACKEND_PORT_AT_HOST}:6379"

db:
image: docker.io/postgres:15
Expand All @@ -138,7 +138,7 @@ services:
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
POSTGRES_INITDB_ARGS: --encoding utf8 --locale C
ports:
- "5432:5432"
- "${DB_PORT_AT_HOST}:5432"
userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly
# See https://github.com/containers/podman-compose/issues/166
# for details.
Expand Down
6 changes: 6 additions & 0 deletions env.test
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
# within the same host.
COMPOSE_PROJECT_NAME=dl-registry-test

# Ports of the services used for testing at host
BROKER_PORT_AT_HOST=35672
BROKER_MANAGEMENT_PORT_AT_HOST=45672
BACKEND_PORT_AT_HOST=36379
DB_PORT_AT_HOST=35432

# Variables related to the broker service
RABBITMQ_DEFAULT_USER=tester
RABBITMQ_DEFAULT_PASS=testpass
Expand Down
11 changes: 11 additions & 0 deletions template.env
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
# It is to be copied to a target file named `.env.dev` or `.env.prod`, and the target
# file is to be modified (changing usernames, passwords, etc.).

# The name of the Docker Compose project (stack)
COMPOSE_PROJECT_NAME=datalad-registry

# Bind mount paths at host
WEB_PATH_AT_HOST=./services/web
WORKER_PATH_AT_HOST=./services/worker
Expand All @@ -10,6 +13,14 @@ MONITOR_PATH_AT_HOST=./services/monitor
BROKER_PATH_AT_HOST=./services/broker
DB_PATH_AT_HOST=./services/db

# Ports of the services at host
WEB_PORT_AT_HOST=5000
MONITOR_PORT_AT_HOST=5555
BROKER_PORT_AT_HOST=5672
BROKER_MANAGEMENT_PORT_AT_HOST=15672
BACKEND_PORT_AT_HOST=6379
DB_PORT_AT_HOST=5432

DATALAD_REGISTRY_OPERATION_MODE=DEVELOPMENT # or PRODUCTION

# Variables related to the broker service
Expand Down
7 changes: 3 additions & 4 deletions template.env.read-only
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,13 @@ COMPOSE_PROJECT_NAME=dl-registry-read-only
WEB_PATH_AT_HOST=./services/read-only-web
DB_PATH_AT_HOST=./services/read-only-db

# Ports of the service(s) at host
WEB_PORT_AT_HOST=5000

# Variables related to the db service
POSTGRES_DB=pgdb
POSTGRES_USER=pguser
POSTGRES_PASSWORD=pgpass
# (Make sure that user name and password characters do not need to be escaped for URL format
# or to escape them properly if they do)
SQLALCHEMY_DATABASE_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@read-only-db:5432/${POSTGRES_DB}"


# The port on the host to which the read-only-web service will be bound
READ_ONLY_WEB_PUBLISH_PORT=5000
19 changes: 19 additions & 0 deletions tools/run_dandi_files_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# This script initiates Celery tasks to run the `dandi:files` extractor on each
# processed repo.

from sqlalchemy import select

from datalad_registry import create_app
from datalad_registry.models import RepoUrl, db
from datalad_registry.tasks import extract_ds_meta

flask_app = create_app()

with flask_app.app_context():

    # Fetch the IDs of all repo URLs that have already been processed
    id_query = select(RepoUrl.id).filter(RepoUrl.processed)
    processed_url_ids = db.session.execute(id_query).scalars().all()

    # Queue one metadata-extraction task per processed URL
    for url_id in processed_url_ids:
        extract_ds_meta.delay(url_id, "dandi:files")