Skip to content

Commit

Permalink
Improve separation between layers (#314)
Browse files Browse the repository at this point in the history
* Move Callback and TaskEndEvent to runtime package

* Move visit_nodes and visit_node_generations to runtime package

* Move CubedPipeline to runtime package

* Move spec_to_pipeline to rechunk module

* Move Spec to spec module
  • Loading branch information
tomwhite authored Oct 4, 2023
1 parent 01ef9f9 commit ea88519
Show file tree
Hide file tree
Showing 23 changed files with 334 additions and 326 deletions.
5 changes: 2 additions & 3 deletions cubed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@

from .array_api import Array
from .core.array import (
Callback,
Spec,
TaskEndEvent,
compute,
measure_reserved_mem,
measure_reserved_memory,
Expand All @@ -25,6 +22,8 @@
from .core.gufunc import apply_gufunc
from .core.ops import from_array, from_zarr, map_blocks, store, to_zarr
from .nan_functions import nanmean, nansum
from .runtime.types import Callback, TaskEndEvent
from .spec import Spec

__all__ = [
"__version__",
Expand Down
3 changes: 0 additions & 3 deletions cubed/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
# flake8: noqa
from .array import (
Callback,
CoreArray,
Spec,
TaskEndEvent,
compute,
gensym,
measure_reserved_mem,
Expand Down
179 changes: 4 additions & 175 deletions cubed/core/array.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from dataclasses import dataclass
from operator import mul
from typing import Optional, TypeVar, Union
from typing import Optional, TypeVar
from warnings import warn

import numpy as np
from toolz import map, reduce

from cubed.runtime.types import Executor
from cubed.runtime.types import Callback, Executor
from cubed.spec import Spec
from cubed.storage.zarr import open_if_lazy_zarr_array
from cubed.utils import chunk_memory, convert_to_bytes
from cubed.utils import chunk_memory
from cubed.vendor.dask.array.core import normalize_chunks

from .plan import arrays_to_plan
Expand Down Expand Up @@ -194,177 +194,6 @@ def __repr__(self):
return f"cubed.core.CoreArray<{self.name}, shape={self.shape}, dtype={self.dtype}, chunks={self.chunks}>"


class Spec:
    """Specification of resources available to run a computation."""

    def __init__(
        self,
        work_dir: Union[str, None] = None,
        max_mem: Union[int, None] = None,
        allowed_mem: Union[int, str, None] = None,
        reserved_mem: Union[int, str, None] = 0,
        executor: Union[Executor, None] = None,
        storage_options: Union[dict, None] = None,
    ):
        """
        Specify resources available to run a computation.

        Parameters
        ----------
        work_dir : str or None
            The directory path (specified as an fsspec URL) used for storing intermediate data.
        max_mem : int, optional
            **Deprecated**. The maximum memory available to a worker for data use for the computation, in bytes.
        allowed_mem : int or str, optional
            The total memory available to a worker for running a task, in bytes.
            If int it should be >=0. If str it should be of form <value><unit> where unit can be kB, MB, GB, TB etc.
            This includes any ``reserved_mem`` that has been set.
        reserved_mem : int or str, optional
            The memory reserved on a worker for non-data use when running a task, in bytes.
            If int it should be >=0. If str it should be of form <value><unit> where unit can be kB, MB, GB, TB etc.
        executor : Executor, optional
            The default executor for running computations.
        storage_options : dict, optional
            Storage options to be passed to fsspec.
        """
        # max_mem is retained only for backward compatibility.
        if max_mem is not None:
            warn(
                "`max_mem` is deprecated, please use `allowed_mem` instead",
                DeprecationWarning,
                stacklevel=2,
            )

        self._work_dir = work_dir
        self._reserved_mem = convert_to_bytes(reserved_mem or 0)
        # When allowed_mem is not given, derive it from the deprecated max_mem
        # plus the reserved amount; otherwise normalize the given value to bytes.
        self._allowed_mem = (
            (max_mem or 0) + self.reserved_mem
            if allowed_mem is None
            else convert_to_bytes(allowed_mem)
        )
        self._executor = executor
        self._storage_options = storage_options

    @property
    def work_dir(self) -> Optional[str]:
        """The directory path (specified as an fsspec URL) used for storing intermediate data."""
        return self._work_dir

    @property
    def allowed_mem(self) -> int:
        """
        The total memory available to a worker for running a task, in bytes.

        This includes any ``reserved_mem`` that has been set.
        """
        return self._allowed_mem

    @property
    def reserved_mem(self) -> int:
        """
        The memory reserved on a worker for non-data use when running a task, in bytes.

        See Also
        --------
        cubed.measure_reserved_mem
        """
        return self._reserved_mem

    @property
    def executor(self) -> Optional[Executor]:
        """The default executor for running computations."""
        return self._executor

    @property
    def storage_options(self) -> Optional[dict]:
        """Storage options to be passed to fsspec."""
        return self._storage_options

    def __repr__(self) -> str:
        return (
            f"cubed.Spec(work_dir={self._work_dir}, allowed_mem={self._allowed_mem}, "
            f"reserved_mem={self._reserved_mem}, executor={self._executor}, storage_options={self._storage_options})"
        )

    def __eq__(self, other):
        # Specs compare equal when every user-visible attribute matches;
        # anything that is not a Spec is simply unequal.
        if not isinstance(other, Spec):
            return False
        attrs = ("work_dir", "allowed_mem", "reserved_mem", "executor", "storage_options")
        return all(getattr(self, a) == getattr(other, a) for a in attrs)


class Callback:
    """Object to receive callback events during array computation.

    Subclasses override the hooks they care about; every hook here is a no-op.
    """

    def on_compute_start(self, dag, resume):
        """Called when the computation is about to start.

        Parameters
        ----------
        dag : networkx.MultiDiGraph
            The computation DAG.
        """
        pass  # pragma: no cover

    def on_compute_end(self, dag):
        """Called when the computation has finished.

        Parameters
        ----------
        dag : networkx.MultiDiGraph
            The computation DAG.
        """
        pass  # pragma: no cover

    def on_task_end(self, event):
        """Called when a task ends.

        Parameters
        ----------
        event : TaskEndEvent
            Information about the task execution.
        """
        pass  # pragma: no cover


@dataclass
class TaskEndEvent:
    """Callback information about a completed task (or tasks)."""

    # Name of the array that the task is for.
    array_name: str

    # Number of tasks that this event applies to (default 1).
    num_tasks: int = 1

    # Timestamp of when the task was created by the client.
    task_create_tstamp: Optional[float] = None

    # Timestamp of when the function started executing on the remote worker.
    function_start_tstamp: Optional[float] = None

    # Timestamp of when the function finished executing on the remote worker.
    function_end_tstamp: Optional[float] = None

    # Timestamp of when the result of the task was received by the client.
    task_result_tstamp: Optional[float] = None

    # Peak memory usage measured on the remote worker before the function starts executing.
    peak_measured_mem_start: Optional[int] = None

    # Peak memory usage measured on the remote worker after the function finishes executing.
    peak_measured_mem_end: Optional[int] = None


def check_array_specs(arrays):
specs = [a.spec for a in arrays if hasattr(a, "spec")]
if not all(s == specs[0] for s in specs):
Expand Down
26 changes: 2 additions & 24 deletions cubed/core/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import networkx as nx

from cubed.primitive.blockwise import can_fuse_pipelines, fuse
from cubed.primitive.types import CubedPipeline
from cubed.runtime.pipeline import already_computed
from cubed.runtime.pipeline import visit_nodes
from cubed.runtime.types import CubedPipeline
from cubed.storage.zarr import LazyZarrArray
from cubed.utils import chunk_memory, extract_stack_summaries, join_path, memory_repr

Expand Down Expand Up @@ -353,28 +353,6 @@ def new_temp_path(name, suffix=".zarr", spec=None):
return join_path(context_dir, f"{name}{suffix}")


def visit_nodes(dag, resume=None):
    """Return a generator that visits the nodes in the DAG in topological order."""
    node_data = dict(dag.nodes(data=True))
    # Walk nodes in dependency order, skipping any that are already materialized.
    for name in list(nx.topological_sort(dag)):
        if not already_computed(node_data[name], resume=resume):
            yield name, node_data[name]


def visit_node_generations(dag, resume=None):
    """Return a generator that visits the nodes in the DAG in groups of topological generations."""
    node_data = dict(dag.nodes(data=True))
    for generation in nx.topological_generations(dag):
        # Keep only the nodes in this generation that still need computing.
        pending = [
            (name, node_data[name])
            for name in generation
            if not already_computed(node_data[name], resume=resume)
        ]
        if pending:
            yield pending


def create_zarr_array(lazy_zarr_array, *, config=None):
"""Stage function for create."""
lazy_zarr_array.create(mode="a")
Expand Down
4 changes: 2 additions & 2 deletions cubed/extensions/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import pandas as pd

from cubed.core.array import Callback
from cubed.core.plan import visit_nodes
from cubed.runtime.pipeline import visit_nodes
from cubed.runtime.types import Callback


class HistoryCallback(Callback):
Expand Down
2 changes: 1 addition & 1 deletion cubed/extensions/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pylab
import seaborn as sns

from cubed.core.array import Callback
from cubed.runtime.types import Callback

sns.set_style("whitegrid")
pylab.switch_backend("Agg")
Expand Down
4 changes: 2 additions & 2 deletions cubed/extensions/tqdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from toolz import map

from cubed.core.array import Callback
from cubed.core.plan import visit_nodes
from cubed.runtime.pipeline import visit_nodes
from cubed.runtime.types import Callback


class TqdmProgressBar(Callback):
Expand Down
3 changes: 2 additions & 1 deletion cubed/primitive/blockwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
import zarr
from toolz import map

from cubed.runtime.types import CubedPipeline
from cubed.storage.zarr import T_ZarrArray, lazy_empty
from cubed.types import T_Chunks, T_DType, T_Shape, T_Store
from cubed.utils import chunk_memory, get_item, to_chunksize
from cubed.vendor.dask.array.core import normalize_chunks
from cubed.vendor.dask.blockwise import _get_coord_mapping, _make_dims, lol_product
from cubed.vendor.dask.core import flatten

from .types import CubedArrayProxy, CubedPipeline
from .types import CubedArrayProxy

sym_counter = 0

Expand Down
Loading

0 comments on commit ea88519

Please sign in to comment.