Skip to content

Commit

Permalink
internal
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 675633763
  • Loading branch information
SeqIO Team authored and SeqIO committed Oct 3, 2024
1 parent bbb5ba3 commit 76c934b
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 1 deletion.
12 changes: 11 additions & 1 deletion seqio/beam_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ def __init__(
self._add_provenance = add_provenance
self._tfds_data_dir = tfds_data_dir
self._int64_max = 2**63 - 1
logging.info(
"kano 11: task/split %s/%s source:%s shards:%s",
self._task_name,
self._split,
type(task.source),
task.source.list_shards(split),
)
self.shards = list(enumerate(task.source.list_shards(split)))
if not self.shards:
raise FileNotFoundError(f"No shards found for {task.name} {split}")
Expand Down Expand Up @@ -139,7 +146,9 @@ def _emit_examples(self, shard: Tuple[int, str]):
)
# Truncate if still a large number.
shard_preprocessors_seed %= self._int64_max

logging.info(
"kano 300: _emit_examples gefore get_dataset %s", type(task.source)
)
ds = task.source.get_dataset(
split=self._split,
shard_info=seqio.ShardInfo(
Expand All @@ -148,6 +157,7 @@ def _emit_examples(self, shard: Tuple[int, str]):
shuffle=False,
seed=shard_preprocessors_seed,
)
logging.info("kano 301: _emit_examples after get_dataset")
ds = task.preprocess_precache(ds, seed=shard_preprocessors_seed)
ds = ds.prefetch(tf.data.AUTOTUNE)

Expand Down
6 changes: 6 additions & 0 deletions seqio/dataset_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,11 @@ def __init__(
@property
def splits(self):
  """Returns the splits of this source.

  Overrides since we can't call `info.splits` until after init.

  Explicitly configured splits (`self._splits`) take precedence. Only when
  they are missing or empty does this fall back to
  `self.tfds_dataset.info.splits`.
  """
  if self._splits:
    return self._splits
  # Resolve `info` only on the fallback path. Passing it as a logging
  # argument unconditionally would evaluate it on every access, even when
  # explicit splits make the lookup unnecessary.
  tfds_info = self.tfds_dataset.info
  logging.info(
      "kano 14: splits: lazy lookup %s %s",
      self.tfds_dataset,
      tfds_info,
  )
  return tfds_info.splits

@property
Expand Down Expand Up @@ -1280,6 +1285,7 @@ def output_features(self) -> Mapping[str, Feature]:

@property
def splits(self) -> Sequence[str]:
logging.info("kano 14: %s source %s", self, self.source)
s = self.source.splits
if not s:
raise ValueError(f"Task {self.name} has no splits")
Expand Down
12 changes: 12 additions & 0 deletions seqio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,23 +320,35 @@ def builder(self):
def _get_builder(self, split: Optional[str] = None):
  """Returns the DatasetBuilder for this TFDS dataset.

  Builders are memoized in `LazyTfdsLoader._MEMOIZED_BUILDERS`, keyed by the
  value of `self._get_builder_key(dataset, data_dir)`.

  Args:
    split: optional split name, forwarded to `self.get_split_params` to
      resolve the dataset name and data directory.

  Returns:
    The memoized builder for the resolved dataset / data directory.

  Raises:
    ValueError: if no dataset name is resolved but `self._builder_kwargs` is
      non-empty.
  """
  dataset, data_dir = self.get_split_params(split)
  logging.info("kano 14: builder: %s %s", dataset, data_dir)
  cache = LazyTfdsLoader._MEMOIZED_BUILDERS
  cache_key = self._get_builder_key(dataset, data_dir)

  # Fast path: a builder for this key was already created.
  if cache_key in cache:
    return cache[cache_key]

  if not dataset:
    # Without a dataset name the builder is read straight from `data_dir`,
    # so extra builder kwargs are rejected.
    if self._builder_kwargs:
      raise ValueError(
          "`builder_kwargs` should be empty when `dataset` value is not"
          " present."
      )
    logging.info(
        "kano 14: loading from dir builder: %s %s", dataset, data_dir
    )
    loaded = tfds.builder_from_directory(data_dir)
  else:
    extra_kwargs = self._builder_kwargs or {}
    logging.info(
        "kano 14: loading from dir builder: %s %s %s",
        dataset,
        data_dir,
        extra_kwargs,
    )
    loaded = tfds.builder(dataset, data_dir=data_dir, **extra_kwargs)

  logging.info("kano 14: builder: %s %s", loaded, loaded.info)
  cache[cache_key] = loaded
  return cache[cache_key]

@property
def info(self):
  """Returns the `info` attribute of this loader's builder."""
  # Resolve the builder once and reuse it for both the log line and the
  # return value, instead of evaluating `self.builder` twice.
  builder = self.builder
  logging.info("kano 14: info: %s %s", type(self), type(builder))
  return builder.info

def _map_split(self, split: str) -> Optional[str]:
Expand Down

0 comments on commit 76c934b

Please sign in to comment.