Skip to content

Commit 9b128bf

Browse files
fineguy and The TensorFlow Datasets Authors authored and The TensorFlow Datasets Authors committed
Support Beam in Croissant preparation.
PiperOrigin-RevId: 679042687
1 parent 3ab829c commit 9b128bf

File tree

3 files changed

+20 -15 lines changed

3 files changed

+20 -15 lines changed

tensorflow_datasets/scripts/cli/build.py

+2 -15
Original file line number | Diff line number | Diff line change
@@ -356,6 +356,7 @@ def _download_and_prepare(
356356
publish_dir=args.publish_dir,
357357
skip_if_published=args.skip_if_published,
358358
overwrite=args.overwrite,
359+
beam_pipeline_options=args.beam_pipeline_options,
359360
)
360361

361362

@@ -384,7 +385,7 @@ def _make_download_config(
384385
if args.update_metadata_only:
385386
kwargs['download_mode'] = tfds.download.GenerateMode.UPDATE_DATASET_INFO
386387

387-
dl_config = tfds.download.DownloadConfig(
388+
return tfds.download.DownloadConfig(
388389
extract_dir=args.extract_dir,
389390
manual_dir=manual_dir,
390391
max_examples_per_split=args.max_examples_per_split,
@@ -393,20 +394,6 @@ def _make_download_config(
393394
**kwargs,
394395
)
395396

396-
# Add Apache Beam options to download config
397-
try:
398-
import apache_beam as beam # pylint: disable=g-import-not-at-top
399-
except ImportError:
400-
beam = None
401-
402-
if beam is not None:
403-
if args.beam_pipeline_options:
404-
dl_config.beam_options = beam.options.pipeline_options.PipelineOptions(
405-
flags=[f'--{opt}' for opt in args.beam_pipeline_options.split(',')]
406-
)
407-
408-
return dl_config
409-
410397

411398
def _get_config_name(
412399
builder_cls: Type[tfds.core.DatasetBuilder],

tensorflow_datasets/scripts/cli/cli_utils.py

+17
Original file line number | Diff line number | Diff line change
@@ -299,6 +299,7 @@ def download_and_prepare(
299299
publish_dir: epath.Path | None,
300300
skip_if_published: bool,
301301
overwrite: bool,
302+
beam_pipeline_options: str | None,
302303
) -> None:
303304
"""Generate a single builder."""
304305
dataset = builder.info.full_name
@@ -312,6 +313,22 @@ def download_and_prepare(
312313
)
313314
return
314315

316+
if not download_config:
317+
download_config = download.DownloadConfig()
318+
319+
# Add Apache Beam options to download config
320+
try:
321+
import apache_beam as beam # pylint: disable=g-import-not-at-top
322+
323+
if beam_pipeline_options:
324+
download_config.beam_options = (
325+
beam.options.pipeline_options.PipelineOptions(
326+
flags=[f'--{opt}' for opt in beam_pipeline_options.split(',')]
327+
)
328+
)
329+
except ImportError:
330+
pass
331+
315332
builder.download_and_prepare(
316333
download_dir=download_dir,
317334
download_config=download_config,

tensorflow_datasets/scripts/cli/croissant.py

+1
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,7 @@ def prepare_croissant_builder(
155155
publish_dir=args.publish_dir,
156156
skip_if_published=args.skip_if_published,
157157
overwrite=args.overwrite,
158+
beam_pipeline_options=None,
158159
)
159160
return builder
160161

0 commit comments

Comments
 (0)