nextstrain · victorlin · Sep 4, 2024 · Sep 4, 2024 · Sep 4, 2024 · Sep 4, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,9 +5,11 @@
 ### Bug Fixes
 
 * filter: Previously, when `--subsample-max-sequences` was slightly lower than the number of groups, it was possible to fail with an uncaught `AssertionError`. Internal calculations have been adjusted to prevent this from happening. [#1588][] [#1598][] (@victorlin)
+* filter: Options `--output` and `-o` have been deprecated and will now show a warning message. See [DEPRECATED.md](./DEPRECATED.md) for details. [#1622][] (@victorlin)
 
 [#1588]: https://github.com/nextstrain/augur/issues/1588
 [#1598]: https://github.com/nextstrain/augur/issues/1598
+[#1622]: https://github.com/nextstrain/augur/pull/1622
 
 ## 25.4.0 (3 September 2024)
 

diff --git a/DEPRECATED.md b/DEPRECATED.md
@@ -4,6 +4,13 @@ These features are deprecated, which means they are no longer maintained and
 will go away in a future major version of Augur. They are currently still
 available for backwards compatibility, but should not be used in new code.
 
+## `augur filter` options `--output` and `-o`
+
+*Deprecated in version __NEXT__. Planned for removal February 2025 or after.*
+
+`--output` and `-o` are aliases to `--output-sequences`. They will be removed to
+avoid confusion with other output options.
+
 ## `xopen` major version 1
 
 *Deprecated in version __NEXT__ (July 2024). Planned for removal at the earliest with major version 26 not before November 2024*

diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -1,10 +1,11 @@
 """
 Filter and subsample a sequence set.
 """
-from augur.argparse_ import ExtendOverwriteDefault
+from augur.argparse_ import ExtendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
 from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
 from augur.filter.io import ACCEPTED_TYPES, column_type_pair
 from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN
+from augur.io.print import print_err
 from augur.types import EmptyOutputReportingMethod
 from . import constants
 
@@ -101,8 +102,8 @@ def register_arguments(parser):
     Since priorities represent relative values between strains, these values can be arbitrary.""")
     subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")
 
-    output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
-    output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
+    output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output-sequences, --output-metadata, --output-strains) is required")
+    output_group.add_argument('--output-sequences', help="filtered sequences in FASTA format")
     output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
     output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
     output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
@@ -114,6 +115,10 @@ def register_arguments(parser):
         default=EmptyOutputReportingMethod.ERROR,
         help="How should empty outputs be reported when no strains pass filtering and/or subsampling.")
 
+    deprecated_group = parser.add_argument_group("deprecated", "options to be removed in a future major version")
+    deprecated_group.add_argument('--output', metavar="FILE", help="alias to --output-sequences" + SKIP_AUTO_DEFAULT_IN_HELP)
+    deprecated_group.add_argument('-o', metavar="FILE", help="alias to --output-sequences" + SKIP_AUTO_DEFAULT_IN_HELP)
+
     parser.set_defaults(probabilistic_sampling=True)
 
 
@@ -127,6 +132,13 @@ def run(args):
     '''
     filter and subsample a set of sequences into an analysis set
     '''
+    if args.o:
+        print_err("WARNING: -o is deprecated. Use --output-sequences instead.")
+        args.output_sequences = args.o
+    if args.output:
+        print_err("WARNING: --output is deprecated. Use --output-sequences instead.")
+        args.output_sequences = args.output
+
     from .validate_arguments import validate_arguments
     # Validate arguments before attempting any I/O.
     validate_arguments(args)

diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -376,10 +376,10 @@ def run(args):
     # to update the set of strains to keep based on which strains are actually
     # available.
     if is_vcf:
-        if args.output:
+        if args.output_sequences:
             # Get the samples to be deleted, not to keep, for VCF
             dropped_samps = list(sequence_strains - valid_strains)
-            write_vcf(args.sequences, args.output, dropped_samps)
+            write_vcf(args.sequences, args.output_sequences, dropped_samps)
     elif args.sequences:
         sequences = read_sequences(args.sequences)
 
@@ -388,9 +388,9 @@ def run(args):
         # Even if we aren't emitting sequences, we track the observed strain
         # names in the sequence file as part of the single pass to allow
         # comparison with the provided sequence index.
-        if args.output:
+        if args.output_sequences:
             observed_sequence_strains = set()
-            with open_file(args.output, "wt") as output_handle:
+            with open_file(args.output_sequences, "wt") as output_handle:
                 for sequence in sequences:
                     observed_sequence_strains.add(sequence.id)
 

diff --git a/augur/filter/io.py b/augur/filter/io.py
@@ -160,8 +160,8 @@ def column_type_pair(input: str):
 
 def cleanup_outputs(args):
     """Remove output files. Useful when terminating midway through a loop of metadata chunks."""
-    if args.output:
-        _try_remove(args.output)
+    if args.output_sequences:
+        _try_remove(args.output_sequences)
     if args.output_metadata:
         _try_remove(args.output_metadata)
     if args.output_strains:

diff --git a/augur/filter/validate_arguments.py b/augur/filter/validate_arguments.py
@@ -19,11 +19,11 @@ def validate_arguments(args):
         Parsed arguments from argparse
     """
     # Don't allow sequence output when no sequence input is provided.
-    if args.output and not args.sequences:
+    if args.output_sequences and not args.sequences:
         raise AugurError("You need to provide sequences to output sequences.")
 
     # Confirm that at least one output was requested.
-    if not any((args.output, args.output_metadata, args.output_strains)):
+    if not any((args.output_sequences, args.output_metadata, args.output_strains)):
         raise AugurError("You need to select at least one output.")
 
     # Don't allow filtering on sequence-based information, if no sequences or

diff --git a/tests/functional/filter/cram/filter-deprecated-options.t b/tests/functional/filter/cram/filter-deprecated-options.t
@@ -0,0 +1,56 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Create files
+
+  $ cat >metadata.tsv <<~~
+  > strain	col1
+  > SEQ1	A
+  > SEQ2	B
+  > ~~
+
+  $ cat >sequences.fasta <<~~
+  > >SEQ1
+  > AAAA
+  > >SEQ2
+  > AAAA
+  > >SEQ3
+  > AAAA
+  > ~~
+
+--output alias to --output-sequences
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --sequences sequences.fasta \
+  >   --output filtered.fasta
+  WARNING: --output is deprecated. Use --output-sequences instead.
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  1 strain was dropped during filtering
+  	1 had no metadata
+  2 strains passed all filters
+
+  $ cat filtered.fasta
+  >SEQ1
+  AAAA
+  >SEQ2
+  AAAA
+
+-o alias to --output-sequences
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --sequences sequences.fasta \
+  >   -o filtered.fasta
+  WARNING: -o is deprecated. Use --output-sequences instead.
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  1 strain was dropped during filtering
+  	1 had no metadata
+  2 strains passed all filters
+
+  $ cat filtered.fasta
+  >SEQ1
+  AAAA
+  >SEQ2
+  AAAA
diff --git a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t
@@ -9,6 +9,6 @@ This should fail.
   >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
   >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --min-length 10000 \
-  >  --output filtered.fasta > /dev/null
+  >  --output-sequences filtered.fasta > /dev/null
   ERROR: You need to provide sequences to output sequences.
   [2]
diff --git a/tests/functional/filter/cram/filter-sequences-vcf.t b/tests/functional/filter/cram/filter-sequences-vcf.t
@@ -8,7 +8,7 @@ Filter TB strains from VCF and save as a list of filtered strains.
   >  --sequences "$TESTDIR/../data/tb.vcf.gz" \
   >  --metadata "$TESTDIR/../data/tb_metadata.tsv" \
   >  --min-date 2012 \
-  >  --output filtered.vcf \
+  >  --output-sequences filtered.vcf \
   >  --output-strains filtered_strains.txt > /dev/null
   Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
   162 strains were dropped during filtering

diff --git a/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t b/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t
@@ -13,6 +13,6 @@ This should fail, as probabilistic sampling is explicitly disabled.
   >  --subsample-max-sequences 5 \
   >  --subsample-seed 314159 \
   >  --no-probabilistic-sampling \
-  >  --output filtered.fasta
+  >  --output-sequences filtered.fasta
   ERROR: Asked to provide at most 5 sequences, but there are 8 groups.
   [2]