-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #242 from waveygang/specify-queries
Specify queries
- Loading branch information
Showing
9 changed files
with
248 additions
and
69 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import argparse | ||
import gzip | ||
import itertools | ||
|
||
def parse_fasta_index(fasta_file): | ||
fai_file = fasta_file + '.fai' | ||
sequences = [] | ||
with open(fai_file, 'r') as file: | ||
for line in file: | ||
sequence_name = line.strip().split('\t')[0] | ||
sequences.append(sequence_name) | ||
return sequences | ||
|
||
def group_sequences(sequences, grouping): | ||
grouped_sequences = {} | ||
for sequence in sequences: | ||
if '#' in sequence: | ||
fields = sequence.split('#') | ||
if grouping in ['g', 'genome']: | ||
group_key = fields[0] | ||
elif grouping in ['h', 'haplotype']: | ||
group_key = '#'.join(fields[:2]) | ||
elif grouping in ['c', 'contig']: | ||
group_key = sequence | ||
else: | ||
raise ValueError(f"Invalid grouping: {grouping}") | ||
else: | ||
group_key = sequence | ||
|
||
if group_key not in grouped_sequences: | ||
grouped_sequences[group_key] = [] | ||
grouped_sequences[group_key].append(sequence) | ||
|
||
return grouped_sequences | ||
|
||
def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries): | ||
pairings = [] | ||
target_groups = list(target_grouped_sequences.keys()) | ||
query_groups = list(query_grouped_sequences.keys()) | ||
|
||
for target_group in target_groups: | ||
query_pool = [group for group in query_groups if group != target_group] | ||
|
||
for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries): | ||
query_chunk = [q for q in query_chunk if q is not None] | ||
pairings.append((target_group, query_chunk)) | ||
|
||
return pairings | ||
|
||
def main(): | ||
parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.') | ||
parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') | ||
parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') | ||
parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)') | ||
parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)') | ||
parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines') | ||
|
||
args, wfmash_args = parser.parse_known_args() | ||
|
||
# Parse the FASTA index file | ||
sequences = parse_fasta_index(args.fasta_file) | ||
|
||
# Group sequences based on the specified grouping levels | ||
target_grouped_sequences = group_sequences(sequences, args.target_grouping) | ||
query_grouped_sequences = group_sequences(sequences, args.query_grouping) | ||
|
||
# Generate pairings | ||
pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries) | ||
|
||
# Save or print the pairings or command lines | ||
if wfmash_args: | ||
wfmash_options = ' '.join(wfmash_args) | ||
if args.output: | ||
with open(args.output, 'w') as file: | ||
for target_group, query_groups in pairings: | ||
file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n") | ||
else: | ||
for target_group, query_groups in pairings: | ||
print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}") | ||
else: | ||
if args.output: | ||
with open(args.output, 'w') as file: | ||
for target_group, query_groups in pairings: | ||
file.write(f"{target_group}\t{','.join(query_groups)}\n") | ||
else: | ||
for target_group, query_groups in pairings: | ||
print(f"{target_group}\t{','.join(query_groups)}") | ||
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash | ||
|
||
# run from root of the repository | ||
|
||
mkdir source-tarball | ||
cd source-tarball | ||
git clone --recursive https://github.com/waveygang/wfmash | ||
cd wfmash | ||
git fetch --tags origin | ||
LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)" | ||
git checkout "${LATEST_TAG}" | ||
git submodule update --init --recursive | ||
bash scripts/generate_git_version.sh $PWD/src | ||
sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i | ||
rm -Rf .git | ||
find src/common -name ".git" -exec rm -Rf "{}" \; | ||
cd .. | ||
mv wfmash "wfmash-${LATEST_TAG}" | ||
tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}" | ||
rm -Rf "wfmash-${LATEST_TAG}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.