Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ONT sample preprocessing #663

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@
!README.md
!CODE_OF_CONDUCT.md
!CONTRIBUTING.md
!barcode-rename.csv
1 change: 1 addition & 0 deletions barcode-rename.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sample_name,barcode
5 changes: 5 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ data-handling:
# the results from the latest run to
archive: ../archive/

#path to barcode folders
barcode_dir: ../barcode_dir/
#path to store renamed FASTQ files
output_dir: data/

quality-criteria:
illumina:
# minimal length of acceptable reads
Expand Down
14 changes: 14 additions & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,20 @@ The structure can be adjusted via the config file: `config/config.yaml` under
- **archive**: path to archive data from the results from the analysis to.
Defaults to `../archive/`.

### ONT sample preprocessing
Automates the extraction of sequencing reads from barcode-specific folders: it merges the reads of each
barcode into a single FASTQ file and then renames that file after its sample for easy identification and downstream analysis.

- **barcode-rename.csv**: A CSV file with the columns `sample_name` and `barcode` that maps each barcode number
(e.g. `01` for the folder `barcode01`) to its corresponding sample identifier.
- **barcode_dir**: The directory path where the barcode-specific folders are located.
- **output_dir**: The directory path where the renamed FASTQ files will be saved.

To run the tool, use the following command:
```sh
snakemake --config barcode_dir=path/to/barcode/folders output_dir=data/date/ --cores all --use-conda barcode_rename
```

### Sample sheet

The sample sheet contains all samples to be analyzed by UnCoVar. UnCoVar offers
Expand Down
12 changes: 12 additions & 0 deletions workflow/rules/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,15 @@ rule update_sample:
"../envs/python.yaml"
script:
"../scripts/update-sample-sheet.py"


# Runs ../scripts/barcode_rename.py with the barcode-to-sample table and the
# directory given by config["barcode_dir"] as inputs.
# NOTE(review): no `output:` is declared here; the documented invocation
# requests this rule by name as the target.
# NOTE(review): a log file is declared, but the script shown in this change
# does not reference snakemake.log -- confirm whether it should write there.
rule barcode_rename:
    input:
        # CSV with the columns `sample_name` and `barcode`
        barcodes="barcode-rename.csv",
        # directory that is searched for `barcode<number>` sub-directories
        barcode_dir=config["barcode_dir"],
    log:
        "logs/barcode_rename.txt",
    conda:
        "../envs/python.yaml"
    script:
        "../scripts/barcode_rename.py"
82 changes: 82 additions & 0 deletions workflow/scripts/barcode_rename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import shutil
import pandas as pd
import gzip
import subprocess
import sys
import os


def get_barcode_dirs(source_directory, barcode_numbers):
    """Return the existing ``barcode<NN>`` directories below *source_directory*.

    Args:
        source_directory: Directory that holds one sub-directory per barcode.
        barcode_numbers: Iterable of barcode numbers (strings or integers).
            Values shorter than two characters are zero-padded, so ``1`` and
            ``"01"`` both refer to ``barcode01``.

    Returns:
        The paths of the barcode directories that exist, in input order.
        Barcodes without a directory are reported on stdout and skipped.
    """
    barcode_dirs = []
    for barcode_number in barcode_numbers:
        # Normalise the number so that "1" and "01" hit the same folder name.
        source_file = f"barcode{str(barcode_number).strip().zfill(2)}"
        source_path = os.path.join(source_directory, source_file)

        # Only real directories qualify; a plain file of that name cannot
        # contain FASTQ files.
        if os.path.isdir(source_path):
            barcode_dirs.append(source_path)
        else:
            print(f"Directory '{source_path}' does not exist.")

    return barcode_dirs


def concatenate_fastq(bc_directory, outfile):
    """Merge all ``*.fastq.gz`` files of one barcode into a plain FASTQ file.

    The files are decompressed in sorted order and written to
    ``<outfile>/<barcode directory name>_all.fastq``. Decompression happens
    in Python, so paths with spaces or shell metacharacters are safe and no
    external ``zcat`` is required.

    Args:
        bc_directory: Directory containing the gzipped FASTQ files.
        outfile: Directory in which the merged FASTQ file is created.

    Returns:
        Path of the merged file, or ``None`` when the directory holds no
        ``*.fastq.gz`` file (nothing is written in that case).
    """
    import glob  # local import: glob is not among the module-level imports

    input_files = os.path.join(bc_directory, "*.fastq.gz")
    print(input_files)
    # normpath drops a trailing separator so the directory name is never "".
    barcode_name = os.path.basename(os.path.normpath(bc_directory))
    out_file = f"{os.path.join(outfile, barcode_name)}_all.fastq"
    print(out_file)

    # Escape the directory part so brackets in it are not read as a pattern.
    pattern = os.path.join(glob.escape(bc_directory), "*.fastq.gz")
    fastq_paths = sorted(glob.glob(pattern))
    if not fastq_paths:
        print(f"No files matching '{input_files}' found.")
        return None

    with open(out_file, "wb") as merged:
        for fastq_path in fastq_paths:
            with gzip.open(fastq_path, "rb") as reads:
                shutil.copyfileobj(reads, merged)
    return out_file


def run_sample_prep(source_directory, barcode_numbers, outfile):
    """Concatenate the FASTQ files of every barcode directory that is found.

    Looks up the barcode directories via ``get_barcode_dirs``, prints each
    one and hands it to ``concatenate_fastq`` together with *outfile*.
    """
    for barcode_directory in get_barcode_dirs(source_directory, barcode_numbers):
        print(barcode_directory)
        concatenate_fastq(barcode_directory, outfile)


def rename_files(final_dir, barcode_csv_path=None):
    """Rename merged ``barcode<NN>_all.fastq`` files to ``<sample_name>.fastq``.

    Only files that follow the ``barcode<digits>_all.fastq`` naming scheme
    are touched; anything else in *final_dir* (for example files renamed by
    an earlier run) is left alone. Files whose barcode has no entry in the
    CSV are reported on stdout and skipped.

    Args:
        final_dir: Directory holding the merged FASTQ files. A trailing path
            separator is optional.
        barcode_csv_path: CSV file with the columns ``sample_name`` and
            ``barcode``. Defaults to the module-level ``barcode_csv``.
    """
    if barcode_csv_path is None:
        barcode_csv_path = barcode_csv

    renames = pd.read_csv(barcode_csv_path)
    # Barcodes are parsed as numbers here, so "01" in the CSV becomes 1.
    rename_dict = dict(zip(renames["barcode"], renames["sample_name"]))

    print(rename_dict)

    prefix, suffix = "barcode", "_all.fastq"
    files = os.listdir(final_dir)
    for file in files:
        num = file[len(prefix):-len(suffix)]
        is_merged_fastq = (
            file.startswith(prefix)
            and file.endswith(suffix)
            and num.isascii()
            and num.isdigit()
        )
        if not is_merged_fastq:
            continue

        print(num)
        if int(num) not in rename_dict:
            print(f"No sample name found for barcode '{num}'; keeping '{file}'.")
            continue

        source = os.path.join(final_dir, file)
        target = os.path.join(final_dir, str(rename_dict[int(num)]) + ".fastq")
        print(source + " " + target)
        os.rename(source, target)
    print(files)


# Script body: the `snakemake` object is provided by Snakemake's `script:`
# directive and carries the rule's inputs and the workflow configuration.
config = snakemake.config

barcode_csv = str(snakemake.input.barcodes)
source_path = str(snakemake.input.barcode_dir)
# Joining with "" guarantees a trailing separator, which rename_files relies
# on when it builds paths by string concatenation.
out_dir = os.path.join(str(config["output_dir"]), "")

# Without the barcode folders there is nothing to merge, so stop here instead
# of continuing with an empty set of inputs.
if not os.path.isdir(source_path):
    raise FileNotFoundError(f"Source directory '{source_path}' not found.")

# Create the destination directory if it does not exist yet
os.makedirs(out_dir, exist_ok=True)

# Read the barcodes as strings so that leading zeros ("01") are preserved
barcode_csv_ = pd.read_csv(barcode_csv, dtype={"barcode": str})
used_barcodes = barcode_csv_["barcode"]

run_sample_prep(source_path, used_barcodes, out_dir)
rename_files(out_dir)
Loading