add folders

scholl-lab · Sep 6, 2023 · 16e3773 · 16e3773
1 parent 3600529
commit 16e3773
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 1 deletion.
diff --git a/analyses/alignment/README.md b/analyses/alignment/README.md
@@ -1,4 +1,4 @@
-# Code for alignment and bam reprocessing
+# Code for alignment and BAM reprocessing
 
 ## calculate all md5 checksums in input folder and subfolders
 ```bash

diff --git a/analyses/alignment/config.yaml b/analyses/alignment/config.yaml
@@ -3,6 +3,7 @@ aligned_folder: results/aligned/
 output_folder: results
 final_bam_folder: results/bqsr
 reference: 'analysis/ref/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz'
+reference_unpacked: 'analysis/ref/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'
 panel_of_normals: 'analysis/GATK_resource_bundle/1000g_pon.hg38.vcf.gz'
 af_only_gnomad: 'analysis/GATK_resource_bundle/af-only-gnomad.hg38.vcf.gz'
 reference_version: 'GRCh38'
diff --git a/files/README.md b/files/README.md
@@ -0,0 +1,76 @@
+# Instructions on how to generate metadata for the samples in a project
+
+## 1) Alignment metadata for the genomes project part
+
+# TODO: add instructions for genomes project part
+
+
+## 2) Alignment metadata for the exomes project part
+```
+# command to list all R1 FASTQ files in a folder
+# remove _R1_001.fastq.gz from file names using sed
+# remove the download/ folder name using sed
+find download/ -type f -follow -print|xargs ls | grep "_R1_001.fastq.gz$" | sed 's/_R1_001.fastq.gz//g' | sed 's/download\///g' > fastq_files.txt
+
+# cat the file and pipe to:
+# split pipe input by "/" or "_" using awk and print all fields
+cat fastq_files.txt | awk -F"[/_]" -v OFS="\t" '{for(i=1;i<=NF;i++) printf $i"\t"; print ""}' > fastq_parts.txt
+
+# add the columns from fastq_parts.txt to the fastq_files.txt
+# using paste
+# and remove the subfolder column using sed
+paste fastq_files.txt fastq_parts.txt | sed 's/^.*\///g' > fastq_files_with_parts.txt
+
+# change potential splits that resulted in different number of columns using sed
+# then merge the columns to generate the columns project_sample, sample_sheet_number and lane using awk
+# ! Note: this is specific for each project
+cat fastq_files_with_parts.txt | sed 's/T\tS\t/TS\t/g' | awk -F"\t" -v OFS="\t" '{print $1, $2, $3, $4"-"$5, $6, $7}' > fastq_files_with_parts_merged.txt
+
+# add header using echo
+echo -e "fastq_files_basename\tsubfolder\tmdc_project\tproject_sample\tsample_sheet_number\tlane" | cat - fastq_files_with_parts_merged.txt > metadata.tsv
+
+# remove intermediate files
+rm fastq_files.txt fastq_parts.txt fastq_files_with_parts.txt fastq_files_with_parts_merged.txt
+```
+
+
+## 3) Calling metadata for the genomes project part
+```
+# find all files in a directory and its subdirectories
+# remove the subfolder path from the file names
+# remove the file extension preserving all file names
+find results/bqsr/ -type f -name "*.merged.dedup.bqsr.bam" | sed 's/results\/bqsr\///g' | sed 's/.merged.dedup.bqsr.bam//g' > final_bams.txt
+
+# generate a list of all possible combinations of lines in a file
+awk -v OFS="\t" 'NR==FNR { a[$0]; next } { for (i in a) print i, $0 }' final_bams.txt final_bams.txt > final_bams_combinations.txt
+
+# use awk to generate the sample names for the tumor and normal samples
+# by removing everything up to the last underscore ("_") from the first and second columns
+cat final_bams_combinations.txt | awk -F"[\t_]" -v OFS="\t" '{print $4, $8}' > sample_names_combinations.txt
+
+# use awk to generate the analysis from sample type
+# by removing everything up to the minus ("-") from the first and second columns of sample_names_combinations.txt
+# keep both sample names (columns 1 and 3)
+# for columns 2 and 4 add "vs" in between
+cat sample_names_combinations.txt | awk -F"[\t-]" -v OFS="\t" '{print $1, $3, $2"vs"$4}' > analysis_combinations.txt
+
+# combine the columns from sample_names_combinations.txt, analysis_combinations.txt and final_bams_combinations.txt
+# using paste
+# filter to remove lines containing "NvsN", "NvsF", "NvsFN", "FvsF", "NFvsNF"
+# using grep -v
+# sort the resulting lines using sort
+# keep only lines where both samples belong to the same individual (columns 5 and 6 are equal)
+# using awk
+paste sample_names_combinations.txt final_bams_combinations.txt analysis_combinations.txt | grep -vP "\tNvsN" | grep -vP "\tNvsF" | grep -vP "\tNvsFN" | grep -vP "\tFvsF" | grep -vP "\tNFvsNF" | sort | awk -F"\t" -v OFS="\t" '{if ($5 == $6) print $0}' > final_bams_combinations_merged.txt
+
+# add header using echo
+echo -e "sample1\tsample2\tbam1_file_basename\tbam2_file_basename\tindividual1\tindividual2\tanalysis" | cat - final_bams_combinations_merged.txt > calling_metadata.tsv
+
+# remove intermediate files
+rm final_bams.txt final_bams_combinations.txt sample_names_combinations.txt analysis_combinations.txt final_bams_combinations_merged.txt
+```
+
+
+## 4) Calling metadata for the exomes project part
+
+# TODO: add instructions for exomes project part
diff --git a/files/analysis/README.md b/files/analysis/README.md
@@ -0,0 +1 @@
+# Instructions for downloading and preparing the files needed for analysis
diff --git a/files/download/README.md b/files/download/README.md
@@ -0,0 +1 @@
+# Instructions to download run files
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# Instructions for downloading and preparing the files needed for analysis