Skip to content

Commit

Permalink
Merge pull request #462 from kedhammar/aviti-manifest-tweak-idx-dista…
Browse files Browse the repository at this point in the history
…nces

Improve building AVITI submanifests
  • Loading branch information
kedhammar authored Feb 6, 2025
2 parents c1c8508 + 9fde223 commit 965c864
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 10 deletions.
4 changes: 4 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# TACA Version Log

## 20250205.1

Add feature to AVITI submanifest generation to tweak index mismatch thresholds if necessary.

## 20250128.1

Replace PR labels action
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ flowcell_parser @ git+https://github.com/SciLifeLab/flowcell_parser
pandas
python_crontab
python_dateutil
python_levenshtein
requests
setuptools
83 changes: 73 additions & 10 deletions taca/element/Element_Runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pathlib import Path

import pandas as pd
from Levenshtein import distance

from taca.utils.filesystem import chdir
from taca.utils.statusdb import ElementRunsConnection
Expand Down Expand Up @@ -118,6 +119,52 @@ def get_mask(
return mask


def get_custom_mistmatch_thresholds(df: pd.DataFrame) -> tuple[int, int]:
    """Determine the index mismatch thresholds for an AVITI manifest dataframe.

    For a dataframe containing the columns 'Lane', 'Index1' and 'Index2',
    compute the Levenshtein distance between the index sequences of every
    pair of samples sharing a lane. Samples in different lanes are never
    compared to each other.

    Each threshold starts at the default of 1 and is reduced to 0 if the
    smallest observed distance for that index is 2 or lower.

    If no lane contains at least two samples (e.g. a single-sample manifest
    or an empty dataframe) there is nothing to compare, and the defaults are
    returned unchanged.

    Args:
        df: Manifest sample rows with the columns 'Lane', 'Index1' and
            'Index2'. The dataframe is not modified.

    Returns:
        A tuple (i1MismatchThreshold, i2MismatchThreshold).

    Raises:
        AssertionError: If two samples in the same lane have a combined
            Index1 + Index2 distance of 0, i.e. identical index pairs.
    """
    # Defaults, according to Element documentation
    i1MismatchThreshold = 1
    i2MismatchThreshold = 1

    # Collect distances between all sample pairings on index and index-pair level
    idx1_dists = []
    idx2_dists = []
    total_dists = []

    # Iterate across all sample pairings per lane
    for lane in df["Lane"].unique():
        df_lane = df[df["Lane"] == lane]
        # Plain lists give positional access regardless of the dataframe index
        idx1_seqs = df_lane["Index1"].tolist()
        idx2_seqs = df_lane["Index2"].tolist()

        for i in range(len(idx1_seqs)):
            for j in range(i + 1, len(idx1_seqs)):
                idx1_dist = distance(idx1_seqs[i], idx1_seqs[j])
                idx2_dist = distance(idx2_seqs[i], idx2_seqs[j])

                idx1_dists.append(idx1_dist)
                idx2_dists.append(idx2_dist)
                total_dists.append(idx1_dist + idx2_dist)

    # Without any same-lane pairings there are no distances to evaluate;
    # calling min() on the empty lists would raise ValueError.
    if not total_dists:
        return (i1MismatchThreshold, i2MismatchThreshold)

    if min(total_dists) == 0:
        raise AssertionError("Total index distance of 0 detected.")
    if min(idx1_dists) <= 2:
        logging.warning(
            "Minimum distance between Index1 sequences is at or below 2. Reducing allowed mismatches from 1 to 0."
        )
        i1MismatchThreshold = 0
    if min(idx2_dists) <= 2:
        logging.warning(
            "Minimum distance between Index2 sequences is at or below 2. Reducing allowed mismatches from 1 to 0."
        )
        i2MismatchThreshold = 0

    return (i1MismatchThreshold, i2MismatchThreshold)


class Run:
"""Defines an Element run"""

Expand Down Expand Up @@ -522,6 +569,7 @@ def make_demux_manifests(
# Break down into groups by non-consolable properties
grouped_df = df_samples.groupby(
[
"Lane",
"I1Mask",
"I2Mask",
"I1UmiMask",
Expand All @@ -534,25 +582,31 @@ def make_demux_manifests(

# Sanity check
if sum([len(group) for _, group in grouped_df]) < len(df_samples):
msg = "Some samples were not included in any submanifest."
msg = "Some manifest sample rows were not included in any submanifest."
logging.error(msg)
raise AssertionError(msg)
elif sum([len(group) for _, group in grouped_df]) > len(df_samples):
logging.warning("Some samples were included in multiple submanifests.")
msg = "Some manifest sample rows were included in multiple submanifests."
logging.error(msg)
raise AssertionError(msg)

# Iterate over groups to build composite manifests
manifest_root_name = f"{self.NGI_run_id}_demux"
manifests = []
n = 0
for (
I1Mask,
I2Mask,
I1UmiMask,
I2UmiMask,
R1Mask,
R2Mask,
settings,
), group in grouped_df:
(
Lane, # Different lanes MAY need different settings, i.e. index mismatch thresholds
I1Mask,
I2Mask,
I1UmiMask,
I2UmiMask,
R1Mask,
R2Mask,
settings,
),
group,
) in grouped_df:
file_name = f"{manifest_root_name}_{n}.csv"

runValues_section = "\n".join(
Expand Down Expand Up @@ -585,10 +639,19 @@ def make_demux_manifests(
else:
raise AssertionError("Both I1 and I2 appear to contain UMIs.")

# Add mismatch threshold settings
i1_mm_threshold, i2_mm_threshold = get_custom_mistmatch_thresholds(group)
settings_kvs["I1MismatchThreshold"] = str(i1_mm_threshold)
settings_kvs["I2MismatchThreshold"] = str(i2_mm_threshold)

# Unpack settings from LIMS manifest
if settings:
for kv in settings.split(" "):
k, v = kv.split(":")
if k in settings_kvs and settings_kvs[k] != v:
logging.warning(
f"Overwriting TACA submanifest setting {k}={settings_kvs[k]} with LIMS setting {v}"
)
settings_kvs[k] = v

settings_section = "\n".join(
Expand Down

0 comments on commit 965c864

Please sign in to comment.