Skip to content

Commit

Permalink
Merge pull request #37 from bcgsc/bugfix/DEVSU-2477-filter-gkb_matches
Browse files Browse the repository at this point in the history
Bugfix/devsu 2477 filter gkb matches
  • Loading branch information
elewis2 authored Nov 8, 2024
2 parents e4689df + 2832eb5 commit 0efbfaa
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 4 deletions.
74 changes: 74 additions & 0 deletions pori_python/ipr/ipr.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,3 +388,77 @@ def germline_kb_matches(
ret_list.append(alt) # alteration not in any specific keys matches to check.

return ret_list


def multi_variant_filtering(
    graphkb_conn: GraphKBConnection,
    gkb_matches: List[KbMatch],
    excludedTypes: List[str] = ['wildtype'],
) -> List[KbMatch]:
    """Filter out GraphKB matches that don't match all required variants on multi-variant statements.

    DEVSU-2477
    GKB statements can be conditional on more than one variant, with an implicit 'AND' operator.
    Since variants are matched only one at a time, a multi-variant statement gets matched as soon
    as one of its conditional variants matches the observed ones, effectively turning the 'AND'
    into an 'OR' between conditions. This function filters out those incomplete matches.

    Note: Wildtype variants are not taken into account at the moment.

    Params:
        graphkb_conn: the graphkb connection object
        gkb_matches: KbMatch statements to be filtered
        excludedTypes: list of variant type terms to exclude from filtering. Defaults to wildtype
    Returns:
        filtered list of KbMatch statements
    """
    # Nothing to filter; also avoids POSTing a Statement query with an empty 'IN' filter.
    if not gkb_matches:
        return []

    # All matching statements & variants (GKB RIDs)
    matching_statement_rids = {match['kbStatementId'] for match in gkb_matches}
    matching_variant_rids = {match['kbVariantId'] for match in gkb_matches}

    # Get condition details on all matching statements
    res = graphkb_conn.post(
        uri="query",
        data={
            "target": "Statement",
            "filters": {
                "@rid": list(matching_statement_rids),
                "operator": 'IN',
            },
            "history": True,
            "returnProperties": [
                "@rid",
                "conditions.@rid",
                "conditions.@class",
                "conditions.type",
            ],
        },
    )
    statements = res['result']

    # Set of excluded Vocabulary RIDs for variant types.
    # An empty list (or a single empty-string sentinel) disables the exclusion entirely.
    excluded = set()
    if excludedTypes and excludedTypes[0] != '':
        excluded = gkb_vocab.get_terms_set(graphkb_conn, excludedTypes)

    # Map each statement to its conditional variants
    # (discarding non-variant conditions & variant conditions of excluded types)
    statement_to_variants = {}
    for statement in statements:
        statement_to_variants[statement['@rid']] = {
            el['@rid']
            for el in statement['conditions']
            if (el['@class'] in VARIANT_CLASSES and el.get('type', '') not in excluded)
        }

    # Statements for which every required (non-excluded) conditional variant was observed
    complete_matching_statements = {
        statementRid
        for statementRid, variantRids in statement_to_variants.items()
        if variantRids.issubset(matching_variant_rids)
    }

    # Filter out incomplete matches from gkb_matches
    return [
        match for match in gkb_matches if match['kbStatementId'] in complete_matching_statements
    ]
32 changes: 30 additions & 2 deletions pori_python/ipr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
create_key_alterations,
filter_structural_variants,
germline_kb_matches,
multi_variant_filtering,
select_expression_plots,
)
from .summary import auto_analyst_comments
Expand Down Expand Up @@ -246,6 +247,7 @@ def ipr_report(
custom_kb_match_filter=None,
async_upload: bool = False,
mins_to_wait: int = 5,
multi_variant_filter: bool = True,
) -> Dict:
"""Run the matching and create the report JSON for upload to IPR.
Expand All @@ -269,6 +271,7 @@ def ipr_report(
custom_kb_match_filter: function(List[kbMatch]) -> List[kbMatch]
async_upload: use report_async endpoint to upload reports
mins_to_wait: if using report_async, number of minutes to wait for success before exception raised
multi_variant_filter: filters out matches that don't match all required variants on multi-variant statements
Returns:
ipr_conn.upload_report return dictionary
Expand Down Expand Up @@ -300,10 +303,11 @@ def ipr_report(
small_mutations, expression_variants, copy_variants, structural_variants
)

# Setup connections
# IPR CONNECTION
ipr_conn = IprConnection(username, password, ipr_url)
ipr_spec = ipr_conn.get_spec()

# GKB CONNECTION
if graphkb_url:
logger.info(f"connecting to graphkb: {graphkb_url}")
graphkb_conn = GraphKBConnection(graphkb_url)
Expand All @@ -315,9 +319,10 @@ def ipr_report(

graphkb_conn.login(gkb_user, gkb_pass)

# GKB MATCHING
gkb_matches: List[Hashabledict] = []

# Signature category variants
# MATCHING TMB
tmb_variant: IprVariant = {} # type: ignore
tmb_matches = []
if "tmburMutationBurden" in content.keys():
Expand Down Expand Up @@ -351,6 +356,7 @@ def ipr_report(
gkb_matches.extend([Hashabledict(tmb_statement) for tmb_statement in tmb_matches])
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

# MATCHING MSI
msi = content.get("msi", [])
msi_matches = []
msi_variant: IprVariant = {} # type: ignore
Expand All @@ -374,6 +380,7 @@ def ipr_report(
gkb_matches.extend([Hashabledict(msi) for msi in msi_matches])
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

# MATCHING SMALL MUTATIONS
logger.info(f"annotating {len(small_mutations)} small mutations")
gkb_matches.extend(
annotate_positional_variants(
Expand All @@ -382,6 +389,7 @@ def ipr_report(
)
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

# MATCHING STRUCTURAL VARIANTS
logger.info(f"annotating {len(structural_variants)} structural variants")
gkb_matches.extend(
annotate_positional_variants(
Expand All @@ -390,6 +398,7 @@ def ipr_report(
)
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

# MATCHING COPY VARIANTS
logger.info(f"annotating {len(copy_variants)} copy variants")
gkb_matches.extend(
[
Expand All @@ -401,6 +410,7 @@ def ipr_report(
)
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

# MATCHING EXPRESSION VARIANTS
logger.info(f"annotating {len(expression_variants)} expression variants")
gkb_matches.extend(
[
Expand All @@ -412,13 +422,15 @@ def ipr_report(
)
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

# ALL VARIANTS
all_variants: Sequence[IprVariant]
all_variants = expression_variants + copy_variants + structural_variants + small_mutations # type: ignore
if msi_matches:
all_variants.append(msi_variant) # type: ignore
if tmb_matches:
all_variants.append(tmb_variant) # type: ignore

# GKB_MATCHES FILTERING
if match_germline:
# verify germline kb statements matched germline observed variants, not somatic variants
org_len = len(gkb_matches)
Expand All @@ -434,17 +446,28 @@ def ipr_report(
gkb_matches = [Hashabledict(match) for match in custom_kb_match_filter(gkb_matches)]
logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants")

if multi_variant_filter:
logger.info(
f"Filtering out incomplete matches on multi-variant statements for {len(gkb_matches)} matches"
)
gkb_matches = multi_variant_filtering(graphkb_conn, gkb_matches)
logger.info(f"multi_variant_filtering left {len(gkb_matches)} matches")

# KEY ALTERATIONS
key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants)

# GENE INFORMATION
logger.info("fetching gene annotations")
gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants))

# THERAPEUTIC OPTIONS
if generate_therapeutics:
logger.info("generating therapeutic options")
targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants)
else:
targets = []

# ANALYST COMMENTS
logger.info("generating analyst comments")
if generate_comments:
comments = {
Expand All @@ -455,6 +478,7 @@ def ipr_report(
else:
comments = {"comments": ""}

# OUTPUT CONTENT
# thread safe deep-copy the original content
output = json.loads(json.dumps(content))
output.update(
Expand Down Expand Up @@ -491,6 +515,7 @@ def ipr_report(
ipr_result = None
upload_error = None

# UPLOAD TO IPR
if ipr_upload:
try:
logger.info(f"Uploading to IPR {ipr_conn.url}")
Expand All @@ -500,11 +525,14 @@ def ipr_report(
except Exception as err:
upload_error = err
logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True)

# SAVE TO JSON FILE
if output_json_path:
if always_write_output_json or not ipr_result:
logger.info(f"Writing IPR upload json to: {output_json_path}")
with open(output_json_path, "w") as fh:
fh.write(json.dumps(output))

logger.info(f"made {graphkb_conn.request_count} requests to graphkb")
logger.info(f"average load {int(graphkb_conn.load or 0)} req/s")
if upload_error:
Expand Down
47 changes: 45 additions & 2 deletions tests/test_ipr/test_ipr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

from pori_python.graphkb import statement as gkb_statement
from pori_python.graphkb import vocab as gkb_vocab
from pori_python.ipr.ipr import convert_statements_to_alterations, germline_kb_matches
from pori_python.ipr.ipr import (
convert_statements_to_alterations,
germline_kb_matches,
multi_variant_filtering,
)
from pori_python.types import Statement

DISEASE_RIDS = ["#138:12", "#138:13"]
Expand Down Expand Up @@ -142,6 +146,24 @@
},
]

# Fixture payload returned by the mocked graphkb_conn.post() for multi_variant_filtering().
# Mirrors the GraphKB /query Statement response shape: each statement carries its
# conditions (variant and non-variant records) with @class, @rid and optional type.
KB_MATCHES_STATEMENTS = [
    {
        # Statement conditional on TWO variants, both present in SOMATIC_KB_MATCHES:
        # a complete match regardless of excluded types.
        '@rid': SOMATIC_KB_MATCHES[0]['kbStatementId'],
        'conditions': [
            {'@class': 'PositionalVariant', '@rid': SOMATIC_KB_MATCHES[0]['kbVariantId']},
            {'@class': 'CategoryVariant', '@rid': SOMATIC_KB_MATCHES[1]['kbVariantId']},
            # Non-variant condition; must be ignored by the filtering logic.
            {'@class': 'Disease', '@rid': ''},
        ],
    },
    {
        # Statement conditional on one matched variant plus an UNOBSERVED variant ('157:0')
        # whose type ('#999:99') is the RID the get_terms_set fixture marks as excluded:
        # incomplete without exclusions, complete when the excluded type is discarded.
        '@rid': SOMATIC_KB_MATCHES[1]['kbStatementId'],
        'conditions': [
            {'@class': 'CategoryVariant', '@rid': SOMATIC_KB_MATCHES[1]['kbVariantId']},
            {'@class': 'PositionalVariant', '@rid': '157:0', 'type': '#999:99'},
        ],
    },
]


@pytest.fixture
def graphkb_conn():
Expand All @@ -157,10 +179,15 @@ def __call__(self, *args, **kwargs):
ret_val = self.return_values[self.index] if self.index < len(self.return_values) else []
return ret_val

class PostMock:
def __call__(self, *args, **kwargs):
# custom return tailored for multi_variant_filtering() testing
return {'result': KB_MATCHES_STATEMENTS}

def mock_get_source(source):
return {"@rid": 0}

conn = Mock(query=QueryMock(), cache={}, get_source=mock_get_source)
conn = Mock(query=QueryMock(), cache={}, get_source=mock_get_source, post=PostMock())

return conn

Expand Down Expand Up @@ -203,6 +230,14 @@ def mock_func(*pos, **kwargs):
monkeypatch.setattr(gkb_vocab, "get_term_tree", mock_func)


@pytest.fixture(autouse=True)
def get_terms_set(monkeypatch):
    """Stub gkb_vocab.get_terms_set so any term list resolves to one fixed RID set."""
    monkeypatch.setattr(gkb_vocab, "get_terms_set", lambda *args, **kwargs: {'#999:99'})


@pytest.fixture(autouse=True)
def mock_categorize_relevance(monkeypatch):
def mock_func(_, relevance_id):
Expand Down Expand Up @@ -336,3 +371,11 @@ def test_germline_kb_matches(self):
assert not germline_kb_matches(
SOMATIC_KB_MATCHES, GERMLINE_VARIANTS
), "Germline variant matched to KB somatic statement."

def test_multi_variant_filtering(self, graphkb_conn):
    """Multi-variant statements are kept only when all their conditions are satisfied."""
    # With no excluded types, the statement conditional on the unobserved
    # variant ('157:0') is dropped, leaving a single complete match.
    without_exclusions = multi_variant_filtering(graphkb_conn, SOMATIC_KB_MATCHES, [])
    assert len(without_exclusions) == 1, 'Incomplete matches filtered, without excluded types'
    # With the default exclusions, the '#999:99'-typed condition is discarded,
    # so both statements count as complete matches.
    with_default_exclusions = multi_variant_filtering(graphkb_conn, SOMATIC_KB_MATCHES)
    assert len(with_default_exclusions) == 2, 'Incomplete matches filtered, with default excluded types'

0 comments on commit 0efbfaa

Please sign in to comment.