mlcb2024.bib

% Volume-level record for the MLCB 2024 proceedings.
% Fields such as name, shortname, start, end, published, conference_number and
% conference_url are not standard BibTeX fields; standard styles ignore them.
@Proceedings{MLCB2024,
 booktitle         = {Proceedings of the 19th Machine Learning in Computational Biology meeting},
 name              = {Machine Learning in Computational Biology},
 shortname         = {MLCB},
 editor            = {Knowles, David A and Mostafavi, Sara},
 year              = {2024},
 address           = {Seattle, WA, USA},
 start             = {2024-09-05},
 end               = {2024-09-06},
 published         = {2024-11-17},
 conference_number = {19},
 conference_url    = {https://mlcb.github.io/}
}

@InProceedings{Zhao24,
 abstract = {The computational prediction and design of peptide binders targeting specific epitopes within disordered protein regions is crucial in biological and biomedical research, yet it remains challenging due to their highly dynamic nature and the scarcity of experimentally solved binding data. To address this problem, we built an unprecedentedly large-scale library of peptide pairs within stable secondary structures (beta sheets), leveraging newly available AlphaFold predicted structures. We then developed a machine learning method based on the Transformer architecture for the design of specific linear binders, in analogy to a language translation task. Our method, TransformerBeta, accurately predicts specific beta strand interactions and samples sequences with beta-sheet-like molecular properties, while capturing interpretable physico-chemical interaction patterns. As such, it can propose specific candidate binders targeting disordered regions for experimental validation to inform protein design.},
 author = {Zhao, Haowen and Bravi, Barbara and Aprile, Francesco},
 pages = {1--27},
 title = {Computational design of target-specific linear peptide binders with {TransformerBeta}}
}

% NOTE(review): the abstract contains a sentence fragment ("Using wavelet-based
% source separation ... attention mechanisms."); kept verbatim, confirm the
% intended wording against the published paper.
@InProceedings{Gadd24,
 abstract = {Changes in the number of copies of certain parts of the genome, known as copy number alterations (CNAs), due to somatic mutation processes are a hallmark of many cancers. This genomic complexity is known to be associated with poorer outcomes for patients but describing its contribution in detail has been difficult. Copy number alterations can affect large regions spanning whole chromosomes or the entire genome itself but can also be localised to only small segments of the genome and no methods exist that allow this multi-scale nature to be quantified. In this paper, we address this using Wave-LSTM, a signal decomposition approach designed to capture the multi-scale structure of complex whole genome copy number profiles. Using wavelet-based source separation in combination with deep learning-based attention mechanisms. We show that Wave-LSTM can be used to derive multi-scale representations from copy number profiles which can be used to decipher sub-clonal structures from single-cell copy number data and to improve survival prediction performance from patient tumour profiles.},
 author = {Gadd, Charles and Yau, Christopher},
 pages = {28--37},
 title = {{Wave-LSTM}: Multi-scale analysis of somatic whole genome copy number profiles}
}

@InProceedings{Nallapareddy24,
 abstract = {Translation elongation plays an important role in regulating protein concentrations in the cell, and dysregulation of this process has been linked to several human diseases. In this study, we use data from ribo-seq experiments to model ribosome densities, and in turn, predict the speed of translation. The proposed method, RiboGL, combines graph and recurrent neural networks to account for both graph and sequence-based features. The model takes a graph representing the secondary structure of the mRNA sequence as input, which incorporates both sequence and structural codon neighbors. In our experiments, RiboGL greatly outperforms the state-of-the-art RiboMIMO model for ribosome density prediction. We also conduct ablation studies to justify the design choices made in building the pipeline. Additionally, we use gradient-based interpretability to understand how the codon context and the structural neighbors affect the ribosome density at the A-site. By individually analyzing the genes in the dataset, we elucidate how structural neighbors could also potentially play a role in defining the ribosome density. Importantly, since these neighbors can be far away in the sequence, a recurrent model alone could not easily extract this information. This study lays the foundation for understanding how the mRNA secondary structure can be exploited for ribosome density prediction, and how in the future other graph modalities such as features from the nascent polypeptide can be used to further our understanding of translation in general.},
 author = {Nallapareddy, Mohan Vamsi and Craighero, Francesco and Gobet, C{\'e}dric and Naef, Felix and Vandergheynst, Pierre},
 pages = {38--52},
 title = {Towards improving full-length ribosome density prediction by bridging sequence and graph-based representations}
}

@InProceedings{Liu24,
 abstract = {Human gene interaction networks, commonly known as interactomes, encode genes' functional relationships, which are invaluable knowledge for translational medical research and the mechanistic understanding of complex human diseases. Advanced network embedding techniques have inspired recent efforts to identify novel human disease-associated genes using canonical interactome embeddings. However, a pivotal challenge persists as many complex diseases manifest in specific biological contexts, such as tissues or cell types, while many existing interactomes do not encapsulate such information. Here, we propose CONE (\url{https://github.com/krishnanlab/cone}), a versatile approach to generate context-specific embeddings from any context-free interactomes. The core component of CONE consists of a graph attention network with contextual conditioning, which is trained in a noise-contrastive fashion using contextualized interactome random walks localized around contextual genes. We demonstrate the strong performance of CONE embeddings in identifying disease-associated genes when using known associated biological contexts to the diseases. Furthermore, our approach offers new insights into the biological contexts associated with human diseases.},
 author = {Liu, Renming and Yuan, Hao and Johnson, Kayla and Krishnan, Arjun},
 pages = {53--71},
 title = {{CONE}: {COntext-specific} Network Embedding via Contextualized Graph Attention}
}

@InProceedings{Zhang24,
 abstract = {Network inference, the task of reconstructing interactions in a complex system from experimental observables, is a central yet extremely challenging problem in systems biology. While much progress has been made in the last two decades, network inference remains an open problem. For systems observed at steady state, limited insights are available since temporal information is unavailable and thus causal information is lost. Two common avenues for gaining causal insights into system behaviour are to leverage temporal dynamics in the form of trajectories, and to apply interventions such as knock-out perturbations. We propose an approach for leveraging both dynamical and perturbational single cell data to jointly learn cellular trajectories and power network inference. Our approach is motivated by min-entropy estimation for stochastic dynamics and can infer directed and signed networks from time-stamped single cell snapshots.},
 author = {Zhang, Stephen Y},
 pages = {72--85},
 title = {Joint trajectory and network inference via reference fitting}
}

% NOTE(review): the preceding entry ends at page 85 and this one starts at 106;
% no entry visible here covers pages 86--105 -- confirm nothing is missing.
@InProceedings{Hermann24,
 abstract = {Pretrained protein language models are becoming increasingly popular as a backbone for protein property inference tasks such as structure prediction or function annotation, accelerating biological research. However, related research oftentimes does not consider the effects of data leakage from pretraining on the actual downstream task, resulting in potentially unrealistic performance estimates. Reported generalization might not necessarily be reproducible for proteins highly dissimilar from the pretraining set. In this work, we measure the effects of data leakage from protein language model pretraining in the domain of protein thermostability prediction. Specifically, we compare two different dataset split strategies: a pretraining-aware split, designed to avoid similarity between pretraining data and the held-out test sets, and a commonly-used naive split, relying on clustering the training data for a downstream task without taking the pretraining data into account. Our experiments suggest that data leakage from language model pretraining shows consistent effects on melting point prediction across all experiments, distorting the measured performance by an average 11.1\% compared to the pretraining-aware split. The source code and our dataset splits are available at \url{https://gitlab.com/dacs-hpi/pretraining-aware-hotprot}.},
 author = {Hermann, Leon and Fiedler, Tobias and Nguyen, Hoang An and Nowicka, Melania and Bartoszewicz, Jakub M},
 pages = {106--116},
 title = {Beware of Data Leakage from Protein {LLM} Pretraining}
}

@InProceedings{Hariri24,
 abstract = {Polyamides, or peptides and proteins, are biomolecules that exist in a broad spectrum of size, structure, and function. Both structure and function are defined by the underlying sequence of amino acids, causing the polyamide to take three-dimensional conformations when in solution. Despite significant efforts and advances in function and conformation prediction, there remains a critical need for computational methods to accurately infer protein function from sequence and structure. Recent advancements in deep learning, particularly Graph Neural Networks, have shown promise in learning the sequence and structure of proteins. However, they fail to capture essential long-range dependencies inherent in the complex and dynamic three-dimensional structures of proteins, leading to issues including oversquashing and oversmoothing. Here, we explore solutions to the challenge of capturing long-range dependencies in graph representations of polyamides, focusing on latent nodes and graph rewiring techniques. While graph rewiring enhances information flow between distant nodes, latent nodes enable the concentration of global information. In addition, we investigate the effectiveness of ChebNet, a spectral backbone, in capturing long-range dependencies. Our unified framework combines these approaches to address the limitations of current methods, offering insights into protein function and regulation. Through experimental analysis, we demonstrate the efficacy of our proposed methods in capturing long-range dependencies.},
 author = {Hariri, Ali and Vandergheynst, Pierre},
 pages = {117--128},
 title = {Graph learning for capturing long-range dependencies in protein structures}
}

% NOTE(review): the abstract ends with "available at this GitHub repository" but
% no URL is present in this entry (likely a hyperlink lost on export) -- confirm
% and add the repository URL if known.
@InProceedings{Treyde24,
 abstract = {Predicting a ligand's bound pose to a target protein is a key component of early-stage computational drug discovery. Recent developments in machine learning methods have focused on improving pose quality at the cost of model runtime. For high-throughput virtual screening applications, this exposes a capability gap that can be filled by moderately accurate but fast pose prediction. To this end, we developed QUICKBIND, a light-weight pose prediction algorithm. We assess QUICKBIND on widely used benchmarks and find that it provides an attractive trade-off between model accuracy and runtime. To facilitate virtual screening applications, we augment QUICKBIND with a binding affinity module and demonstrate its capabilities for multiple clinically-relevant drug targets. Finally, we investigate the mechanistic basis by which QUICKBIND makes predictions and find that it has learned key physicochemical properties of molecular docking, providing new insights into how machine learning models generate protein-ligand poses. By virtue of its simplicity, QUICKBIND can serve as both an effective virtual screening tool and a minimal test bed for exploring new model architectures and innovations. Model code and weights are available at this GitHub repository.},
 author = {Treyde, Wojtek and Kim, Seohyun Chris and Bouatta, Nazim and AlQuraishi, Mohammed},
 pages = {129--152},
 title = {{QuickBind}: A Light-Weight And Interpretable Molecular Docking Model}
}

@InProceedings{Dip24,
 abstract = {Pathogen identification is pivotal in diagnosing, treating, and preventing diseases, crucial for controlling infections, and safeguarding public health. Traditional alignment-based methods, though widely used, are computationally intense and reliant on extensive reference databases, often failing to detect novel pathogens due to their low sensitivity and specificity. Similarly, conventional machine learning techniques, while promising, require large annotated datasets and extensive feature engineering and are prone to overfitting. Addressing these challenges, we introduce PathoLM, a cutting-edge pathogen language model optimized for the identification of pathogenicity in bacterial and viral sequences. Leveraging the strengths of pre-trained DNA models such as the Nucleotide Transformer, PathoLM requires minimal data for fine-tuning, thereby enhancing pathogen detection capabilities. It effectively captures a broader genomic context, significantly improving the identification of novel and divergent pathogens. We developed a comprehensive data set comprising approximately 30 species of viruses and bacteria, including ESKAPEE pathogens, and seven notably virulent bacterial strains resistant to antibiotics. Additionally, we curated a species classification dataset centered specifically on the ESKAPEE group. In comparative assessments, PathoLM dramatically outperforms existing models like DciPatho, demonstrating robust zero-shot and few-shot capabilities. Furthermore, we expanded PathoLM-Sp for ESKAPEE species classification, where it showed superior performance compared to other advanced deep learning methods, despite the complexities of the task.},
 author = {Dip, Sajib Acharjee},
 pages = {153--161},
 title = {{PathoLM}: Identifying pathogenicity from the {DNA} sequence through the Genome Foundation Model}
}

@InProceedings{Macaulay24,
 abstract = {Genetic, molecular, and environmental factors influence diseases through complex interactions with genes, phenotypes, and drugs. Current methods often fail to integrate diverse multi-relational biological data meaningfully, limiting the discovery of novel risk genes and drugs. To address this, we present MedGraphNet, a multi-relational Graph Neural Network (GNN) model designed to infer relationships among drugs, genes, diseases, and phenotypes. MedGraphNet initializes nodes using informative embeddings from existing text knowledge, allowing for robust integration of various data types and improved generalizability. Our results demonstrate that MedGraphNet matches and often outperforms traditional single-relation approaches, particularly in scenarios with isolated or sparsely connected nodes. The model shows generalizability to external datasets, achieving high accuracy in identifying disease-gene associations and drug-phenotype relationships. Notably, MedGraphNet accurately inferred drug side effects without direct training on such data. Using Alzheimer's disease as a case study, MedGraphNet successfully identified relevant phenotypes, genes, and drugs, corroborated by existing literature. These findings demonstrate the potential of integrating multi-relational data with text knowledge to enhance biomedical predictions and drug repurposing for diseases.},
 author = {Macaulay, Oladimeji S and Servilla, Michael and Virupakshappa, Kushal and Arredondo, David and Hu, Yue and Tafoya, Luis and Zhang, Yanfu and Sahu, Avinash},
 pages = {162--182},
 title = {{MedGraphNet}: Leveraging Multi-Relational Graph Neural Networks and Text Knowledge for Biomedical Predictions}
}