diff --git a/CONTRIBUTORS.yaml b/CONTRIBUTORS.yaml index 7fff425e084e57..0c6a4654c80693 100644 --- a/CONTRIBUTORS.yaml +++ b/CONTRIBUTORS.yaml @@ -949,6 +949,12 @@ lldelisle: lat: 46.52 lon: 6.56 +lisanna: + name: Lisanna Paladin + joined: 2023-09 + email: lisanna.paladin@gmail.com + orcid: 0000-0003-0011-9397 + lleroi: name: Laura Leroi email: laura.leroi@ifremer.fr @@ -1276,6 +1282,12 @@ pauldg: elixir_node: be orcid: 0000-0002-8940-4946 +paulzierep: + name: Paul Zierep + email: zierep@informatik.uni-freiburg.de + orcid: 0000-0003-2982-388X + joined: 2023-02 + pavanvidem: name: Pavankumar Videm orcid: 0000-0002-5192-126X @@ -1704,14 +1716,6 @@ yvanlebras: joined: 2017-09 elixir_node: fr -paulzierep: - name: Paul Zierep - email: zierep@informatik.uni-freiburg.de - orcid: 0000-0003-2982-388X - joined: 2023-02 - - - # Funders / External Entities carpentries: @@ -1853,4 +1857,15 @@ ai4life: funding_system: cordis funding_statement: | AI4Life has received funding from the European Union’s Horizon Europe research and innovation programme under grant agreement number 101057970. 
- + +biont: + name: BioNT + github: false + joined: 2023-09 + avatar: "http://biont-training.eu/assets/img/BioNT_Logo_Layout_2106_blue_Version_2_cropped.png" + url: https://biont-training.eu/ + funder: true + funding_id: "101100604" + funding_system: cordis + funding_statement: | + Co-funded by the European Union diff --git a/assets/images/BioNT_Logo.png b/assets/images/BioNT_Logo.png new file mode 100644 index 00000000000000..e383b7c808911b Binary files /dev/null and b/assets/images/BioNT_Logo.png differ diff --git a/bin/schema-slides.yaml b/bin/schema-slides.yaml index b7ff07548192f1..292e5e49b789f9 100644 --- a/bin/schema-slides.yaml +++ b/bin/schema-slides.yaml @@ -191,6 +191,7 @@ mapping: - assets/images/gga.png - assets/images/ncbi.png - /assets/images/elixir-uk.png + - /assets/images/BioNT_Logo.png - topics/ai4life/images/AI4Life-logo_giraffe-nodes.png required: true description: | diff --git a/topics/data-science/tutorials/online-resources-gene/images/BLASTresults.png b/topics/data-science/tutorials/online-resources-gene/images/BLASTresults.png new file mode 100644 index 00000000000000..e5f93da4383e49 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/BLASTresults.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/GenomeDataViewerofgeneOPN1LW.png b/topics/data-science/tutorials/online-resources-gene/images/GenomeDataViewerofgeneOPN1LW.png new file mode 100644 index 00000000000000..1936643f955166 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/GenomeDataViewerofgeneOPN1LW.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/GenomeDataViewerpage.png b/topics/data-science/tutorials/online-resources-gene/images/GenomeDataViewerpage.png new file mode 100644 index 00000000000000..be07749e1447ad Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/GenomeDataViewerpage.png differ diff --git 
a/topics/data-science/tutorials/online-resources-gene/images/NIHresults.png b/topics/data-science/tutorials/online-resources-gene/images/NIHresults.png new file mode 100644 index 00000000000000..3c3fbe9561b516 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/NIHresults.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/Opsin1NIH.png b/topics/data-science/tutorials/online-resources-gene/images/Opsin1NIH.png new file mode 100644 index 00000000000000..75d7ffc4271fdf Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/Opsin1NIH.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/SequenceTextView.png b/topics/data-science/tutorials/online-resources-gene/images/SequenceTextView.png new file mode 100644 index 00000000000000..dc1ab647342e6b Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/SequenceTextView.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/UniProt.png b/topics/data-science/tutorials/online-resources-gene/images/UniProt.png new file mode 100644 index 00000000000000..182d71e32c0f39 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/UniProt.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/UniProt_proteins.png b/topics/data-science/tutorials/online-resources-gene/images/UniProt_proteins.png new file mode 100644 index 00000000000000..c4f4767c62c520 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/UniProt_proteins.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/images/dbSNPs.png b/topics/data-science/tutorials/online-resources-gene/images/dbSNPs.png new file mode 100644 index 00000000000000..49fcedb56a095d Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/dbSNPs.png differ diff --git 
a/topics/data-science/tutorials/online-resources-gene/images/merged-info.png b/topics/data-science/tutorials/online-resources-gene/images/merged-info.png new file mode 100644 index 00000000000000..064d1859d35079 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-gene/images/merged-info.png differ diff --git a/topics/data-science/tutorials/online-resources-gene/slides.html b/topics/data-science/tutorials/online-resources-gene/slides.html new file mode 100644 index 00000000000000..816e45528ada5d --- /dev/null +++ b/topics/data-science/tutorials/online-resources-gene/slides.html @@ -0,0 +1,582 @@ +--- +layout: tutorial_slides +logo: "/assets/images/BioNT_Logo.png" +title: "Bioinformatics Data Types and Databases" +video: true +questions: +- "What are some of the main resources to explore bioinformatics information?" +- "How is this information represented in file formats?" +- "What type of information do these file formats convey?" +objectives: +- "Understand that the biological data is multi-layered" +- "Identify multiple sources of information in biology" +- "Describe how these different types of information are conveyed through different file formats" +time_estimation: 1h +key_points: +- "Biological data is multi-layered. E.g. the information about one gene can actually regard multiple different biological entities: the variability of its sequence, the derived protein, the diseases associated etc." +- "Consequently, several different sources of information can be identified and used to describe a biological entity, as well as several different file formats." +contributions: + authorship: + - lisanna + funding: + - biont +--- + +# Background + +- Need: digitally store biological data +- All biological data could be (and initially was) included in simple text files +- Yet, significant limitations: + - Not structured, hence not programmatically accessible + - Impossible to distinguish data (e.g. gene sequence) from metadata (e.g. 
annotations about location, quality, function, etc.) + +??? + +In this presentation, we'll look into the history of biological data. +Initially, all types of data were handled using simple text files, but this quickly became limiting. +Indeed, unstructured text files are not programmatically accessible and in such files it is impossible to distinguish data from metadata. +It's important to understand these limitations as they set the stage for the development of more advanced storage methods. + +--- + +## Different information in different file formats + +- Over the years, different file formats have been developed to store different types of data with the relevant metadata fields +- E.g. for a biological sequence + - From the simplest, text-like file (FASTA) + - To more complex formats which include genomic features and quality annotation +- Different file formats exist not only to represent different levels of complexity but also different types of information +- E.g. about a protein + - From a text-like file to store the sequence (FASTA) + - To a tabular file to store the exact coordinates of each atom in the structure, hence convey the 3D arrangement + +??? + +As time progressed, the need for more structured and accessible data storage became apparent. +Various file formats were developed to accommodate different types of biological data. +We'll explore some of these formats, ranging from simple text-like files for storing sequences to more complex ones +that include annotations, 3D structures, and genomic features. + +--- + +## Different information in different databases + +- Consequently, different resources evolved not only to store, but also represent/visualise this varied information +- These resources often have a database storing data and a web interface that allows users to navigate it +- They usually represent different levels of complexity of one specific type of biological entity + - E.g. 
A database of protein sequences and their annotation (sequence variability, genomic location, effect of mutations, etc.) + - E.g. A database of protein structures and their annotation (3D coordinates, flexibility, methods used to resolve the structure, etc.) + +??? + +In parallel, different biological resources emerged, each designed to handle specific types of data and complexity. +These resources often consist of databases with associated web interfaces, enabling users to navigate and visualize the data effectively. + +--- + +### Definition of a biological database/resource + +- The [NAR Database Issue](https://www.oxfordjournals.org/nar/database/c/) collects publications of established databases in the field +- Collection of data (and metadata) in the related format + - structured + - searchable (indexed) + - updated periodically + - entries mapped to unique identifiers, and cross-referenced +- Includes associated software necessary for DB access, update, search, visualisation (web) + +??? + +Biological databases play a crucial role in housing and organizing biological data. +The NAR Database Issue collects publications about established databases in the field. +The requirements to be featured in this issue are a structured nature, searchability, regular updates, and cross-referencing capabilities. +These databases also offer software tools for accessing, updating, and visualizing the data they contain. 
+ +--- + +## Some history + +.pull-left[ +- 1953: 3D structure of DNA (Watson, Crick, Franklin, Wilkins) +- 1956: first protein sequence, insulin (51 AA) +- 1965: first whole nucleic acid sequence, tRNA from yeast +- 1966: Atlas of protein sequences and structures, by Margaret Dayhoff, printed book +- 1972: first complete protein-coding gene, coat protein from a bacteriophage +- 1976: same Lab, its complete genome +- 1971: Protein Data Bank (PDB) +- 1980-87: the European Molecular Biology Laboratory (EMBL) Nucleotide Sequence Database; GenBank from the National Center for Biotechnology Information (NCBI); and the DNA Data Bank of Japan (DDBJ) +- 1986: SwissProt was created by Rolf Apweiler +] + +.pull-right[ +![The dataset of PDB structures in 1973 included only 9 proteins illustrated in this image](http://cdn.rcsb.org/rcsb-pdb/v2/about-us/early.png) +] + +??? + +The source of information for this slide, which includes a short early history of biological data formats and databases evolution, is the paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4727787/ +Understanding the historical context of biological data storage helps us appreciate the progress made in the field. +- 1953: Watson and Crick famously solved the three-dimensional structure of DNA in 1953, working from crystallographic data produced by Rosalind Franklin and Maurice Wilkins +- 1956: Fred Sanger obtained the first protein sequence, of insulin (51 AA) +- 1965: Robert Holley and colleagues were able to produce the first whole nucleic acid sequence, that of alanine tRNA from Saccharomyces cerevisiae +- 1966: Atlas of protein sequences and structures, by Margaret Dayhoff, a printed book including multiple + > This book is a compilation of known protein sequences. The major ones listed are for cytochrome C and for hemoglobin alpha and beta chains. 
+- 1972: Walter Fiers' laboratory was able to produce the first complete protein-coding gene sequence in 1972, that of the coat protein of bacteriophage MS2 +- 1976: same Lab, its complete genome +- 1971: Protein Data Bank (PDB) +- 1980-87: the European Molecular Biology Laboratory (EMBL) Nucleotide Sequence Database; GenBank from the National Center for Biotechnology Information (NCBI); and the DNA Data Bank of Japan (DDBJ) +- 1986: SwissProt was created by Rolf Apweiler + +--- + +# Examples of biological databases + +- SwissProt + TrEMBL = UniProtKB +- PDB +- GenBank + +??? + +Prominent biological databases that have significantly contributed to our understanding of biological entities +are for example UniProtKB, PDB, and GenBank. We will discuss their importance and the types of data they store. + +--- + +### UniProtKB + +.pull-left[ +- Swiss-Prot: Manually curated / annotated Sequence Database +- TrEMBL: Database of EMBL nucleotide translated sequences, automatically annotated + +The two databases are merged into the UniProt Knowledge Base, including information of different types about proteins. +] + +.pull-right[ +![The UniProtKB, at the time of creation of these slides, includes 596793 manually curated entries (Reviewed) in Swiss-Prot and 248272897 Unreviewed entries in TrEMBL](./images/UniProt_proteins.png) +] + +??? + + +UniProtKB is a comprehensive resource that brings together data from both Swiss-Prot and TrEMBL databases. +We'll explore how these databases are merged to create a unified knowledge base about proteins, encompassing a wide array of information. + +--- + +### PDB + +.pull-left[ +The Protein Data Bank (PDB) is an archive of 3D structure data for biological molecules (proteins, DNA, RNA). + +Currently includes > 1TB of structure data, archived world-wide. 
+] + +.pull-right[ +![The wwPDB project maintains a single PDB archive distributed in the USA, Europe and Japan, and freely and publicly available to the global community](http://cdn.rcsb.org/rcsb-pdb/v2/about-us/wwpdb.png) +] + +??? + +The Protein Data Bank, or PDB, is a vital repository for 3D structure data of biological molecules. +We'll delve into the significance of PDB, its role in advancing structural biology, and the substantial volume of data it currently archives. + +--- + +### GenBank + +.pull-left[ +An annotated collection of all publicly available DNA sequences, which comprises the DNA DataBank of Japan (DDBJ), the European Nucleotide Archive (ENA), and GenBank at NCBI +] + +.pull-right[ +![A graph showing that both the number of GenBank sequences and the number of NCBI web users has been constantly growing from 1989 to 2019, reaching more than 200 millions sequences and 6 millions users.](https://www.researchgate.net/publication/343364994/figure/fig2/AS:919700666073090@1596285134479/Growth-of-GenBank-sequences-and-NCBI-web-users-through-2019-Figure-borrowed-from-the.png) +] + +??? + +GenBank stands as a critical resource for DNA sequences. It collaborates with other databases, +such as DDBJ and ENA, to provide a comprehensive collection of publicly available DNA sequences. + +--- + +## Biological knowledge + +.pull-left[ +Understanding about biological entities comes from crossing the information from/to these different resources and formats +] + +.pull-right[ +![New knowledge comes from merging and crossing different levels of information about a protein, the schema mentions: the sequence (plain, conservation), structure, genomic information (conservation, location, regulation), function.](./images/merged-info.png) +] + +??? + +An intricate web of information exists around biological entities, and understanding them involves merging +insights from various resources. 
A big part of some bioinformaticians' job is to integrate information from different +databases and formats to gain a holistic understanding of biological entities. + +--- + +## Features of biological databases + +- Data heterogeneity +- High volume of data +- Large scale data integration +- Data sharing / user visualisation and navigation +- Uncertainty / data quality measure needed +- Dynamic and subject to change + +??? + +Biological databases are characterized by a range of features that reflect the complexity of biological data. +Biological databases face the challenges of handling data heterogeneity, ensuring data quality, and accommodating the dynamic nature of biological information. + +--- + +## Possible classifications of biological databases + +.pull-left[ +- Data type +- Data access +- Data source +- ... +] + +-- + +.pull-right[ +**Data type** + +- Genome database +- Sequence database +- Structure database +- Pathway database +- Disease database +- ... +] + +??? + +Classifying biological databases helps us categorize and understand their diverse nature. There might be various +ways of classifying databases, such as by data type, data access, and data source. + +The world of biological data is rich with different file formats designed to accommodate diverse types of information, +including those for sequences, alignments, features/annotations, and protein structures. + + +--- + +## Possible classifications of biological databases + +.pull-left[ +- Data type +- Data access +- Data source +- ... +] + +.pull-right[ +**Data access** + +- Publicly available (browsing, downloading) +- Freely accessible and reusable under a license +- License open to certain usages (e.g. academic) +- Proprietary / commercial +- Restricted to certain people / institutions +- ... +] + +??? + + +--- + +## Possible classifications of biological databases + +.pull-left[ +- Data type +- Data access +- Data source +- ... 
+] + +.pull-right[ +**Data source** + +- Primary databases (GenBank, PDB) +- Secondary databases: analysed/aggregated results of the primary ones (UniProtKB) +- Composite database: non-redundant / filtered data (SwissProt) +- ... +] + +--- + +# Biological file formats + +- Sequence formats +- Alignment formats +- Features/annotations formats +- Structure formats + +??? + +In the following tutorials, we'll explore some of the most commonly used biological file formats in detail. +We'll provide examples and explanations for each format, helping you understand how they store and represent different types of biological data. + + +--- + +## Sequence formats + +**FASTA** + +File extensions: file.fa, file.fasta, file.fsa + +Example: + +```markdown +>XR_002086427.1 Candida albicans SC5314 uncharacterized ncRNA (SCR1), ncRNA + +TGGCTGTGATGGCTTTTAGCGGAAGCGCGCTGTTCGCGTACCTGCTGTTTGTTGAAAATTTAAGAGCAAAGTGTCCGGCTCGATCCCTGCGAATTGAATTCTGAACGCTAGAGTAATCAGTGTCTTTCAAGTTCTGGTAATGTTTAGCATAACCACTGGAGGGAAGCAATTCAGCACAGTAATGCTAATCGTGGTGGAGGCGAATCCGGATGGCACCTTGTTTGTTGATAAATAGTGCGGTATCTAGTGTTGCAACTCTATTTTT +``` + +??? + +Fasta format is a simple way of representing nucleotide or amino acid sequences of nucleic acids and proteins. +This is a very basic format with two minimum lines. First line referred as comment line starts with ‘>’ and gives +basic information about sequence. There is no set format for comment line. Any other line that starts with ‘;’ will +be ignored. Lines with ‘;’ are not a common feature of fasta files. After comment line, sequence of nucleic acid or +protein is included in standard one letter code. Any tabulators, spaces, asterisks etc in sequence will be ignored. 
+ +--- + +## Sequence formats + +**FASTQ** + +File extensions: file.fastq, file.sanfastq, file.fq + +Example: + +```markdown +@K00188:208:HFLNGBBXX:3:1101:1428:1508 2:N:0:CTTGTA +ATAATAGGATCCCTTTTCCTGGAGCTGCCTTTAGGTAATGTAGTATCTNATNGACTGNCNCCANANGGCTAAAGT ++ +AAAFFJJJJJJJJJJJJJJJJJFJJFJJJJJFJJJJJJJJJJJJJJJJ#FJ#JJJJF#F#FJJ#F#JJJFJJJJJ +``` + +??? + +Fastq format was developed by the Sanger Institute in order to group together a sequence and its quality scores (Q: phred quality score). In fastq files each entry is associated with 4 lines. + +- Line 1 begins with a ‘@‘ character and is a sequence identifier and an optional description. +- Line 2 Sequence in standard one letter code. +- Line 3 begins with a ‘+‘ character and is optionally followed by the same sequence identifier (and any additional description) again. +- Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence. + +--- + +## Alignment formats + +**SAM (Sequence Alignment Map)** + +File extensions: file.sam + +Example: + +```markdown +1:497:R:-272+13M17D24M 113 1 497 37 37M 15 100338662 0 CGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAG 0;==-==9;>>>>>=>>>>>>>>>>>=>>>>>>>>>> XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:37 +19:20389:F:275+18M2D19M 99 1 17644 0 37M = 17919 314 TATGACTGCTAATAATACCTACACATGTTAGAACCAT >>>>>>>>>>>>>>>>>>>><<>>><<>>4::>>:<9 RG:Z:UM0098:1 XT:A:R NM:i:0 SM:i:0 AM:i:0 X0:i:4 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:37 +19:20389:F:275+18M2D19M 147 1 17919 0 18M2D19M = 17644 -314 GTAGTACCAACTGTAAGTCCTTATCTTCATACTTTGT ;44999;499<8<8<<<8<<><<<<><7<;<<<>><< XT:A:R NM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:0 XM:i:0 XO:i:1 XG:i:2 MD:Z:18^CA19 +9:21597+10M2I25M:R:-209 83 1 21678 0 8M2I27M = 21469 -244 CACCACATCACATATACCAAGCCTGGCTGTGTCTTCT <;9<<5><<<<><<<>><<><>><9>><>>>9>>><> XT:A:R NM:i:2 SM:i:0 AM:i:0 X0:i:5 X1:i:0 XM:i:0 XO:i:1 XG:i:2 MD:Z:35 +``` + +??? 
+ +The SAM Format is a text format for storing sequence data in a series of tab delimited ASCII columns. Most often it is +generated as a human readable version of its sister BAM format, which stores the same data in a compressed, indexed, binary form. + +SAM format files are generated following mapping of the reads to a reference sequence. It is a TAB-delimited text format +with a header and a body. Header lines start with ‘@’ while alignment lines do not. The header holds generic information on +the SAM file along with version information, whether the file is sorted, information on the reference sequence, etc. The alignment +records constitute the body of the file. Each alignment line/record has 11 mandatory fields describing essential alignment information. + +--- + +## Alignment formats + +**BAM (Binary Alignment/Map)** + +File extensions: file.bam + +A BAM file is the compressed binary version of the Sequence Alignment/Map (SAM). + +??? + +BAM is a compact and indexable representation of nucleotide sequence alignments. The data in SAM and BAM is exactly the +same. Being binary, BAM files are small in size and ideal to store alignment files. They require samtools to be viewed. + +--- + +## Features/annotations formats + +**VCF (Variant Calling Format/File)** + +File extensions: file.vcf + +Example: + +```markdown +##fileformat=VCFv4.2 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +... +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 
47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 +``` + +??? + +VCF is a text file format with a header (information about the VCF version, samples, etc.) and data lines that constitute the body of the file. + +--- + +## Features/annotations formats + +**GFF (General Feature Format or Gene Finding Format)** + +File extensions: file.gff2, file.gff3, file.gff + +Example (GFF2): + +```markdown +browser position chr22:10000000-10025000 + +browser hide all + +track name=regulatory description="TeleGene(tm) Regulatory Regions" + +visibility=2 + +chr22 TeleGene enhancer 10000000 10001000 500 + . touch1 + +chr22 TeleGene promoter 10010000 10010100 900 + . touch1 + +chr22 TeleGene promoter 10020000 10025000 800 - . touch2 +``` + +??? + +GFF (General Feature Format or Gene Finding Format). GFF can be used for any kind of feature (Transcripts, exon, +intron, promoter, 3’ UTR, repetitive elements etc) associated with the sequence, whereas GTF is primarily for +genes/transcripts. GFF3 is the latest version and an improvement over GFF2 format. However, many databases are +still not equipped to handle GFF3 version. The differences will be explained later in text. + +The GFF format has 9 mandatory columns and they are TAB separated. +- Col. 1 Reference Sequence +- Col. 2 Source +- Col. 3 Feature +- Col. 4 Start +- Col. 5 End +- Col. 6 Score +- Col. 7 Strand +- Col. 8 Frame (GFF2 and GTF) or Phase (GFF3) +- Col. 9 Attribute or Group field + +--- + +## Features/annotations formats + +**BED (Browser Extensible Data)** + +The BED (Browser Extensible Data) file format includes information about sequences that can be visualized in a genome +browser; a feature called an annotation track. BED files are tab-delimited and include 12 fields (columns) of data. + +Example of fields: name of chromosome or scaffold, starting position in the chromosome, the ending position... 
+ +--- + +## Features/annotations formats + +**PSI-MI** + +The PSI MI format is a data exchange format for molecular interactions. + +Example of fields: interaction detection method, biological role, experimental features, location of the interaction, ... + +--- + +## Features/annotations formats + +**PED** + +File extensions: file.ped + +PED is a file format for pedigree analysis, which describes the familial relationships between different samples. + +--- + +## Structure formats + +**PDB (Protein Data Bank formats)** + +File extensions: file.pdb + +PDB file formats contain atomic coordinates and are used for storing 3D protein structures by the Protein Data Bank. + +Example: + +```markdown +COMPND UNNAMED +AUTHOR GENERATED BY OPEN BABEL 2.3.2 +ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 2 CA ALA A 1 1.456 0.000 0.000 1.00 0.00 C +ATOM 3 C ALA A 1 1.930 0.000 1.463 1.00 0.00 C +ATOM 4 O ALA A 1 1.160 0.000 2.421 1.00 0.00 O +... +CONECT 101 98 +CONECT 102 94 103 +CONECT 103 102 +MASTER 0 0 0 0 0 0 0 0 103 0 103 0 +END +``` + +--- + +## Other formats + +**CSV** + +CSV (.csv file format) stands for comma-separated values and is a text file, where each line is a row and +columns are delimited with a comma. It can store different types of sequencing data and can be opened using common +spreadsheet programs. + +**JSON** + +JSON (JavaScript Object Notation) is a common file format for many other industries, but is used in a growing number +of bioinformatics applications and web resources. + +**And the list of generic file formats goes on...** + +--- + +## Why Are There So Many Different Types? + +The many different ways of generating and using biological data have given rise to the diversity previously described. +These file formats have their own specific use cases depending on: + +- Compatibility with specific software +- Data processing, parsing, and human readability needs +- Efficiency for storage + +??? 
+ +In conclusion, the multitude of biological file formats arises from the diverse needs and characteristics of biological data. diff --git a/topics/data-science/tutorials/online-resources-gene/tutorial.bib b/topics/data-science/tutorials/online-resources-gene/tutorial.bib new file mode 100644 index 00000000000000..39c4de118f81eb --- /dev/null +++ b/topics/data-science/tutorials/online-resources-gene/tutorial.bib @@ -0,0 +1,11 @@ +@article{rangwala2021accessing, + title={Accessing NCBI data using the NCBI sequence viewer and genome data viewer (GDV)}, + author={Rangwala, Sanjida H and Kuznetsov, Anatoliy and Ananiev, Victor and Asztalos, Andrea and Borodin, Evgeny and Evgeniev, Vladislav and Joukov, Victor and Lotov, Vadim and Pannu, Ravinder and Rudnev, Dmitry and others}, + journal={Genome research}, + volume={31}, + number={1}, + pages={159--169}, + year={2021}, + publisher={Cold Spring Harbor Lab}, + doi={10.1101/gr.266932.120} +} diff --git a/topics/data-science/tutorials/online-resources-gene/tutorial.md b/topics/data-science/tutorials/online-resources-gene/tutorial.md new file mode 100644 index 00000000000000..597f7897783a14 --- /dev/null +++ b/topics/data-science/tutorials/online-resources-gene/tutorial.md @@ -0,0 +1,549 @@ +--- +layout: tutorial_hands_on +title: Learning about one gene across biological resources and formats +level: Introductory +draft: true +zenodo_link: https://zenodo.org/record/8304465 +questions: +- How to employ bioinformatics resources to investigate a specific protein family (opsins)? +- How to navigate the Genome Data Viewer to find opsins in the human genome? +- How to identify genes associated with opsins and analyze their chromosome locations? +- How to explore literature and clinical contexts for the OPN1LW gene? +- How to use protein sequences files to perform similarity searches using BLAST? 
+objectives: +- Starting from a text search, navigate multiple web resources to examine multiple types of information about a gene, conveyed through multiple file formats. +time_estimation: 1H +key_points: +- You can search for genes and proteins using specific text on the NCBI genome. +- Once you find a relevant gene or protein, you can obtain its sequence and annotation in various formats from NCBI. +- You can also learn about the chromosome location and the exon-intron composition of the gene of interest. +- NCBI offers a BLAST tool to perform similarity searches with sequences. +- You can further explore the resources included in this tutorial to learn more about the gene-associated conditions and the variants. +- You can input a FASTA file containing a sequence of interest for BLAST searches. +contributions: + authorship: + - lisanna + - bebatut + - teresa-m + funding: + - biont +--- + +# Introduction + +When we do a bioinformatics analysis, e.g. RNA-seq, we might end up with a list of gene names. We then need to explore these genes. But how can we do that? What are the resources available for that? And how to navigate through them? + +The aim of this tutorial is to familiarize ourselves with that, using Human opsins as an example. + +Human opsins are found in the cells of your retina. Opsins catch light and begin the sequence of signals that result in vision. We will proceed by asking questions about opsins and opsin genes, and then using different bioinformatics databases and resources to answer them. + +> +> This tutorial is a bit atypical: we will not work in Galaxy but mostly outside of it, navigating databases and tools through their own web interfaces. The scope of this tutorial is to illustrate several sources of biological data in different file formats, and representing different information. +{: .comment} + +> +> +> In this tutorial we will deal with: +> +> 1. 
TOC +> {:toc} +{: .agenda} + +# Searching Human Opsins + +To search for Human Opsins, we will start by checking the [NCBI Genome Data Viewer](https://www.ncbi.nlm.nih.gov/genome/gdv). The NCBI Genome Data Viewer (GDV) ({% cite rangwala2021accessing %}) is a genome browser supporting the exploration and analysis of annotated eukaryotic genome assemblies. The GDV browser displays biological information mapped to a genome, including gene annotation, variation data, BLAST alignments, and experimental study data from the NCBI GEO and dbGaP databases. GDV release notes describe new features relating to this browser. + +> Open NCBI Genome Data Viewer +> +> 1. Open the NCBI Genome Data Viewer at [www.ncbi.nlm.nih.gov/genome/gdv](https://www.ncbi.nlm.nih.gov/genome/gdv/) +> +{: .hands-on} + +The homepage includes a simple "tree of life" where the human node is highlighted because it is the default organism to search. We can change that in the *Search organisms* box but we will leave it for now as we are interested in Human Opsins. + +![Genome Data Viewer home page screenshot, the word "opsin" is written in the search box and the result is previewed.](./images/GenomeDataViewerpage.png "Genome Data Viewer home page") + +The panel on the right reports multiple assemblies of the genome of interest, and a map of the chromosomes in that genome. We can search for Opsins there. + +> Search for Opsins +> +> 1. Type `opsin` in the *Search in genome* box +> 2. Click on the magnifier icon or press Enter +> +{: .hands-on} + +Below the box, a table is now displayed with genes related to opsin together with their names and location, i.e. the chromosome number, as well as the start and end positions. + +In the list of genes related to the search term opsin, there are the rhodopsin gene (RHO), and three cone pigments, short-, medium-, and long-wavelength sensitive opsins (for blue, green, and red light detection). There are other entities, e.g. 
the OPSIN-LCR (Locus Control Region), putative genes and receptors. + +Multiple hits are on the X chromosome, one of the sex-determining chromosomes. + +> +> +> 1. How many genes have been found in Chromosome X? +> 2. How many are protein coding genes? +> +> > +> > +> > 1. The hits in ChrX are: +> > - OPSIN-LCR +> > - OPN1LW +> > - OPN1MW +> > - OPN1MW2 +> > - OPN1MW3 +> > +> > 2. By hovering over each gene, a box opens and we can click on *Details* to learn more about each gene. Then we learn that the first (OPSIN-LCR) is not protein coding but a gene regulatory region and the others are protein coding genes. So there are 4 protein coding genes related to opsins in Chromosome X. In particular, Chromosome X includes one red pigment gene (OPN1LW) and three green pigment genes (OPN1MW, OPN1MW2 and OPN1MW3 in the reference genome assembly). +> > +> {: .solution} +{: .question} + +Let's now focus on one specific opsin, the gene OPN1LW. + +> Open Genome Browser for gene OPN1LW +> +> 1. Click on the blue arrow that appears in the results table when you hover your mouse on the OPN1LW row +> +{: .hands-on} + +You should have landed on [this page](https://www.ncbi.nlm.nih.gov/genome/gdv/browser/genome/?id=GCF_000001405.40), that is the genome view of gene OPN1LW. + +![Genome Data Viewer of gene OPN1LW, screenshot of the two main panels of the viewer, with chromosomes on the left and the feature viewer on the right.](./images/GenomeDataViewerofgeneOPN1LW.png "Genome Data Viewer of gene OPN1LW") + +There is a lot of information on this page, so let's focus on one section at a time. + +1. The Genome Data Viewer, on the top, tells us that we are looking at the data from the organism `Homo sapiens`, assembly `GRCh38.p14` and in particular at `Chr X` (Chromosome X). Each of these pieces of information has a unique ID. +2. The entire Chromosome is represented directly below, and the positions along the short (`p`) and long (`q`) arms are highlighted. +3. 
Below, a blue box highlights that we are now focusing on the Region corresponding to the Gene `OPN1LW`. + + There are multiple ways to interact with the viewer below. Try for example to hover with the mouse on the dots representing exons in the blue box. + +4. In the graph below, the gene sequence is a green line with the exons (protein coding fragments) represented by green rectangles. + + Hover with the mouse on the green line corresponding to `NM_020061.6` (our gene of interest) to get more detailed information. + + > + > + > 1. What is the location of the OPN1LW segment? + > 2. What is the length of the OPN1LW segment? + > 3. What are introns and exons? + > 4. How many exons and introns are in the OPN1LW gene? + > 5. What is the total length of the coding region? + > 6. What is the distribution between coding and non coding regions? What does that mean in terms of biology? + > 7. What is the length of the protein in number of amino acids? + > + > > + > > + > > 1. From 154,144,243 to 154,159,032 + > > 2. 14,790 nucleotides, found at *Span* (14790 nt, where nt stands for nucleotides) + > > 3. Eukaryotic genes are often interrupted by non-coding regions called intervening sequences or introns. The coding regions are called exons. + > > 4. From this diagram, you can see that the OPN1LW gene consists of 6 exons and 5 introns, and that the introns are far larger than the exons. + > > 5. The CDS length is 1,095 nucleotides. + > > 6. Of the 14790 nt in the gene, only 1095 nt code for protein, which means that less than 8% of the base pairs contain the code. When this gene is expressed in cells in the human retina, an RNA copy of the entire gene is synthesized. Then the intron regions are cut out, and the exon regions joined together to produce the mature mRNA (a process called splicing), which will be translated by ribosomes as they make the red opsin protein. In this case, 92% of the initial RNA transcript is tossed out, leaving the pure protein code. + > > 7. 
The length of the resulting protein is 364 aa (amino acids). + > {: .solution} + {: .question} + +But what is the sequence of this gene? There are multiple ways to retrieve this information; we will go through what we think is one of the most intuitive. + +> Open the Sequence Text View +> +> 1. Click on the {% icon tool %} *Tools* section on the top right of the box showing the gene +> 2. Click on *Sequence Text View* +{: .hands-on} + +This panel reports the DNA sequence of the introns (in green), as well as the one of the exons (in pink, including the translated protein sequence below). + +![Screenshot of the sequence view of the NIH resource, text is highlighted in different colors.](./images/SequenceTextView.png "Sequence Text View") + +This sequence box is not showing the entire gene at the moment, but a subsequence of it. You can move upstream and downstream the genetic code with the arrows *Prev Page* and *Next Page*, or start from a specific position with the *Go To Position* button. We suggest starting from the start of the coding part of the gene, which as we learned earlier is at position 154,144,243. + +> Go to a specific position in Sequence View +> +> 1. Click on *Go To Position* +> 2. Type `154144243` +> +> We have to remove the commas to validate the value +{: .hands-on} + +The sequence highlighted in purple here signals a regulatory region. + +> +> +> 1. What is the first amino acid of the resulting protein product? +> 2. What is the last one? +> 3. Can you keep a note of the first three and last three AAs of this protein? +> +> > +> > +> > 1. The corresponding protein starts with Methionine, M (they all do). +> > 2. The last AA of the last exon (found in the 2nd page) is Alanine (A). After that, the stop codon TGA comes, which is not translated into an AA. +> > 3. The first three AAs are: M,A,Q; the last three: S,P,A. +> > +> {: .solution} +{: .question} + +We can now close the *Sequence View*. 
+ +From this resource, we can also get files, in different formats, describing the gene. They are available from the *Download* section. + +1. *Download FASTA* will allow us to download the simplest file format to represent the nucleotide sequence of all the visible range of the genome (longer than the gene only). +2. *Download GenBank flat file* will allow us to access the annotation available on this page (and beyond) in a flat text format. +3. *Download Track Data* allows us to inspect two of the file formats we presented in the slides: the GFF (GFF3) and BED formats. If you change the tracks, each one may or may not be available. + +# Finding more information about our gene + +Let's now get an overview of the information we have (in the literature) about our gene, using the NCBI resources. + +> Search for OPN1LW on NCBI +> +> 1. Open the NCBI search at [www.ncbi.nlm.nih.gov/search](https://www.ncbi.nlm.nih.gov/search/) +> 2. Type `OPN1LW` in the *Search NCBI* search box +> +{: .hands-on} + +![Screenshot of the NIH result page, with cards named Literature, Genes, Proteins, genomes, Clinical and PubChem](./images/NIHresults.png "Results when searching for `OPN1LW` on NCBI") + +## Literature + +Let's start with the literature and in particular *PubMed* or *PubMed Central* results. + +> What's the difference between PubMed and PubMed Central? +> +> PubMed is a biomedical literature database which contains the abstracts of publications in the database. +> +> PubMed Central is a full text repository, which contains the full text of publications in the database. +> +> While the exact number of hits may vary in time from the screenshot above, any gene name should have more hits in PubMed Central (searched in the full texts of publications) than in PubMed (searched only in the abstracts). +> +{: .details} + +> Open PubMed +> +> 1. 
Click on *PubMed* in the *Literature* box +> +{: .hands-on} + +You have entered PubMed, a free database of scientific literature, and landed on the results of a complete search for articles directly associated with this gene locus. + +By clicking on the title of each article, you can see the abstract of the article. If you are on a university campus where there is online access to specific journals, you might also see links to full articles. PubMed is your entry point to a wide variety of scientific literature in the life sciences. On the left side of any PubMed page, you will find links to a description of the database, help, and tutorials on searching. + +> +> +> 1. Can you guess which types of conditions are associated with this gene? +> +> > +> > +> > 1. We will answer this question later +> > +> {: .solution} +{: .question} + +> Back to NCBI Search page +> +> 1. Go back to the [NCBI Search page](https://www.ncbi.nlm.nih.gov/search/all/?term=OPN1LW) +> +{: .hands-on} + +## Clinical + +Let's now focus on the *Clinical* box, and especially on *OMIM*. OMIM, the Online Mendelian Inheritance in Man (and woman!), is a catalog of human genes and genetic disorders. + +> Open OMIM +> +> 1. Click on *OMIM* in the *Clinical* box +> +{: .hands-on} + +Each OMIM entry is a genetic disorder (here mostly types of colorblindness) associated with mutations in this gene. + +> Read as much as your interest dictates +> +> 1. Follow links to get more information about each entry +> +{: .hands-on} + +> More about OMIM +> +> For more information about OMIM itself, click the OMIM logo at the top of the page. Through OMIM, a wealth of information is available for countless genes in the human genome, and all information is backed up by references to the latest research articles. +> +{: .comment} + +How do variations in the gene affect the protein product, and its functions? 
Let's go back to the NIH page and access the list of Single Nucleotide Polymorphisms (SNPs) that were detected by genetic studies in the gene. + +> Open dbSNP +> +> 1. Go back to the [NCBI Search page](https://www.ncbi.nlm.nih.gov/search/all/?term=OPN1LW) +> 2. Click on *dbSNP* in the *Clinical* box +> +{: .hands-on} + +![Screenshot of the dbSNPs page about gene OPN1LW. Three main panels, the one on the left to filter the search based on tags, the central showing results, the right for a more detailed and programmatic search.](./images/dbSNPs.png "dbSNP in OPN1LW") + +> +> +> 1. What is the clinical significance of the rs5986963 and rs5986964 (first 2 variants listed at the time of creation of this tutorial)? +> 2. What is the functional consequence of rs104894912? +> 3. What is the functional consequence of rs104894913? +> +> > +> > +> > 1. The Clinical significance is `benign` so it seems that they have no effect on the final protein product +> > 2. rs104894912 mutation leads to a `stop_gained` variant, which truncates the resulting protein too early and is therefore `pathogenic` +> > 3. rs104894913 mutation leads to a `missense_variant`, also `pathogenic`. +> > +> {: .solution} +{: .question} + +Let's investigate the rs104894913 variant further. + +> Learn more about a variant in dbSNP +> +> 1. Click on `rs104894913` to open its [dedicated page](https://www.ncbi.nlm.nih.gov/snp/rs104894913) +> 2. Click on *Clinical Significance* +> +> > +> > +> > What type of condition is associated with the rs104894913 variant? +> > +> > > +> > > +> > > The name of the associated disease is "Protan defect". A quick internet search with your search engine will clarify that this is a type of color blindness. +> > > +> > {: .solution} +> {: .question} +> +> 3. Click on *Variant details* +> +> > +> > +> > 1. Which substitution is associated with this variant? +> > 2. What is the impact of this substitution in terms of codon and amino acid? +> > 3. 
At which position of the protein is this substitution? +> > +> > > +> > > +> > > 1. The substitution `NC_000023.10:g.153424319G>A` corresponds to a change from a Guanine (G) to an Adenine (A) +> > > 2. This substitution changes the codon `GGG`, a Glycine (Gly), into `GAG`, a Glutamate (glutamic acid, Glu) +> > > 3. `p.Gly338Glu` means that the substitution is at position 338 of the protein. +> > {: .solution} +> {: .question} +{: .hands-on} + +What does this substitution mean for the protein? Let's have a deeper look at this protein. + +## Protein + +> Open Protein +> +> 1. Go back to the [NCBI Search page](https://www.ncbi.nlm.nih.gov/search/all/?term=OPN1LW) +> 2. Click on *Protein* in the *Proteins* box +> 3. Click on `OPN1LW – opsin 1, long wave sensitive` in the box on top +> +{: .hands-on} + +![Screenshot of the Opsin 1 NIH protein page, two main panels. The one on the left reporting information about the gene, the one on the right is a table of content and a series of links to other resources](./images/Opsin1NIH.png "Opsin 1 NIH protein page") + +This page presents once again some data that we are familiar with (e.g. distribution of the exons along the gene sequence). + +> Download the protein sequences +> +> 1. Click on *Download Datasets* +> 2. Select +> - `Gene Sequences (FASTA)` +> - `Transcript sequences (FASTA)` +> - `Protein sequences (FASTA)` +> 3. Click on the *Download* button +> 4. Open the downloaded ZIP file +{: .hands-on} + +> +> +> 1. What does the folder contain? +> 2. Do you think they implemented good data practices? +> +> > +> > +> > 1. The folder includes +> > - a folder `ncbi_datasets` with different subfolders in it leading to some data files (multiple formats), +> > - a `README.md` (a Markdown file), which is designed to "travel" together with the data and explain how the data was retrieved, what the structure of the data-containing subfolder is, and where to find extensive documentation. +> > 2. 
It is definitely a good data management practice to guide users (not only your collaborators, but also yourself in the not-so-far future, when you will have forgotten where that file in your Downloads folder came from) to the data source and the data structure. +> > +> {: .solution} +{: .question} + +# Searching by sequence + +What could we do with these sequences that we just downloaded? Let's assume that we just sequenced the transcripts that we isolated through an experiment - so we know the sequence of our entity of interest, but don't know what it is. What we need to do in this case is to search the entire database of sequences known to science and match our unknown entity with an entry that has some annotation. Let's do it. + +> Search the protein sequence against all protein sequences +> +> 1. Open (with the simplest text editor you have installed) the `protein.faa` file that you just downloaded. +> 2. Copy its content +> 3. Open BLAST [blast.ncbi.nlm.nih.gov](https://blast.ncbi.nlm.nih.gov/Blast.cgi) +> 4. Click on the `Protein BLAST, protein > protein` +> +> We will indeed use a protein sequence to search against a database of proteins +> +> 5. Paste the protein sequence into the big text box +> 6. Check the rest of the parameters +> 7. Click the blue button `BLAST` +{: .hands-on} + +This phase will take some time: after all, there is some server somewhere that is comparing the entirety of known sequences to your target. When the search is complete, the result should look similar to the one below: + +![Screenshot of BLAST results, one big header on top and the results listed as a table on bottom](./images/BLASTresults.png "BLAST results") + +> Graphic Summary of the protein sequences +> +> 1. Click on the tab *Graphic Summary* +{: .hands-on} + +We access a box containing lots of colored lines. Each line represents a hit from your BLAST search. If you click on a red line, the narrow box just above the box gives a brief description of the hit. 
+ +> Descriptions of the protein sequences +> +> 1. Click on the tab *Descriptions* +{: .hands-on} + +> +> +> 1. What is the first hit? Is it expected? +> 2. What are the other hits? For which organisms? +> +> > +> > +> > 1. The first hit is our red opsin. That's encouraging, because the best match should be to the query sequence itself, and you got this sequence from that gene entry. +> > 2. Other hits are other opsins. They include entries from other primates (e.g. `Pan troglodytes`). +> > +> > +> {: .solution} +{: .question} + +The hits are for our red opsin in human but also other opsins in other primates. We could want that, for example if we wanted to use this data to build a phylogenetic tree. If we instead are pretty sure that our sequence of interest is human, we could also have restricted the search to human sequences only. + +> Filter a BLAST Search +> +> 1. Click on *Edit Search* +> 2. Type `Homo sapiens` in the *Organism* field +> 3. Click the blue button `BLAST` +{: .hands-on} + +With this new search, we find the other opsins (green, blue, rod-cell pigment) in the list. Other hits have lower numbers of matching residues. If you click on any of the colored lines in the *Graphic Summary*, you'll open more information about that hit, and you can see how much similarity each one has to the red opsin, our original query sequence. As you go down the list, each succeeding sequence has less in common with red opsin. Each sequence is shown in comparison with red opsin in what is called a pairwise sequence alignment. Later, you'll make multiple sequence alignments from which you can discern relationships among genes. + +> More details on BLAST scores +> +> The displays contain two prominent measures of the significance of the hit: +> +> 1. the BLAST Score - labeled Score (bits) +> +> The BLAST Score indicates the quality of the best alignment between the query sequence and the found sequence (hit). The higher the score, the better the alignment. 
Scores are reduced by mismatches and gaps in the best alignment. Calculation of the score is complex, involving a substitution matrix, which is a table that assigns a score to each pair of residues aligned. The most widely used matrix for protein alignment is known as BLOSUM62. +> +> 2. the Expectation Value (labeled Expect or E) +> +> The expectation value E of a hit tells whether the hit is likely to result from chance likeness between hit and query, or from common ancestry of hit and query. +> +> > E values reported as 0.0 +> > +> > If E is smaller than $$10^{-100}$$, it is sometimes given as 0.0. +> {: .comment} +> +> The expectation value is the number of hits you would expect to occur purely by chance if you searched for your sequence in a random genome the size of the human genome. +> +> $$E = 25$$ means that you could expect to find 25 matches in a genome of this size, purely by chance. So a hit with $$E = 25$$ is probably a chance match, and does not imply that the hit sequence shares common ancestry with your search sequence. +> +> Expectation values of around 0.1 may or may not be biologically significant (other tests would be needed to decide). +> +> But very small values of E mean that the hit is biologically significant. The correspondence between your search sequence and this hit must arise from common ancestry of the sequences, because the odds are simply too low that the match could arise by chance. For example, $$E = 10^{-18}$$ for a hit in the human genome means that you would expect only one chance match in one billion billion different genomes the same size as the human genome. +> +> The reason we believe that we all come from common ancestors is that massive sequence similarity in all organisms is simply too unlikely to be a chance occurrence. Any family of similar sequences across many organisms must have evolved from a common sequence in a remote ancestor. +> +{: .details} + +> Downloading +> +> 1. 
Click on *Descriptions* tab +> 2. Click at any sequence hit +> 3. Click on *Download* +> 4. Select `FASTA (aligned sequences)` +{: .hands-on} + +It will download a new, slightly different, type of file: an aligned FASTA. If you want, explore it before the next section. + +While in the previous sections of this tutorial we extensively used the web interfaces of the tools (genomic viewers, quick literature scanning, reading annotations, etc.), this BLAST search is an example of a step that you could fully automate with Galaxy. + +> Similarity search with BLAST in Galaxy +> +> 1. Create a new history for this analysis +> +> {% snippet faqs/galaxy/histories_create_new.md %} +> +> 2. Rename the history +> +> {% snippet faqs/galaxy/histories_rename.md %} +> +> 3. Import the protein sequence via link from [Zenodo]({{ page.zenodo_link }}) or Galaxy shared data libraries: +> +> ```text +> {{ page.zenodo_link }}/files/protein.faa +> ``` +> +> {% snippet faqs/galaxy/datasets_import_via_link.md %} +> +> {% snippet faqs/galaxy/datasets_import_from_data_library.md %} +> +> 1. {% tool [NCBI BLAST+ blastp](toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastp_wrapper/2.10.1+galaxy2) %} with the following parameters: +> - *"Protein query sequence(s)"*: `protein.faa` +> - *"Subject database/sequences"*: `Locally installed BLAST database` +> - *"Protein BLAST database"*: `SwissProt` +> +> To search against only annotated sequences in UniProt, we need select the latest release of `SwissProt` +> +> - *"Set expectation value cutoff"*: `0.001` +> - *"Output format"*: `Tabular (extended 25 columns)` +> +{: .hands_on} + +> +> +> Do you think we are looking at exactly the same results than our original search for `opsin` in [www.ncbi.nlm.nih.gov/genome/gdv](https://www.ncbi.nlm.nih.gov/genome/gdv/)? Why? +> +> > +> > +> > The results might be similar, but there are definitely some differences. 
Indeed, not only a text search is different than a sequence search in terms of method, but also this second round we started from the sequence of one specific opsin, so one branch of the entire protein family tree. Some of the family members are more similar between each other, so this type of search looks at the whole family from a quite biased perspective. +> > +> {: .solution} +{: .question} + +# More information about our protein + +So far, we explored this information about opsins: +- how to know which proteins of a certain type exist in a genome, +- how to know where they are along the genome, +- how to get more information about a gene of interest, +- how to download their sequences in different formats, +- how to use these files to perform a similarity search. + +You might be curious about how to know more about the proteins they code for, now. We have already collected some information (e.g. diseases associated), but in the next steps we will cross it with data about the protein structure, localisation, interactors, functions, etc. + +The portal to visit to obtain all information about a protein is [UniProt](https://www.uniprot.org/). We can search it using a text search, or the gene or protein name. Let's go for our usual `OPN1LW` keyword. + +> Searching on UniProt +> +> 1. Open [UniProt](https://www.uniprot.org/) +> 2. Type `OPN1LW` in the search bar +> 3. Select the card view +{: .hands-on} + +The first hit should be `P04000 · OPSR_HUMAN`. Before opening the page, two things to notice: + +1. The name of the protein `OPSR_HUMAN` is different than the gene name, as well as their IDs are. +2. This entry has a golden star, which means that was manually annotated and curated. + +> Open a result on UniProt +> +> 1. 
Click on `P04000 · OPSR_HUMAN` +{: .hands-on} + +![Screenshot of the UniProt entry page header](./images/UniProt.png "UniProt page") + +This is a long page with a lot of information, we designed an [entire tutorial]({% link topics/data-science/tutorials/online-resources-protein/tutorial.md %}) to go through it. diff --git a/topics/data-science/tutorials/online-resources-protein/images/UniProt.png b/topics/data-science/tutorials/online-resources-protein/images/UniProt.png new file mode 100644 index 00000000000000..182d71e32c0f39 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-protein/images/UniProt.png differ diff --git a/topics/data-science/tutorials/online-resources-protein/images/complexDB.jpeg b/topics/data-science/tutorials/online-resources-protein/images/complexDB.jpeg new file mode 100644 index 00000000000000..1270f87c505224 Binary files /dev/null and b/topics/data-science/tutorials/online-resources-protein/images/complexDB.jpeg differ diff --git a/topics/data-science/tutorials/online-resources-protein/tutorial.md b/topics/data-science/tutorials/online-resources-protein/tutorial.md new file mode 100644 index 00000000000000..caabaded531c29 --- /dev/null +++ b/topics/data-science/tutorials/online-resources-protein/tutorial.md @@ -0,0 +1,528 @@ +--- +layout: tutorial_hands_on +title: One protein along the UniProt page +level: Introductory +draft: true +zenodo_link: '' +questions: +- How can you search for proteins using text, gene, or protein names? +- How do you interpret the information at the top of the UniProt entry page? +- What types of information can you expect from different download formats, such as FASTA and JSON? +- How is the function of a protein like opsins described in the "Function" section? +- What structured information is found in the "Names and Taxonomy", "Subcellular location", "Disease & Variants", "PTM/Processing" sections? 
+- How to learn about the protein expression, interactions, structure, family, sequence and similar proteins? +- How do the "Variant viewer" and "Feature viewer" tabs assist in mapping protein information along the sequence? +- What does the "Publications" tab list, and how can you filter publications? +- What is the significance of tracking entry annotation changes over time? +objectives: +- By exploring protein entries in UniProtKB, interpret protein function, taxonomy, structure, interactions, variants, and more. +- Use unique identifiers to connect databases, download gene and protein data, visualize and compare sequence features. +time_estimation: 1H +key_points: + - How to navigate UniProtKB entries, accessing comprehensive details about proteins, such as their functions, taxonomy, and interactions + - The Variant and Feature viewer are your tools to visually explore protein variants, domains, modifications, and other key sequence features. + - Expand your understanding by utilizing external links to cross-reference data and uncover complex relationships. + - Explore the History tab for access to previous versions of entry annotations. +requirements: +- + type: "internal" + topic_name: data-science + tutorials: + - online-resources-gene +contributions: + authorship: + - lisanna + - bebatut + funding: + - biont +--- + +# Introduction + +When doing a biological data analysis, we might end up with some interesting proteins that we need to explore. But how can we do that? What are the resources available for that? And how to navigate through them? + +The aim of this tutorial is to familiarize ourselves with that, using Human opsins as an example. + +> +> This tutorial is a bit atypical: we will not work in Galaxy but mostly outside of it, in the [UniProt](https://uniprot.org) database pages. 
+{: .comment} + +> +> This tutorial is designed to be the continuation of the tutorial ["One gene across file formats"]({% link topics/data-science/tutorials/online-resources-gene/tutorial.md %}), but it can also be consulted as a stand-alone module. +{: .comment} + +Opsins are found in the cells of your retina. They catch light and begin the sequence of signals that result in vision, and that is the reason why, when compromised, they are associated with color-blindness and other visual impairments. + +> Sources of information for this tutorial +> +> The tutorial you are consulting was mainly developed consulting UniProtKB resources, in particular the [Explore UniProtKB entry](https://www.uniprot.org/help/explore_uniprotkb_entry) tutorial. +> Some sentences are reported from there with no modifications. +> +> In addition, the topic was chosen based on Gale Rhodes' [Bioinformatics Tutorial](https://spdbv.unil.ch/TheMolecularLevel/Matics/index.html). Although the tutorial can no longer be followed step by step due to how the mentioned resources changed in time, it could provide additional insights on opsins and in particular how one could build structural models of proteins based on evolutionary information. +> +{: .comment} + + +> +> +> In this tutorial we will deal with: +> +> 1. TOC +> {:toc} +{: .agenda} + +# The UniProtKB Entry page + +The portal to visit to obtain all information about a protein is [UniProtKB](https://www.uniprot.org/). We can search it using a text search, or the gene or protein name. Let's first try a set of generic keywords, like `Human opsin`. + +> Search for Human opsin on UniProtKB +> +> 1. Open the [UniProtKB](https://www.uniprot.org/) +> 2. Type `Human opsin` in the search bar +> 3. Launch the search +> +{: .hands-on} + +> +> +> How many results did we get? 
+> +> > +> > +> > 410 results (at the time of preparation of this tutorial) +> > +> {: .solution} +{: .question} + +These 410 results give us the feeling that we need to be more specific (although - spoiler - our actual target is among the first hits). + +To be specific enough, we suggest to use an unique identifier. From the [previous tutorial]({% link topics/data-science/tutorials/online-resources-gene/tutorial.md %}) we know the gene name of the protein that we are looking for, `OPN1LW`. + +> Search for OPN1LW on UniProtKB +> +> 1. Type `OPN1LW` in the top search bar +> 2. Launch the search +> +{: .hands-on} + +> +> +> 1. How many results did we get? +> 2. What should we do to reduce this number? +> +> > +> > +> > 1. 200+ results (at the time of preparation of this tutorial) +> > 2. We need to clarify what we are looking for: Human OPN1LW +> {: .solution} +{: .question} + +We need to add `Human` to clarify what we are looking for. + +> Search for Human OPN1LW on UniProtKB +> +> 1. Type `Human OPN1LW` in the top search bar +> 2. Launch the search +> +{: .hands-on} + +> +> +> 1. How many results did we get? +> 2. Do we have one result including OPN1LW as gene name? +> +> > +> > +> > 1. 7 results (at the time of preparation of this tutorial) +> > 2. The first result is labeled with `Gene: OPN1LW (RCP)` +> {: .solution} +{: .question} + +The first result, labeled with `Gene: OPN1LW (RCP)`, is our target, `P04000 · OPSR_HUMAN`. Before opening the page, two things to notice: + +1. The name of the protein `OPSR_HUMAN` is different than the gene name, as well as their IDs are. +2. This entry has a golden star, which means that was manually annotated and curated. + +# Inspecting a UniProt entry + +> Open a result on UniProt +> +> 1. Click on `P04000 · OPSR_HUMAN` +{: .hands-on} + +![Screenshot of the UniProt entry page header](./images/UniProt.png "UniProt page") + +To navigate this long page, the menu (navigation bar) on the left will be extremely useful. 
Just from it, we understand that this database contains information about the entry on: +- the known functions, +- the taxonomy, +- the location, +- variants and associated diseases, +- Post-Translational Modifications (PTMs), +- the expression, +- the interactions, +- the structure, +- the domains and their classification, +- the sequences, +- similar proteins. + +The navigation bar stays in the same place on the screen as you move up and down in an entry, so that you can quickly navigate to sections of interest. We will consult all the mentioned sections separately, but let's first focus on the headers on the left. + +At the top of the page, you can see the UniProt entry accession and name, the name of the protein and gene, the organism, whether the protein entry has been manually reviewed by a UniProt curator, its annotation score, and the evidence level for its existence. + +Below the main header, you find a series of tabs (*Entry*, *Variant viewer*, *Feature viewer*, *Publications*, *External links*, *History*). The tabs allow you to switch between the entry, a graphical view of sequence features (Feature viewer), publications, and external links, but ignore them for the moment and do not move from the *Entry* tab. + +## Entry + +The next menu is already part of the *Entry* tab. It allows us to run a BLAST sequence similarity search on the entry, align it with all its isoforms, download the entry in various formats, or add it to the basket to save for later. + +> +> +> 1. What are the available formats in the *Download* drop-down menu? +> 2. What type of information would we download through these file formats? +> +> > +> > +> > 1. The formats are: `Text`, `FASTA (canonical)`, `FASTA (canonical & isoform)`, `JSON`, `XML`, `RDF/XML`, `GFF` +> > 2. The `FASTA` formats should sound familiar (after the preliminary tutorial), and include the protein sequence, possibly with its isoforms (in which case it will be a multi-FASTA). 
Besides these, all the other formats are not protein- or even biology-specific. These are general file formats extensively used by websites to store the information shown in the page. Hence, by downloading the `text` (or even better the `json`) file, we would download the same annotation that we access in this page, but in a format that is easier to parse programmatically. +> > +> {: .solution} +{: .question} + +Let's now scroll the entry page, section by section. + +### Function + +This section summarises the functions of this protein as follows: + +> Visual pigments are the light-absorbing molecules that mediate vision. They consist of an apoprotein, opsin, covalently linked to cis-retinal. + +Regardless of the level of detail that you understand (depending on your background), this is impressively short and specific considering the enormous amount of literature and studies that lie behind the determination of a protein function. Anyway, someone did the work for us, and this protein is already fully classified in the Gene Ontology (GO), which describes any classified protein's molecular function, biological process and cellular component. + +GO is a perfect example of a database / resource that builds on a very complex universe of knowledge and translates it into a simpler graph, at the risk of losing details. This has the great advantage of organising the information, making it countable, analysable and programmatically accessible, ultimately allowing us to have these long summary pages and KnowledgeBases. + +> +> +> 1. To which molecular functions is this protein annotated? +> 2. To which cellular components is this protein annotated? +> 3. To which biological processes is this protein annotated? +> +> > +> > +> > 1. Photoreceptor protein, G-protein coupled receptor +> > 2. Photoreceptor disc membrane +> > 3.
Sensory transduction, Vision +> > +> {: .solution} +{: .question} + +### Names and Taxonomy + +Other examples of structured information are available in the next section, e.g. in the taxonomy. This section also reports other unique identifiers that refer to the same biological entity or to linked entities (e.g. associated diseases in the `MIM` menu). + +> +> +> 1. What is the taxonomic identifier associated with this protein? +> 2. What is the proteome identifier associated with this protein? +> +> > +> > +> > 1. 9606, i.e. Homo sapiens +> > 2. UP000005640, component of Chromosome X +> {: .solution} +{: .question} + +### Subcellular location + +We already know where our protein is in the human body (in the retina, as specified by the function summary), but where is it in the cell? + +> +> +> 1. Where is our protein in the cell? +> 2. Is it coherent with the GO annotation observed before? +> +> > +> > +> > 1. The section explains that it is a "Multi-pass membrane protein", which means that it is a protein inserted in the cell membrane and going through it multiple times. +> > 2. The GO Annotation on the top mentions that we are referring to the photoreceptor (cell) membrane in particular. +> {: .solution} +{: .question} + +The Subcellular location section includes a *Features* area detailing which sections, along the protein sequence, are inserted in the membrane (Transmembrane) and which are not (Topological domain). + +> +> +> How many Transmembrane domains and Topological domains are there? +> +> > +> > +> > 8 Transmembrane and 7 Topological domains +> {: .solution} +{: .question} + +### Disease & Variants + +As we know from the previous tutorial, this gene/protein is associated with multiple diseases. This section details this association, also listing the specific variants that have been detected as disease-related. + +> +> +> What types of scientific studies allow us to assess the association of a genetic variant with a disease?
+> +> > +> > +> > Three commonly used methods for assessing the association of a genetic variant with a disease are: +> > +> > - Genome-Wide Association Studies (GWAS) +> > +> > GWAS are widely used to identify common genetic variants associated with diseases. They involve scanning the entire genome of a large number of individuals to identify variations linked to a particular disease or trait. +> > +> > - Case-Control Studies +> > +> > Case-control studies are frequently employed to compare individuals with a disease to those without it, focusing on the presence or frequency of specific genetic variants in both groups. +> > +> > - Family Studies +> > +> > Family-based studies involve analyzing genetic variants within families where multiple members are affected by a disease. By studying the inheritance patterns of genetic variants and their association with the disease within families, researchers can identify potential disease-associated genes. +> > +> > +> > These types of studies would imply extensive usage of the file types to manage genomic data, such as: SAM (Sequence Alignment Map), BAM (Binary Alignment Map), VCF (Variant Call Format) etc. +> > +> {: .solution} +{: .question} + +This section also includes a *Features* area, where the natural variants are mapped along the sequence. Below, it also highlights that a more detailed view of features along the sequence is provided in the *Disease & Variants* tab, but let's not open it for now. + +### PTM/Processing + +A post-translational modification (PTM) is a covalent processing event resulting from a proteolytic cleavage or from the addition of a modifying group to one amino acid. + +> +> +> What are the post-translational modifications of our protein?
+> +> > +> > +> > Chain, glycosylation, disulfide bond, modified residue +> {: .solution} +{: .question} + +### Expression + +We already know where the protein is in the cell, but for human proteins we often have information on where they are in the human body, i.e. in which tissues. This information can come from the Human [Expression Atlas](https://www.ebi.ac.uk/gxa/home) or other similar resources. + +> +> +> In which tissue is the protein found? +> +> > +> > +> > The three color pigments are found in the cone photoreceptor cells. +> {: .solution} +{: .question} + +### Interaction + +Proteins perform their function through their interaction with the surroundings, in particular with other proteins. This section reports the interactors of our protein of interest, in a table that we can also filter by subcellular location, diseases, and type of interaction. + +The sources of this information are databases like STRING, and the entry page for our protein is directly linked from this section. + +> Search for Human OPN1LW on UniProtKB +> +> 1. Click on the [STRING link](https://string-db.org/network/9606.ENSP00000358967) in a different tab +> +{: .hands-on} + +> +> +> 1. How many different file formats can you download from there? +> 2. What kind of information will be conveyed in each file? +> +> > +> > +> > STRING provides data in downloadable file formats to support further analyses. +> > The primary file format used by STRING is the 'TSV' (Tab-Separated Values) format, which presents protein interaction data in a structured, tabular layout. This format is well-suited for easy integration into various data analysis tools and software. +> > Additionally, STRING offers data in PSI-MI (Proteomics Standards Initiative Molecular Interactions) XML format, a standard for representing protein interaction data that allows for compatibility with other interaction databases and analysis platforms.
These file formats enable researchers to harness the wealth of protein interaction information in STRING for their own studies and analyses. Researchers can also download visual representations of protein networks in image formats such as PNG and SVG, which are suitable for presentations and publications. For advanced analysis, STRING offers 'flat files' containing detailed interaction information, and 'MFA' (Multiple Alignment Format) files, which are useful for comparing multiple protein sequences. These diverse downloadable file formats enable researchers to harness the wealth of protein interaction information in STRING for their own studies and analyses. +> > +> {: .solution} +{: .question} + +### Structure + +Are you curious about the intricate three-dimensional structures of proteins? The *Structure* section on the UniProtKB entry page is your gateway to exploring the fascinating world of protein architecture. + +In this section, you'll find information about experimentally determined protein structures. These structures provide crucial insights into how proteins function and interact with other molecules. You will discover interactive views of the protein's structure that you can explore directly within the UniProtKB entry. This feature provides an engaging way to navigate through the protein's domains, binding sites, and other functional regions. By delving into the *Structure* section, you'll gain a deeper understanding of the physical basis of protein function and discover the wealth of information that structural data can unlock. + +> +> +> 1. What is the variant associated with Colorblindness? +> 2. Can you find that specific amino acid in the structure? +> 3. Can you formulate a guess of why this mutation is disruptive? +> +> > +> > +> > 1. In the *Disease & Variants* section, we discover that the change from Glycine (G) to Glutamic acid (E) at position 338 along the protein sequence is associated with Colorblindness. +> > 2.
In the structure viewer, we can move the molecule around and hover the mouse on the structure to find the AA in position 338. It might take some time to follow through the multiple helical arrangements of this structure. The Glycine at 338 is not in a helix, but in what looks like a loop just before a low confidence area in the structure. +> > 3. Based on the information we collected so far, we could make a hypothesis of why this is disruptive. It is not in a helix (usually, in transmembrane proteins, helices are inserted in the membrane), hence, it is in one of the bigger domains that protrude from the membrane, inside or outside the cell. This mutation probably doesn't disrupt the structure in its intra-membrane segments, but rather one of the functional domains. If you want to dig deeper, you can check if this is the extra- or intra-cellular segment in the **Feature viewer**. +> > +> {: .solution} +{: .question} + +Where does the information in the structure viewer come from? + +> Search for Human OPN1LW on UniProtKB +> +> 1. Click on the download icon below the structure +> 2. Check the file that has been downloaded +> +{: .hands-on} + +This is a PDB (Protein Data Bank) file, allowing you to visualize and analyze the protein's arrangement of atoms and amino acids. + +However, there is no reference to the PDB database in the links among the *3D structure databases*. Instead, the first link refers to the AlphaFoldDB. The AlphaFold Database is a comprehensive resource that provides predicted 3D structures for a wide range of proteins. Using deep learning techniques and evolutionary information, AlphaFold accurately predicts the spatial arrangement of atoms within a protein, contributing to our understanding of protein function and interactions. + +Hence, this is a *prediction* of the structure, not an experimentally validated structure.
This is the reason why it is colored by confidence: the sections in blue are those with a high confidence value, so the ones for which the prediction is very reliable, while the ones in orange are less reliable or have a disordered (more flexible and mobile) structure. Nevertheless, this information is represented through a PDB file, because it is still structural. + +### Family and Domains + +The *Family and Domains* section on the UniProtKB entry page provides a comprehensive view of the evolutionary relationships and functional domains within a protein. This section offers insights into the protein's membership in protein families, superfamilies, and domains, shedding light on its structural and functional characteristics. + +The *Features* area indeed confirms that at least one of the two domains protruding from the membrane (the N-terminal one) is disordered. This area usually includes information about conserved regions, motifs, and important sequence features that contribute to the protein's role in various biological processes. The section confirms once again that we are looking at a transmembrane protein, and links to several resources of phylogenetic, protein families or domains data - guiding us in understanding how proteins share common ancestry, evolve, and acquire specialized functions. + +### Sequence + +All this information about the protein's evolution, function, and structure is ultimately encoded in its sequence. Once again, in this section we have the opportunity to download the FASTA file transcribing it, as well as to access the source of this data: the genomic sequencing experiments that assessed it. This section also reports when isoforms have been detected. + +> +> +> How many potential isoforms are mapped to this entry?
+> +> > +> > +> > 1: H0Y622 +> > +> {: .solution} +{: .question} + +### Similar proteins + +The last section of the UniProt Entry page reports similar proteins (this is basically the result of a clustering, with 100%, 90% and 50% identity thresholds). + +> +> +> 1. How many similar proteins at 100% identity? +> 2. How many similar proteins at 90% identity? +> 3. How many similar proteins at 50% identity? +> +> > +> > +> > 1. 0 +> > 2. 83 +> > 3. 397 +> > +> {: .solution} +{: .question} + +As you might have guessed while looking at this page, a lot of the processing of biological data about a protein actually consists of mapping different types of information along the sequence, and understanding how they influence each other. A visual mapping (and a table with the same information) is provided by the two alternative tabs to view this entry, that is the *Variant viewer* and the *Feature viewer*. + +## Variant viewer + +> Variant viewer +> +> 1. Click on the *Variant viewer* tab +> +{: .hands-on} + +The *Variant viewer* maps all the known alternative versions of this sequence. For some of them the effect (pathogenic or benign) is known, for others not. + +> +> +> How many variants are likely pathogenic? +> +> > +> > +> > By zooming out in the variant view, we see that we have 5 red points, so 5 variants that are likely pathogenic. +> > +> {: .solution} +{: .question} + +The high number of variants that you find in this section suggests that "protein sequences" (as well as gene sequences, protein structures etc.) are actually less fixed entities than we might think. + +## Feature viewer + +> Feature viewer +> +> 1. Click on the *Feature viewer* tab +> +{: .hands-on} + +The *Feature viewer* is basically a merged version of all the *Features* areas that we found in the *Entry* page, including *Domains & sites*, *Molecule processing*, *PTMs*, *Topology*, *Proteomics*, *Variants*.
If in the viewer you click on any feature, the corresponding region in the structure will be brought into focus, as we will see for our variant of interest. + +> Variant viewer +> +> 1. Expand the *Variants* part +> 2. Zoom out +> 3. Click on our variant of interest (the red point at position 338) +> +{: .hands-on} + +> +> +> What is the topology at this location? +> +> > +> > +> > A topological cytoplasmic domain +> > +> {: .solution} +{: .question} + +Finally, let's have a quick look at the other tabs. + +## Publications + +> Publication +> +> 1. Click on the *Publications* tab +> +{: .hands-on} + +The *Publications* tab lists scientific publications related to the protein. These are collected by merging a fully curated list in UniProtKB/Swiss-Prot and automatically imported ones. In this tab, you can filter the publications list by source and categories that are based on the type of data a publication contains about the protein (such as function, interaction, sequence, etc.), or by the number of proteins in the corresponding study it describes ("small scale" vs "large scale"). + +> +> +> 1. How many publications are associated with this protein? +> 2. How many publications contain information about its function? +> +> > +> > +> > 1. 57 +> > 2. 23 +> > +> {: .solution} +{: .question} + +## External links + +> External links +> +> 1. Click on the *External links* tab +> +{: .hands-on} + +The *External links* tab puts together all the references to external databases and resources of information that we found in each Entry page section. The link text often reports the unique identifiers that represent the same biological entity in other databases. To get a feeling of this complexity, check the following image (that is already partially outdated). + +![A graph representing how all the different databases are connected by unique IDs, where major nodes are DBs, and arrows connect them to the IDs (minor nodes) that they report.
The map is very crowded, especially around the UniProt Entry Name, Gene ID and Ensembl Gene ID](./images/complexDB.jpeg "Best illustration of the complex ID crossref: bioDBnet Network Diagram - [source](https://biodbnet-abcc.ncifcrf.gov/dbInfo/netGraph.php)") + +## History + +Finally, the *History* tab is also an interesting one. It reports and makes available for download all the previous versions of this entry's annotations, that is: all the "evolution" of its annotation, in this case dating back to 1988. + +> +> +> Was this entry ever not manually annotated? +> +> > +> > +> > To answer this question you can scroll back in time through the table and check the `Database` column. Was this ever in TrEMBL instead of Swiss-Prot? No, so this entry has been manually annotated since its beginning. +> > +> {: .solution} +{: .question}