diff --git a/_data/CONTRIBUTORS.yaml b/_data/CONTRIBUTORS.yaml index dc12cc2c..18542077 100644 --- a/_data/CONTRIBUTORS.yaml +++ b/_data/CONTRIBUTORS.yaml @@ -11,187 +11,199 @@ # image_url: absolute path to image (default image from github) Rudolf Wittner: - git: RudoWittner - email: rudolf.wittner@bbmri-eric.eu - orcid: 0000-0002-0003-2024 - role: editor - affiliation: BBMRI-ERIC + git: RudoWittner + email: rudolf.wittner@bbmri-eric.eu + orcid: 0000-0002-0003-2024 + role: editor + affiliation: BBMRI-ERIC Bert Droesbeke: - git: bedroesb - email: bedro@psb.ugent.be - orcid: 0000-0003-0522-5674 - role: editor - affiliation: VIB-UGent / ELIXIR-BE + git: bedroesb + email: bedro@psb.ugent.be + orcid: 0000-0003-0522-5674 + role: editor + affiliation: VIB-UGent / ELIXIR-BE Hedi Peterson: - git: hedi-ee - email: hedi.peterson@gmail.com - orcid: 0000-0001-9951-5116 - role: editor - affiliation: UTARTU / ELIXIR-EE + git: hedi-ee + email: hedi.peterson@gmail.com + orcid: 0000-0001-9951-5116 + role: editor + affiliation: UTARTU / ELIXIR-EE Stian Soiland-Reyes: - git: stain - email: soiland-reyes@manchester.ac.uk - orcid: 0000-0001-9842-9718 - role: editor - affiliation: The University of Manchester / ELIXIR-UK + git: stain + email: soiland-reyes@manchester.ac.uk + orcid: 0000-0001-9842-9718 + role: editor + affiliation: The University of Manchester / ELIXIR-UK Patricia Palagi: - git: ppalagi - email: patricia.palagi@sib.swiss - orcid: 0000-0001-9062-6303 - role: editor - affiliation: SIB Swiss Institute of Bioinformatics / ELIXIR-CH + git: ppalagi + email: patricia.palagi@sib.swiss + orcid: 0000-0001-9062-6303 + role: editor + affiliation: SIB Swiss Institute of Bioinformatics / ELIXIR-CH Liane Hughes: - git: LianeHughes - email: liane.hughes@scilifelab.uu.se - orcid: 0000-0002-4784-5436 - role: editor - affiliation: SciLifeLab / Uppsala University + git: LianeHughes + email: liane.hughes@scilifelab.uu.se + orcid: 0000-0002-4784-5436 + role: editor + affiliation: SciLifeLab / 
Uppsala University Eva Garcia Alvarez: - git: EvaGarciaAlvarez - email: eva.garcia-alvarez@bbmri-eric.eu - orcid: 0000-0002-3522-5088 - role: editor - affiliation: BBMRI-ERIC + git: EvaGarciaAlvarez + email: eva.garcia-alvarez@bbmri-eric.eu + orcid: 0000-0002-3522-5088 + role: editor + affiliation: BBMRI-ERIC Laura Portell Silva: - git: lauportell - email: laura.portell@bsc.es - orcid: 0000-0003-0390-3208 - role: editor - affiliation: Barcelona Supercomputing Center / ELIXIR-ES + git: lauportell + email: laura.portell@bsc.es + orcid: 0000-0003-0390-3208 + role: editor + affiliation: Barcelona Supercomputing Center / ELIXIR-ES Rafael Andrade Buono: - orcid: 0000-0002-6675-3836 - git: rabuono - affiliation: VIB-UGent / ELIXIR-BE - role: editor + orcid: 0000-0002-6675-3836 + git: rabuono + affiliation: VIB-UGent / ELIXIR-BE + role: editor Shona Cosgrove: - orcid: 0000-0003-3273-9013 - git: ShonaCosgrove - email: shona.cosgrove@sciensano.be - affiliation: Sciensano - role: editor + orcid: 0000-0003-3273-9013 + git: ShonaCosgrove + email: shona.cosgrove@sciensano.be + affiliation: Sciensano + role: editor Iris Van Dam: - orcid: 0000-0002-3965-6184 - git: IrisVanDam - email: iris.vandam@sciensano.be - affiliation: Sciensano - role: editor + orcid: 0000-0002-3965-6184 + git: IrisVanDam + email: iris.vandam@sciensano.be + affiliation: Sciensano + role: editor Katharina Lauer: - orcid: 0000-0002-4347-7525 - git: klauer2207 - email: katharina@airfinity.com - affiliation: Airfinity Ltd. - role: editor + orcid: 0000-0002-4347-7525 + git: klauer2207 + email: katharina@airfinity.com + affiliation: Airfinity Ltd. 
+ role: editor Romain David: - orcid: 0000-0003-4073-7456 - git: RomainDavid24 - email: romain.david@erinha.eu - affiliation: ERINHA - role: editor + orcid: 0000-0003-4073-7456 + git: RomainDavid24 + email: romain.david@erinha.eu + affiliation: ERINHA + role: editor Simone Leo: - orcid: 0000-0001-8271-5429 - git: simleo - affiliation: CRS4, IT + orcid: 0000-0001-8271-5429 + git: simleo + affiliation: CRS4, IT Martin Cook: - git: martin-nc - email: martin.cook@elixir-europe.org - role: editor - affiliation: ELIXIR Hub + git: martin-nc + email: martin.cook@elixir-europe.org + role: editor + affiliation: ELIXIR Hub Erin Calhoun: - orcid: 0009-0003-3752-7156 - git: erincalhoun - email: erin.calhoun@uit.no - role: - affiliation: ELIXIR Norway + orcid: 0009-0003-3752-7156 + git: erincalhoun + email: erin.calhoun@uit.no + role: + affiliation: ELIXIR Norway Nazeefa Fatima: - orcid: 0000-0001-7791-4984 - git: Nazeeefa - email: nazeefaf@uio.no - role: - affiliation: ELIXIR Norway + orcid: 0000-0001-7791-4984 + git: Nazeeefa + email: nazeefaf@uio.no + role: + affiliation: ELIXIR Norway Aina Jené Cortada: - git: ainajene - email: aina.jene@crg.eu - orcid: 0000-0001-7721-7097 - affiliation: European Genome-phenome Archive (EGA) / CRG + git: ainajene + email: aina.jene@crg.eu + orcid: 0000-0001-7721-7097 + affiliation: European Genome-phenome Archive (EGA) / CRG Marcos Casado Barbero: - git: M-casado - email: mcasado@ebi.ac.uk - orcid: 0000-0002-7747-6256 - affiliation: European Genome-phenome Archive (EGA) / EMBL-EBI + git: M-casado + email: mcasado@ebi.ac.uk + orcid: 0000-0002-7747-6256 + affiliation: European Genome-phenome Archive (EGA) / EMBL-EBI Arnau Soler Costa: - git: solcos - email: arnau.soler@crg.eu - orcid: 0000-0002-2853-6742 - affiliation: European Genome-phenome Archive (EGA) / CRG + git: solcos + email: arnau.soler@crg.eu + orcid: 0000-0002-2853-6742 + affiliation: European Genome-phenome Archive (EGA) / CRG Ruben Brondeel: - orcid: 0000-0002-9876-4150 - git: 
RubenBrondeel - affiliation: Sciensano + orcid: 0000-0002-9876-4150 + git: RubenBrondeel + affiliation: Sciensano Koen Blot: - orcid: 0000-0002-0847-0133 - affiliation: Sciensano + orcid: 0000-0002-0847-0133 + affiliation: Sciensano Nina Van Goethem: - orcid: 0000-0001-7316-6990 - affiliation: Sciensano + orcid: 0000-0001-7316-6990 + affiliation: Sciensano Miriam Saso: - orcid: 0000-0002-1888-1913 - affiliation: Sciensano + orcid: 0000-0002-1888-1913 + affiliation: Sciensano Wolfgang Maier: - git: wm75 - email: maierw@informatik.uni-freiburg.de - orcid: 0000-0002-9464-6640 - affiliation: University of Freiburg / ELIXIR-DE -Anna Asklöf: - git: annaasklof - affiliation: SciLifeLab / Uppsala University + git: wm75 + email: maierw@informatik.uni-freiburg.de + orcid: 0000-0002-9464-6640 + affiliation: University of Freiburg / ELIXIR-DE +Anna Asklöf: + git: annaasklof + affiliation: SciLifeLab / Uppsala University Katarina Öjefors Stark: - orcid: 0000-0001-7970-7778 - git: KatarinaOjeforsStark - affiliation: SciLifeLab / Uppsala University + orcid: 0000-0001-7970-7778 + git: KatarinaOjeforsStark + affiliation: SciLifeLab / Uppsala University Diana Pilvar: - git: diana-pilvar - email: diana.pilvar@ut.ee - orcid: 0000-0002-5788-2687 - affiliation: University of Tartu / ELIXIR-Estonia + git: diana-pilvar + email: diana.pilvar@ut.ee + orcid: 0000-0002-5788-2687 + affiliation: University of Tartu / ELIXIR-Estonia Robin Navest: - orcid: 0000-0002-0152-2092 - git: rnavest - affiliation: Lygature + orcid: 0000-0002-0152-2092 + git: rnavest + affiliation: Lygature Julia Lischke: - orcid: 0000-0002-5524-2838 - git: - affiliation: Lygature + orcid: 0000-0002-5524-2838 + git: + affiliation: Lygature Jan-Willem Boiten: - orcid: 0000-0003-0327-638X - git: + orcid: 0000-0003-0327-638X + git: Jeroen Belien: - orcid: 0000-0002-7160-5942 - git: jambelien - affiliation: Health-RI + orcid: 0000-0002-7160-5942 + git: jambelien + affiliation: Health-RI Clara Amid: - orcid: 0000-0001-6534-7425 
- git: - affiliation: Erasmus Medical Center + orcid: 0000-0001-6534-7425 + git: + affiliation: Erasmus Medical Center Angelica Maineri: - orcid: 0000-0002-6978-5278 - git: AngelicaMaineri - affiliation: ODISSEI + orcid: 0000-0002-6978-5278 + git: AngelicaMaineri + affiliation: ODISSEI Ricarda Braukmann: - orcid: 0000-0001-6383-7148 - git: Ruecue - affiliation: DANS + orcid: 0000-0001-6383-7148 + git: Ruecue + affiliation: DANS Simon Saldner: - git: ssaldner - email: simon.saldner@dans.knaw.nl - affiliation: DANS-KNAW - orcid: 0000-0002-1145-7829 + git: ssaldner + email: simon.saldner@dans.knaw.nl + affiliation: DANS-KNAW + orcid: 0000-0002-1145-7829 Dimitra Kondyli: - email: dkondyli@ekke.gr - affiliation: EKKE + email: dkondyli@ekke.gr + affiliation: EKKE Markus Tuominen: - email: markus.tuominen@tuni.fi - affiliation: CESSDA ERIC / TAU-FSD + email: markus.tuominen@tuni.fi + affiliation: CESSDA ERIC / TAU-FSD Vasso Kalaitzi: - email: vasso.kalaitzi@dans.knaw.nl - affiliation: DANS-KNAW - orcid: 0000-0001-8337-120X + email: vasso.kalaitzi@dans.knaw.nl + affiliation: DANS-KNAW + orcid: 0000-0001-8337-120X +Wolmar Nyberg Åkerström: + email: vasso.kalaitzi@dans.knaw.nl + affiliation: DANS-KNAW + orcid: 0000-0001-8337-120X +Zahra Waheed: + email: vasso.kalaitzi@dans.knaw.nl + affiliation: DANS-KNAW + orcid: 0000-0001-8337-120X +Flora D'Anna: + email: vasso.kalaitzi@dans.knaw.nl + affiliation: DANS-KNAW + orcid: 0000-0001-8337-120X diff --git a/_data/tool_and_resource_list.yml b/_data/tool_and_resource_list.yml index 51a0ef12..e168f153 100644 --- a/_data/tool_and_resource_list.yml +++ b/_data/tool_and_resource_list.yml @@ -204,8 +204,9 @@ id: dragen-gatk name: Dragen-GATK url: https://gatk.broadinstitute.org/hc/en-us/articles/360045944831 -- description: 'Dryad is an open-source, community-led data curation, publishing, and preservation platform for CC0 publicly available research data. 
Dryad has a long-term data preservation strategy, and is a Core Trust Seal Certified Merritt repository with storage in US and EU at the San Diego Supercomputing Center, DANS, and Zenodo. While data is undergoing peer review, it is embargoed if the related journal requires / allows this. Dryad is an independent non-profit that works directly with: researchers to publish datasets utilising best practices for discovery and reuse; publishers to support the integration of data availability statements and data citations into their workflows; and institutions to enable scalable campus support for research data management best practices at low cost. Costs are covered by institutional, publisher, and funder members, otherwise a one-time fee of $120 for authors to cover cost of curation and preservation. Dryad also receives direct funder support through - grants.' +- description: + "Dryad is an open-source, community-led data curation, publishing, and preservation platform for CC0 publicly available research data. Dryad has a long-term data preservation strategy, and is a Core Trust Seal Certified Merritt repository with storage in US and EU at the San Diego Supercomputing Center, DANS, and Zenodo. While data is undergoing peer review, it is embargoed if the related journal requires / allows this. Dryad is an independent non-profit that works directly with: researchers to publish datasets utilising best practices for discovery and reuse; publishers to support the integration of data availability statements and data citations into their workflows; and institutions to enable scalable campus support for research data management best practices at low cost. Costs are covered by institutional, publisher, and funder members, otherwise a one-time fee of $120 for authors to cover cost of curation and preservation. Dryad also receives direct funder support through + grants." 
id: dryad name: Dryad registry: @@ -232,7 +233,7 @@ fairsharing: mya1ff tess: European Genome-phenome Archive (EGA) url: https://ega-archive.org/ -- description: 'The European Language Social Science Thesaurus (ELSST) is a broad-based, multilingual thesaurus for the social sciences. It is owned and published by the Consortium of European Social Science Data Archives (CESSDA) and its national Service Providers. The thesaurus consists of over 3,300 concepts and covers the core social science disciplines: politics, sociology, economics, education, law, crime, demography, health, employment, information, communication technology, and environmental science. ELSST is used for data discovery within CESSDA and facilitates access to data resources across Europe, independent of domain, resource, language, or vocabulary. ELSST is currently available in 16 languages: Danish, Dutch, Czech, English, Finnish, French, German, Greek, Hungarian, Icelandic, Lithuanian, Norwegian, Romanian, Slovenian, Spanish, and Swedish' +- description: "The European Language Social Science Thesaurus (ELSST) is a broad-based, multilingual thesaurus for the social sciences. It is owned and published by the Consortium of European Social Science Data Archives (CESSDA) and its national Service Providers. The thesaurus consists of over 3,300 concepts and covers the core social science disciplines: politics, sociology, economics, education, law, crime, demography, health, employment, information, communication technology, and environmental science. ELSST is used for data discovery within CESSDA and facilitates access to data resources across Europe, independent of domain, resource, language, or vocabulary. 
ELSST is currently available in 16 languages: Danish, Dutch, Czech, English, Finnish, French, German, Greek, Hungarian, Icelandic, Lithuanian, Norwegian, Romanian, Slovenian, Spanish, and Swedish" id: european-language-social-science-thesaurus name: European Language Social Science Thesaurus (ELSST) registry: @@ -360,7 +361,7 @@ id: gitlab name: GitLab registry: - fairsharing: '530e61' + fairsharing: '530e61' tess: GitLab url: https://about.gitlab.com/ - description: GO is to perform enrichment analysis on gene sets. @@ -729,3 +730,9 @@ id: sars-cov2-data-hubs name: SARS-CoV-2 Data Hubs url: https://www.covid19dataportal.org/data-hubs +- description: A web-based platform for sharing viral sequence data, initially for influenza data, and now for other pathogens (including SARS-CoV-2). + name: Global Initiative on Sharing All Influenza Data (GISAID) + registry: + fairsharing: 2f7f9f + id: gisaid + url: https://gisaid.org diff --git a/pathogen-characterisation/data-description.md b/pathogen-characterisation/data-description.md index 8752b3fd..0b6f0290 100644 --- a/pathogen-characterisation/data-description.md +++ b/pathogen-characterisation/data-description.md @@ -1,13 +1,24 @@ --- title: Data description description: Finding (meta)data standards and documentation -contributors: [] -no_robots: true +contributors: + [ + Wolmar Nyberg Åkerström, + Zahra Waheed, + Liane Hughes, + Flora D'Anna, + Patricia Palagi, + Wolfgang Maier, + Diana Pilvar, + Rafael Andrade Buono, + ] page_id: pc_data_description rdmkit: - - name: - url: -related_pages: + - name: "Your tasks: Documentation and metadata" + url: https://rdmkit.elixir-europe.org/metadata_management + - name: "Human Pathogen Genomics" + url: https://rdmkit.elixir-europe.org/human_pathogen_genomics +related_pages: showcase: [] human_biomolecular_data: [] human_clinical_and_health_data: [] @@ -20,10 +31,117 @@ training: # More information on how to fill in this metadata section can be found here 
https://www.infectious-diseases-toolkit.org/contribute/page-metadata --- -**We are still working on the content for this page.** If you are interested in adding to the page, then: +## Introduction + +Ensuring that data is adequately described is crucial to enable others to be able to reuse it or reproduce work. For general information on the importance of data description, please see [the documentation and metadata section of RDMkit](https://rdmkit.elixir-europe.org/metadata_management). + +## Good practices for describing data in general + +You should always plan your study with [data sharing](https://rdmkit.elixir-europe.org/data_publication) in mind. Identifying the information necessary for sharing your data effectively will allow you to generate documentation that is both well structured and complete. Describing the data entails extracting the information, recording it according to existing best practice guidelines and standards, and extending descriptions where needed. + +### Considerations + +When defining a general data description strategy, it is important to consult relevant guidelines and adopt accepted standards. + +- **Include enough detail to inform assessment by your peers and the wider research community.** + The way that you describe and document your data will vary depending on the type of study. The descriptions should include all of the information necessary for others to be able to understand and process the data that you share. They should also include sufficient information to enable an informed assessment of the reliability of the data and the competence with which they were generated. The descriptions are usually organised following research community guidelines and then structured as metadata to make the data FAIR. +- **Review the submission guidelines of the repositories that you intend to submit data into.** Different repositories often require different levels/types of metadata, and restrict access to the data to different extents. 
This is largely linked to the fact that different repositories have different aims/audiences, and will therefore enable users to search different parts of the metadata. It is important to check the requested data formats and the checklists specified by the repositories. We encourage you to go beyond just the minimum data required by the repository, as this will likely increase the reusability of your data in different contexts. Contact the repositories via their helpdesks if you have any questions about whether you are able to include different types of metadata. +- **Adopt shared practices for annotating experiments.** Some research communities (e.g. ECDC) have developed guidelines detailing shared/common practices for describing experiments and how to embed descriptions in the data. By adopting a shared standard, your data are likely to be more accessible and understandable to potential users. At a minimum, it is generally recommended that you describe the design of the study/program, collected specimens, sample preparation steps, experimental protocols, and workflow. +- **Follow the recommendations from national and international authorities** including e.g. public health authorities, epidemic surveillance programs, and research data communities. +- **Share/reference information about cohorts, samples, instruments, protocols, materials and methods.** It is important to retain information about which protocols (including which version) were used at each step of your experiments, as well as which samples were prepared and processed together. This can include, for example, information about the model number of the instruments used, the biobank samples used, and the suppliers of kits. This can allow users to identify potential issues and artefacts and even generate new data. 
+- **Share/reference data assets, software versions & computational workflows.** You should include links/references to source data used, and describe analytic workflows (with runtime quality metrics). Other information could include bioinformatics protocols used, and the versions of software and computational workflows used. This is crucial for understanding exactly what was done as well as identifying potential areas for improvement. +- **Protect the privacy of human research subjects and patients.** Whilst providing complete data descriptions is generally advisable, some data will need to be anonymised before it can be shared. You should describe how and why sensitive data were masked or removed. + +### Existing approaches + +- Refer to the {% tool "fairsharing" %} registry for standards related to repositories. For example, [the BY-COVID Data Resources FAIR Sharing collection](https://fairsharing.org/3773) includes references to implemented standards. +- Refer to [ECDC’s Infectious disease topics](https://www.ecdc.europa.eu/en/infectious-disease-topics) for active surveillance initiatives and shared reporting standards related to the disease associated with the pathogen under study. Examples of COVID-19 materials include: + - [ECDC’s Methods for the detection and identification of SARS-CoV-2 variants](https://www.ecdc.europa.eu/en/publications-data/methods-detection-and-characterisation-sars-cov-2-variants-second-update) + - [ECDC’s Surveillance and study protocols](https://www.ecdc.europa.eu/en/covid-19/surveillance/study-protocols) + - [ECDC’s TESSy reporting protocol for COVID-19](https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-Reporting-Protocol-v5.1.pdf) +- Refer to [WHO's publications](https://www.who.int/publications) to learn more about standards, tracking, and reporting for pathogens affecting global health. 
For example: + - [WHO’s Genomic sequencing of SARS-CoV-2: a guide to implementation for maximum impact on public health, 8 January 2021](https://apps.who.int/iris/handle/10665/338480), notably chapter 3 on project planning/study design and chapter 4 on data sharing guidance. + - [WHO’s Guidance for surveillance of SARS-CoV-2 variants: Interim guidance, 9 August 2021](https://www.who.int/publications/i/item/WHO_2019-nCoV_surveillance_variants) +- [The PHA4GE SARS-CoV-2 contextual data specification package](https://doi.org/10.1093%2Fgigascience%2Fgiac003) has examples of how to design data collection templates as well as practices for effective data sharing. +- Refer to guidelines of international consortia and focus groups. For example, the RDA COVID-19 Working Group's [Recommendations and Guidelines on data sharing](https://doi.org/10.15497/rda00052). Released in 2020, it includes information on adopting shared practices. +- Outline the data sharing platforms, data description standards, and checklists, in your [data management plan](https://rdmkit.elixir-europe.org/data_management_plan#what-should-you-write-in-a-dmp). + +## Biological samples + +### Considerations + +Many different types of material can be sampled to monitor and identify pathogens. + +- **Describing environmental samples.** It is important to describe and provide metadata about your samples, e.g. the geolocalisation, the time zone in which the samples were taken, and the meteorological conditions at the time of collection. +- **Describing samples related to human hosts.** + For biological samples from human hosts, consider that national authorities adopt and customise international recommendations. +- **Describing samples that do not originate from natural environments or hosts**. This includes cultured/lab grown samples. +- **Describing pooled samples** + +### Existing approaches + +- Descriptions of biological samples (metadata) must be provided alongside the molecular data. 
Therefore, repositories such as {% tool "european-nucleotide-archive" %} and {% tool "gisaid" %} have specific requirements for the description of biological samples. + - Which sample metadata should be submitted to repositories, regardless of pathogenic organism, is often debated. However, important metadata for a pathogen context is listed below: + - Taxon id / scientific name of sample. + - Geographic location. + - Collection date. + - Host scientific name. + - Host health state. + - Collector name. + - Collecting institution. + + + +- International and community efforts have been made to harmonise or provide unified checklists. For example, [there is a list of issues in mapping between ENA/GISAID.](https://docs.google.com/spreadsheets/d/1gNpdZKOUKPemMUHR107JRSeaWjPczlMIUp5fP5-kR9g/edit#gid=0) + +- Check WHO's guidance on metadata standards. See, for example, [their guidance for COVID-19](https://apps.who.int/iris/handle/10665/338480), notably chapter 6, table 2. + + + +## Genome data of viral pathogens + +The screening and genome reconstruction of viruses is crucial in understanding the pathogenesis of a pathogen, and in enabling the sources of outbreaks to be traced rapidly. + +### Considerations + +Whilst it is inherently important to follow general best practices for genome data management, there are also specific considerations for this type of data: + +- **Aim for someone else to be able to replicate the experiments.** + You should adopt common practices to enable others to compile and use data generated across different research projects and public health initiatives. This includes being able to add more samples to extend the scope of a study. +- **Describe the design of the study/program and experimental variables.** + This includes, for example, the purpose of the study, the variables and observations required to reach its goals, and how the genome data fits into this context. 
+- **Describe how the samples were prepared for sequencing.** + This includes describing how and where specimens were collected and the process used to create the samples containing the genomic material of interest. You should also reference protocols, reagent kits, instruments, and note any key observations and choices made. +- **Describe how sequencing was performed and configured.** + This includes, for example, describing which sequencing technology was used and the corresponding platform, instrument models, and the library preparation protocols selected for the experiment. You should note any key observations and choices, such as the assessment of samples, multiplexing/demultiplexing approach, software settings, and other factors that strongly influence the results. +- **Describe how the data was exported and processed.** + This includes describing which file formats were used, what each file contains, how the different files are related to each other, and if/how some information was masked or discarded. +- **Consider the potential differences in policies between the data repositories to which you will submit**. For example, there are differences between {% tool "gisaid" %} and {% tool "european-nucleotide-archive" %}, with {% tool "gisaid" %} having a more restrictive licence than {% tool "european-nucleotide-archive" %}. + +### Existing approaches + +- State the objective of your study. Examples of study objectives include: 'To determine the genome of an emerging pathogen', and 'phylogenetic analysis for understanding of how pathogens evolve'. +- State the type(s) of data that you have. This could include: reads, sequences (assembly, variants, consensus), alignments, and annotations. 
+- There is guidance on which metadata should be included for submissions to repositories (for example, {% tool "european-nucleotide-archive" %} provides both [general guidelines](https://ena-docs.readthedocs.io/en/latest/submit/general-guide.html) and [guidelines specific for pathogens](https://ena-docs.readthedocs.io/en/latest/faq/pathogen-subs-guide.html)). Refer to these guidelines to explore which metadata can be included with your submission. +- In the guidelines associated with submissions to repositories like {% tool "european-nucleotide-archive" %}, some metadata fields will be listed as mandatory, meaning that they must be included with submissions. However, in order to ensure consistency and reproducibility, it is important to include other pieces of metadata that may not be considered mandatory. Below is a list of metadata that you should include with submissions (please refer to [guidance from ENA](https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html) to clarify the terms used below): -[Feel free to contribute](/contribute/){: class="btn btn-primary btn-lg rounded-pill"} + - Collection protocol used. + - Information about genome preparation; RNA/DNA extraction protocol, amplification protocols, contaminant, and which samples were prepared together. + - Information about library preparation. + - Configuration of sequencing instrument. + - Information about pooled/multiplexed runs, including the instruments used, configurations used, whether processing was automated/manual, and demultiplexing. + - A description of how the instrument data was converted to standardised formats. + - The algorithms used in normalisation/alignment protocols. You should refer to [general best practice for sequencing projects](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008260). -This is a community-driven website, so contributions are welcome! You will, of course, be listed as a contributor on the page. 
+- Data files with reads produced by sequencing instruments often contain fragments of the host organism’s DNA. When the host is a human research subject or patient, these fragments must be masked or removed from the data files before they can be submitted. -New content is announced on the [home page](/) and [news page](/about/news), so please check for updates there. You can also watch for changes on this page by using a free service like [Visual Ping](https://visualping.io/) or [Distill Web Monitor](https://distill.io/), or by using a [browser add-on](https://chrome.google.com/webstore/detail/distill-web-monitor/inlikjemeeknofckkjolnjbpehgadgge?hl=en). +