fix: add codespell config and fix typos with it #3277

Merged (2 commits) on Nov 23, 2024
4 changes: 4 additions & 0 deletions .codespellrc
@@ -0,0 +1,4 @@
[codespell]
skip = */node_modules,*.jsonl,*.tsv,*.json,*/dist_keycloak,*/playwright-report,*.properties,*/dist,./backend/build,yarn.lock,i18n.ts
exclude-file = .config/codespell/exclude-file
ignore-words = .config/codespell/ignore-words
1 change: 1 addition & 0 deletions .config/codespell/exclude-file
@@ -0,0 +1 @@
import com.fasterxml.jackson.databind.ser.std.StdSerializer
21 changes: 21 additions & 0 deletions .config/codespell/ignore-words
@@ -0,0 +1,21 @@
# not clear mistakes
unsecure
re-use

# common false positives
Nam

## abbreviations
ser

## Camel case variable
afterAll
assertIn

## Slang
AAs
nd

## Library name
redundent

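Taken together, the three files above configure codespell: `.codespellrc` points at the skip patterns, the exclude file (whole lines to ignore verbatim), and the ignore-words list. A hedged sketch of how a helper script might invoke it, assuming codespell is installed (e.g. `pip install codespell`); running plain `codespell` from the repository root also works, since the tool picks up `.codespellrc` on its own:

```python
import subprocess

# Minimal sketch: run codespell from the repository root. The explicit flags
# below simply mirror the config keys shown above and are included for
# illustration; codespell discovers .codespellrc automatically.
result = subprocess.run(
    [
        "codespell",
        "--ignore-words", ".config/codespell/ignore-words",
        "--exclude-file", ".config/codespell/exclude-file",
        ".",
    ],
    capture_output=True,
    text=True,
    check=False,  # codespell exits non-zero when it finds misspellings
)
print(result.stdout or "no misspellings found")
```
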
2 changes: 1 addition & 1 deletion architecture_docs/03_context_and_scope.md
@@ -8,4 +8,4 @@ All external participants are listed in the diagram below:
We provide instructions how to install Loculus, and we host instances ourselves,
but other maintainers can host their own instances as well.
We offer guidance and documentation and are open for feature requests, but we do not provide direct support for custom instances.
-We cannot forsee all possible configurations and environments that Loculus might be deployed in.
+We cannot foresee all possible configurations and environments that Loculus might be deployed in.
2 changes: 1 addition & 1 deletion backend/README.md
@@ -69,7 +69,7 @@ You need to set:

We use Flyway, so that the service can provision an empty/existing DB without any manual steps in between. On startup scripts in `src/main/resources/db/migration` are executed in order, i.e. `V1__*.sql` before `V2__*.sql` if they didn't run before, so that the DB is always up-to-date. (For more info on the naming convention, see [this](https://www.red-gate.com/blog/database-devops/flyway-naming-patterns-matter) blog post.)

-Note: When using a postgresSQL development platform (e.g. pgAdmin) the hostname is 127.0.0.1 and not localhost - this is defined in the `deploy.py` file.
+Note: When using a PostgreSQL development platform (e.g. pgAdmin) the hostname is 127.0.0.1 and not localhost - this is defined in the `deploy.py` file.

Note that we also use flyway in the ena-submission pod to create an additional schema in the database, ena-submission. This schema is not added here.

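On the Flyway note above: versioned migrations are applied in ascending numeric version order rather than lexicographic filename order, so `V2__*.sql` runs before `V10__*.sql`. A rough illustration of that ordering rule (made-up filenames, not this repository's actual migrations):

```python
import re

# Illustrative only: emulate how Flyway orders versioned migrations by
# comparing numeric version components instead of raw filename strings.
migrations = ["V10__add_index.sql", "V1__create_tables.sql", "V2__add_column.sql"]

def flyway_version(name: str) -> tuple[int, ...]:
    # "V2__add_column.sql" -> (2,), "V2_1__fixup.sql" -> (2, 1)
    version = name.split("__", 1)[0].lstrip("V")
    return tuple(int(part) for part in re.split(r"[._]", version) if part)

print(sorted(migrations, key=flyway_version))
# ['V1__create_tables.sql', 'V2__add_column.sql', 'V10__add_index.sql']
```
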
@@ -77,9 +77,9 @@ class CrossRefService(final val properties: CrossRefServiceProperties) {
)

"head" {
-// The doi_batch_id gets ignored and the actual one is assigned after the equest is processed through
+// The doi_batch_id gets ignored and the actual one is assigned after the request is processed through
// CrossRef's queue. Because of this, presumably, the doi_batch_id is not sent back when a request to
-// the service is successful. For this, one would have to query the equest queue and retrieve it from there
+// the service is successful. For this, one would have to query the request queue and retrieve it from there
"doi_batch_id" { -doiBatchID }
"timestamp" { -date.atStartOfDay(ZoneId.of("UTC")).toInstant().toEpochMilli().toString() }
"depositor" {
@@ -167,7 +167,7 @@ class CrossRefService(final val properties: CrossRefServiceProperties) {
formData.forEach { (key, value) ->
if (value is String) {
// Both carriage return and new line characters have to be sent ("\r\n"),
-// otherwise the request witll cause a 500 error on CrossRef's end
+// otherwise the request will cause a 500 error on CrossRef's end
printWriter.append("--$boundary").append("\r\n")
printWriter.append(
"Content-Disposition: form-data; name=\"${URLEncoder.encode(key, "UTF-8")}\"",
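
The carriage-return comment above is worth unpacking: multipart/form-data delimits parts with boundary lines and expects CRLF (`\r\n`) line endings throughout, and some servers reject bodies that use bare `\n`. A hedged Python sketch of the same idea (placeholder field names, not the actual CrossRef deposit fields):

```python
from urllib.parse import quote

# Illustrative only: build a multipart/form-data body by hand, joining every
# line with "\r\n" as the comment in the Kotlin code above requires.
def build_multipart_body(form_data: dict[str, str], boundary: str) -> str:
    lines: list[str] = []
    for key, value in form_data.items():
        lines.append(f"--{boundary}")
        lines.append(f'Content-Disposition: form-data; name="{quote(key)}"')
        lines.append("")  # blank line separates the part headers from the part body
        lines.append(value)
    lines.append(f"--{boundary}--")
    return "\r\n".join(lines) + "\r\n"

body = build_multipart_body({"exampleField": "example value"}, boundary="boundary123")
assert "\n" not in body.replace("\r\n", "")  # every newline is part of a CRLF
```
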
2 changes: 1 addition & 1 deletion docs/src/content/docs/introduction/system-overview.md
@@ -14,7 +14,7 @@ Loculus has a modular architecture and consists of several sub-services:
- **SILO(s):** [SILO](https://github.com/GenSpectrum/LAPIS-SILO) is an open-source query engine for genetic sequences optimized for high performance and supporting alignment-specific queries such as mutation searches. It regularly pulls data from the backend server and indexes them. By default, SILO is not exposed to the users but accessed via LAPIS. For each [organism](../glossary#organism) of a Loculus [instance](../glossary#instance), there is a separate instance of SILO.
- **LAPIS(es):** [LAPIS](https://github.com/GenSpectrum/LAPIS) provides a convenient interface to SILO, offering a lightweight web API and additional data and compression formats. For each SILO instance, there is a corresponding LAPIS instance.
- **Website:** The frontend application of Loculus accesses the APIs of the backend server and LAPIS. It uses the backend server for everything related to data submission and LAPIS for searching and downloading released data. For logins and registrations, users are redirected to Keycloak.
-- **Preprocessing pipeline(s):** A preprocessing pipeline fetches [unprocessed/user-submitted data](../glossary#unprocessed-data) from the backend server, processes them (which usually includes cleaning, alignment and adding annotations), and sends [processed data](../glossary#processed-data) back to the backend server. The pipeline contains [organism](../glossary#organism)-specific logic, thus, there is a separate pipeline for each organism. We maintain a customizeable preprocessing pipeline that uses [Nextclade](https://github.com/nextstrain/nextclade) for alignment, quality checks and annotations but it is easy to write a new one by following the [preprocessing pipeline specifications](https://github.com/loculus-project/loculus/blob/main/preprocessing/specification.md).
+- **Preprocessing pipeline(s):** A preprocessing pipeline fetches [unprocessed/user-submitted data](../glossary#unprocessed-data) from the backend server, processes them (which usually includes cleaning, alignment and adding annotations), and sends [processed data](../glossary#processed-data) back to the backend server. The pipeline contains [organism](../glossary#organism)-specific logic, thus, there is a separate pipeline for each organism. We maintain a customizable preprocessing pipeline that uses [Nextclade](https://github.com/nextstrain/nextclade) for alignment, quality checks and annotations but it is easy to write a new one by following the [preprocessing pipeline specifications](https://github.com/loculus-project/loculus/blob/main/preprocessing/specification.md).

![Architecture overview](./architectureOverview.svg)

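To make the SILO/LAPIS split concrete: the website and other clients query LAPIS, which forwards requests to SILO. A rough sketch of such a query (the base URL and organism path are hypothetical, and the endpoint and parameter names are assumptions drawn from the LAPIS documentation rather than this instance's configuration):

```python
import requests

# Hypothetical LAPIS base URL for one organism of a Loculus instance.
LAPIS_BASE = "https://lapis.example-loculus.org/ebola-zaire"

# Aggregated query: count released sequences grouped by country.
response = requests.get(
    f"{LAPIS_BASE}/sample/aggregated",
    params={"fields": "country"},
    timeout=10,
)
response.raise_for_status()
for row in response.json()["data"]:
    print(row)
```
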
6 changes: 3 additions & 3 deletions ena-submission/ENA_submission.md
@@ -51,7 +51,7 @@ The following could be implement as post-MVP features:

## Submission process

-## 1. Registering a study programatically
+## 1. Registering a study programmatically

1. Create the study XML ([schema](https://ftp.ebi.ac.uk/pub/databases/ena/doc/xsd/sra_1_5/ENA.project.xsd)):

@@ -123,7 +123,7 @@ The following could be implement as post-MVP features:

This is where one gets the accession from.

-## 2. Registering a sample programatically
+## 2. Registering a sample programmatically

[Docs](https://ena-docs.readthedocs.io/en/latest/submit/samples.html)

@@ -347,7 +347,7 @@ Instead you should use the GCA accession. These are distributed by NCBI (this is
### What would the end-to-end flow of submitting sequences for pathoplexus look like?
-1. [Register study programatically](https://ena-docs.readthedocs.io/en/latest/submit/study/programmatic.html)
+1. [Register study programmatically](https://ena-docs.readthedocs.io/en/latest/submit/study/programmatic.html)
2. Upload sequences using what route? Which files are needed?
### What information do we give to the original submitter?
4 changes: 2 additions & 2 deletions ena-submission/src/ena_deposition/ena_submission_helper.py
@@ -267,7 +267,7 @@ def post_webin(config: ENAConfig, xml: dict[str, Any]) -> requests.Response:
config.ena_submission_url,
auth=HTTPBasicAuth(config.ena_submission_username, config.ena_submission_password),
files=xml,
-timeout=10, # wait a full 10 seconds for a response incase slow
+timeout=10, # wait a full 10 seconds for a response in case slow
)


@@ -551,7 +551,7 @@ def get_ena_analysis_process(
response = requests.get(
url,
auth=HTTPBasicAuth(config.ena_submission_username, config.ena_submission_password),
-timeout=10, # wait a full 10 seconds for a response incase slow
+timeout=10, # wait a full 10 seconds for a response in case slow
)
except requests.exceptions.RequestException as e:
error_message = f"Request failed with exception: {e}."
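
One nuance of the `timeout=10` argument in the hunks above: in `requests`, a single number is applied to the connect phase and the read phase separately rather than capping total request time; a `(connect, read)` tuple sets the two independently. A brief sketch:

```python
import requests

# A single value bounds connect and read separately; a tuple sets them
# independently. Neither limits the total wall-clock time of the request.
resp_single = requests.get("https://www.ebi.ac.uk", timeout=10)
resp_tuple = requests.get("https://www.ebi.ac.uk", timeout=(3.05, 10))
print(resp_single.status_code, resp_tuple.status_code)
```
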
2 changes: 1 addition & 1 deletion ingest/README.md
@@ -16,7 +16,7 @@ snakemake --dag | dot -Tpng > static/dag.png

### Download data from NCBI virus

-Using NCBI `datasets` CLI, download all sequences and corresponding NCBI curated metadata for a configurable taxon. The taxon is specified using the NCBI Taxonomy ID, and includes all child taxa, i.e. dowloading sequences for the Ebola virus taxon ID includes all sequences for more specific Ebola virus (sub)species taxon ids.
+Using NCBI `datasets` CLI, download all sequences and corresponding NCBI curated metadata for a configurable taxon. The taxon is specified using the NCBI Taxonomy ID, and includes all child taxa, i.e. downloading sequences for the Ebola virus taxon ID includes all sequences for more specific Ebola virus (sub)species taxon ids.

Sequences and metadata are transformed into (nd)json files to simplify (de)serialization and further processing.

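For readers unfamiliar with the NCBI `datasets` CLI referenced above, a sketch of the kind of call involved follows; the exact subcommand and flags are assumptions based on NCBI's documentation, not a quote of this workflow's Snakemake rules:

```python
import subprocess

# Placeholder taxon ID; substitute the NCBI Taxonomy ID of the target organism.
taxon_id = "186536"

# Assumed invocation: download all virus sequences plus NCBI-curated metadata
# for the taxon, which implicitly includes all child taxa.
subprocess.run(
    [
        "datasets", "download", "virus", "genome", "taxon", taxon_id,
        "--filename", "ncbi_dataset.zip",
    ],
    check=True,
)
```
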
2 changes: 1 addition & 1 deletion keycloak/keycloakify/vite.config.ts
@@ -42,7 +42,7 @@ export default defineConfig({
/*
* Uncomment this if you want to use the default domain provided by GitHub Pages
* replace "keycloakify-starter" with your repository name.
-* This is only relevent if you are building an Wep App + A Keycloak theme.
+* This is only relevant if you are building an Wep App + A Keycloak theme.
* If you are only building a Keycloak theme, you can ignore this.
*/
//base: "/keycloakify-starter/"
2 changes: 1 addition & 1 deletion kubernetes/loculus/CONTRIBUTING.md
@@ -13,7 +13,7 @@ helm template loculus kubernetes/loculus \

## Diffing produced Kubernetes manifests

-To diff produced manifests, you can use the `diff` command and to specifically compare metadata fields you can use the `kuberentes/loculus/utils/yamldiff_script.py` script.
+To diff produced manifests, you can use the `diff` command and to specifically compare metadata fields you can use the `kubernetes/loculus/utils/yamldiff_script.py` script.

1. Install yamldiff: `go install github.com/sahilm/yamldiff@latest`
2. Create the manifests to diff: `helm template loculus kubernetes/loculus > /tmp/new.yaml`
4 changes: 2 additions & 2 deletions kubernetes/loculus/values.yaml
@@ -821,14 +821,14 @@ defaultOrganismConfig: &defaultOrganismConfig
- name: sequencedByContactName
ontology_id: GENEPIO:0100471
definition: The name or title of the contact responsible for follow-up regarding the sequence.
-guidance: Provide the name of an individual or their job title. As personnel turnover may render the contact's name obsolete, it is more prefereable to provide a job title for ensuring accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
+guidance: Provide the name of an individual or their job title. As personnel turnover may render the contact's name obsolete, it is more preferable to provide a job title for ensuring accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
example: Enterics Lab Manager
displayName: Sequenced by - contact name
header: Sequencing
- name: sequencedByContactEmail
ontology_id: GENEPIO:0100422
definition: The email address of the contact responsible for follow-up regarding the sequence.
-guidance: Provide the email associated with the listed contact. As personnel turnover may render an individual's email obsolete, it is more prefereable to provide an address for a position or lab, to ensure accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
+guidance: Provide the email associated with the listed contact. As personnel turnover may render an individual's email obsolete, it is more preferable to provide an address for a position or lab, to ensure accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
example: [email protected]
displayName: Sequenced by - contact email
header: Sequencing
2 changes: 1 addition & 1 deletion website/src/components/SeqSetCitations/SeqSetForm.tsx
@@ -117,7 +117,7 @@ export const SeqSetForm: FC<SeqSetFormProps> = ({ clientConfig, accessToken, edi
htmlFor={`loculus-${isFocalStr}-accession-input`}
className='block mb-2 text-sm font-medium text-gray-900 dark:text-white'
>
-{`${isFocal === true ? '* ' : ''}${capitalCase(isFocalStr)} accessions (seperated by comma or whitespace)`}
+{`${isFocal === true ? '* ' : ''}${capitalCase(isFocalStr)} accessions (separated by comma or whitespace)`}
</label>
<textarea
id={`loculus-${isFocalStr}-accession-input`}
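
The corrected label allows accessions to be separated by commas or whitespace; the parsing behind such an input field boils down to a single split. An illustrative sketch (Python here for brevity, not the component's actual TypeScript, and the accession values are made up):

```python
import re

# Split pasted input on any run of commas and/or whitespace and drop empty
# tokens left by leading or trailing separators.
raw = "LOC_000001, LOC_000002\nLOC_000003  LOC_000004,"
accessions = [token for token in re.split(r"[,\s]+", raw) if token]
print(accessions)  # ['LOC_000001', 'LOC_000002', 'LOC_000003', 'LOC_000004']
```
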
2 changes: 1 addition & 1 deletion website/src/services/lapisClient.ts
@@ -60,7 +60,7 @@ export class LapisClient extends ZodiosWrapperClient<typeof lapisApi> {
dataFormat: 'TSV',
});
// This type cast isn't pretty, but if the API would be typed correctly, the union type
-// of the actual details resonse and the potential 'string' would pollute the whole API,
+// of the actual details response and the potential 'string' would pollute the whole API,
// so I (@fhennig) decided to just do this cast here. We know that the return value is a TSV string.
return result.map((data) => data as unknown as string);
}