Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sketch: do not hide metadata processing in sequence compression function #3241

Merged
merged 3 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.loculus.backend.service.submission

import com.fasterxml.jackson.databind.node.NullNode
import com.github.luben.zstd.Zstd
import org.loculus.backend.api.GeneticSequence
import org.loculus.backend.api.Organism
Expand Down Expand Up @@ -85,40 +84,34 @@ class CompressionService(private val backendConfig: BackendConfig) {
},
)

fun decompressProcessedData(processedData: ProcessedData<CompressedSequence>, organism: Organism) = ProcessedData(
backendConfig
.getInstanceConfig(organism)
.schema
.metadata
.map { it.name }
.associateWith { fieldName ->
processedData.metadata[fieldName] ?: NullNode.instance
},
processedData
.unalignedNucleotideSequences.mapValues { (segmentName, sequenceData) ->
fun decompressSequencesInProcessedData(processedData: ProcessedData<CompressedSequence>, organism: Organism) =
ProcessedData(
processedData.metadata,
processedData
.unalignedNucleotideSequences.mapValues { (segmentName, sequenceData) ->
when (sequenceData) {
null -> null
else -> decompressNucleotideSequence(sequenceData, segmentName, organism)
}
},
processedData.alignedNucleotideSequences.mapValues { (segmentName, sequenceData) ->
when (sequenceData) {
null -> null
else -> decompressNucleotideSequence(sequenceData, segmentName, organism)
}
},
processedData.alignedNucleotideSequences.mapValues { (segmentName, sequenceData) ->
when (sequenceData) {
null -> null
else -> decompressNucleotideSequence(sequenceData, segmentName, organism)
}
},
processedData.nucleotideInsertions,
processedData.alignedAminoAcidSequences.mapValues { (gene, sequenceData) ->
when (sequenceData) {
null -> null
else -> decompressAminoAcidSequence(sequenceData, gene, organism)
}
},
processedData.aminoAcidInsertions,
)
processedData.nucleotideInsertions,
processedData.alignedAminoAcidSequences.mapValues { (gene, sequenceData) ->
when (sequenceData) {
null -> null
else -> decompressAminoAcidSequence(sequenceData, gene, organism)
}
},
processedData.aminoAcidInsertions,
)

fun compressProcessedData(processedData: ProcessedData<String>, organism: Organism) = ProcessedData(
processedData.metadata.filterNot { (_, value) -> value.isNull },
fun compressSequencesInProcessedData(processedData: ProcessedData<String>, organism: Organism) = ProcessedData(
processedData.metadata,
processedData
.unalignedNucleotideSequences.mapValues { (segmentName, sequenceData) ->
when (sequenceData) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.loculus.backend.service.submission

import org.loculus.backend.api.Organism
import org.loculus.backend.api.ProcessedData
import org.springframework.stereotype.Service

/**
 * Converts [ProcessedData] between the representation handed in by callers and the representation
 * that is written to / read from storage.
 *
 * Sequence handling is delegated to [CompressionService]; metadata handling is delegated to
 * [ProcessedMetadataPostprocessor].
 */
@Service
class ProcessedDataPostprocessor(
    private val compressionService: CompressionService,
    private val processedMetadataPostprocessor: ProcessedMetadataPostprocessor,
) {
    /**
     * Compresses the sequences of [processedData] for [organism] and then removes null-valued
     * metadata entries from the compressed result.
     */
    fun prepareForStorage(processedData: ProcessedData<String>, organism: Organism) =
        processedMetadataPostprocessor.stripNullValuesFromMetadata(
            compressionService.compressSequencesInProcessedData(processedData, organism),
        )

    /**
     * Reverses [prepareForStorage]: first aligns the metadata of [storedValue] with the schema
     * configured for [organism], then decompresses the sequences.
     */
    fun retrieveFromStoredValue(storedValue: ProcessedData<CompressedSequence>, organism: Organism) =
        compressionService.decompressSequencesInProcessedData(
            processedMetadataPostprocessor.filterOutExtraFieldsAndAddNulls(storedValue, organism),
            organism,
        )
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.loculus.backend.service.submission

import com.fasterxml.jackson.databind.node.NullNode
import org.loculus.backend.api.Organism
import org.loculus.backend.api.ProcessedData
import org.loculus.backend.config.BackendConfig
import org.springframework.stereotype.Service

/** Metadata-only transformations applied to [ProcessedData]; sequences are passed through unchanged. */
@Service
class ProcessedMetadataPostprocessor(private val backendConfig: BackendConfig) {
    /** Returns a copy of [processedData] whose metadata keeps only the entries with a non-null JSON value. */
    fun <SequenceType> stripNullValuesFromMetadata(processedData: ProcessedData<SequenceType>) =
        processedData.copy(metadata = processedData.metadata.filterValues { !it.isNull })

    /**
     * Returns a copy of [processedData] whose metadata is rebuilt from the schema configured for
     * [organism]: fields that are not part of the schema are dropped, and schema fields that have
     * no entry in the input are set to [NullNode].
     */
    fun <SequenceType> filterOutExtraFieldsAndAddNulls(processedData: ProcessedData<SequenceType>, organism: Organism) =
        processedData.copy(
            metadata = backendConfig.getInstanceConfig(organism).schema.metadata.associate { field ->
                field.name to (processedData.metadata[field.name] ?: NullNode.instance)
            },
        )
}
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class SubmissionDatabaseService(
pool: DataSource,
private val emptyProcessedDataProvider: EmptyProcessedDataProvider,
private val compressionService: CompressionService,
private val processedDataPostprocessor: ProcessedDataPostprocessor,
private val auditLogger: AuditLogger,
private val dateProvider: DateProvider,
@Value("\${${BackendSpringProperty.STREAM_BATCH_SIZE}}") private val streamBatchSize: Int,
Expand Down Expand Up @@ -332,7 +333,7 @@ class SubmissionDatabaseService(
) {
it[processingStatusColumn] = PROCESSED.name
it[processedDataColumn] =
compressionService.compressProcessedData(processedData, organism)
processedDataPostprocessor.prepareForStorage(processedData, organism)
it[errorsColumn] = submittedErrors
it[warningsColumn] = submittedWarnings
it[finishedProcessingAtColumn] = dateProvider.getCurrentDateTime()
Expand Down Expand Up @@ -605,7 +606,7 @@ class SubmissionDatabaseService(
submissionId = it[SequenceEntriesView.submissionIdColumn],
processedData = when (val processedData = it[SequenceEntriesView.jointDataColumn]) {
null -> emptyProcessedDataProvider.provide(organism)
else -> compressionService.decompressProcessedData(processedData, organism)
else -> processedDataPostprocessor.retrieveFromStoredValue(processedData, organism)
},
submittedAtTimestamp = it[SequenceEntriesView.submittedAtTimestampColumn],
releasedAtTimestamp = it[SequenceEntriesView.releasedAtTimestampColumn]!!,
Expand Down Expand Up @@ -979,7 +980,7 @@ class SubmissionDatabaseService(
version = selectedSequenceEntry[SequenceEntriesView.versionColumn],
status = Status.fromString(selectedSequenceEntry[SequenceEntriesView.statusColumn]),
groupId = selectedSequenceEntry[SequenceEntriesView.groupIdColumn],
processedData = compressionService.decompressProcessedData(
processedData = processedDataPostprocessor.retrieveFromStoredValue(
selectedSequenceEntry[SequenceEntriesView.processedDataColumn]!!,
organism,
),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
package org.loculus.backend.service

import com.fasterxml.jackson.databind.node.NullNode
import com.fasterxml.jackson.databind.node.TextNode
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Assertions.assertFalse
import org.junit.jupiter.api.Assertions.assertTrue
import org.junit.jupiter.api.Test
import org.loculus.backend.SpringBootTestWithoutDatabase
import org.loculus.backend.api.Organism
import org.loculus.backend.api.OriginalData
import org.loculus.backend.api.ProcessedData
import org.loculus.backend.config.BackendConfig
import org.loculus.backend.service.submission.CompressionService
import org.springframework.beans.factory.annotation.Autowired
Expand All @@ -36,46 +31,4 @@ class CompressionServiceTest(

assertEquals(testData, decompressed)
}

/**
 * Verifies metadata handling of the compress/decompress pair:
 * compression drops null-valued metadata, decompression restricts metadata to the configured
 * schema fields and re-adds nulls for configured fields that are missing.
 */
@Test
fun `Metadata handling in compression and decompression of ProcessedData`() {
    val organism = Organism(backendConfig.organisms.keys.first())
    val configuredFields = backendConfig.getInstanceConfig(organism).schema.metadata.map { it.name }
    require(configuredFields.size >= 2) { "Test requires at least 2 configured metadata fields" }

    val configuredPresent = configuredFields[0]
    val configuredNull = configuredFields[1]
    val unconfiguredPresent = "unconfigured_present"
    val unconfiguredNull = "unconfigured_null"

    val testData = ProcessedData<String>(
        metadata = mapOf(
            configuredPresent to TextNode("value1"),
            configuredNull to NullNode.instance,
            unconfiguredPresent to TextNode("value2"),
            unconfiguredNull to NullNode.instance,
        ),
        unalignedNucleotideSequences = emptyMap(),
        alignedNucleotideSequences = emptyMap(),
        nucleotideInsertions = emptyMap(),
        alignedAminoAcidSequences = emptyMap(),
        aminoAcidInsertions = emptyMap(),
    )

    val compressed = compressor.compressProcessedData(testData, organism)
    val decompressed = compressor.decompressProcessedData(compressed, organism)

    // Check compression behavior: null values are dropped, everything else is kept as-is.
    assertFalse(compressed.metadata.containsKey(configuredNull))
    assertFalse(compressed.metadata.containsKey(unconfiguredNull))
    assertTrue(compressed.metadata.containsKey(configuredPresent))
    assertTrue(compressed.metadata.containsKey(unconfiguredPresent))
    // assertEquals takes (expected, actual); the input data is the expectation.
    assertEquals(testData.metadata[configuredPresent], compressed.metadata[configuredPresent])

    // Check decompression behavior: only configured fields remain, missing ones come back as null.
    assertEquals(testData.metadata[configuredPresent], decompressed.metadata[configuredPresent])
    assertEquals(testData.metadata[configuredNull], decompressed.metadata[configuredNull])
    assertFalse(decompressed.metadata.containsKey(unconfiguredPresent))
    assertFalse(decompressed.metadata.containsKey(unconfiguredNull))
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package org.loculus.backend.service

import com.fasterxml.jackson.databind.node.NullNode
import com.fasterxml.jackson.databind.node.TextNode
import org.hamcrest.MatcherAssert.assertThat
import org.hamcrest.Matchers.hasKey
import org.hamcrest.Matchers.not
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Test
import org.loculus.backend.SpringBootTestWithoutDatabase
import org.loculus.backend.api.Organism
import org.loculus.backend.api.ProcessedData
import org.loculus.backend.config.BackendConfig
import org.loculus.backend.service.submission.ProcessedMetadataPostprocessor
import org.springframework.beans.factory.annotation.Autowired

/**
 * Tests the metadata round trip of [ProcessedMetadataPostprocessor]: stripping null values
 * followed by re-aligning the metadata with the configured schema.
 */
@SpringBootTestWithoutDatabase
class ProcessedMetadataPostprocessorTest(
    @Autowired private val processedMetadataPostprocessor: ProcessedMetadataPostprocessor,
    @Autowired private val backendConfig: BackendConfig,
) {

    @Test
    fun `Processed Metadata Postprocessor correctly round trips metadata`() {
        val organism = Organism(backendConfig.organisms.keys.first())
        val configuredFields = backendConfig.getInstanceConfig(organism).schema.metadata.map { it.name }
        require(configuredFields.size >= 2) { "Test requires at least 2 configured metadata fields" }

        val configuredPresent = configuredFields[0]
        val configuredNull = configuredFields[1]
        val unconfiguredPresent = "unconfigured_present"
        val unconfiguredNull = "unconfigured_null"

        val testData = ProcessedData<String>(
            metadata = mapOf(
                configuredPresent to TextNode("value1"),
                configuredNull to NullNode.instance,
                unconfiguredPresent to TextNode("value2"),
                unconfiguredNull to NullNode.instance,
            ),
            unalignedNucleotideSequences = emptyMap(),
            alignedNucleotideSequences = emptyMap(),
            nucleotideInsertions = emptyMap(),
            alignedAminoAcidSequences = emptyMap(),
            aminoAcidInsertions = emptyMap(),
        )

        // No sequence compression is involved here: "stripped" is the null-free metadata,
        // "restored" is that metadata re-aligned with the configured schema.
        val stripped = processedMetadataPostprocessor.stripNullValuesFromMetadata(testData)
        val restored = processedMetadataPostprocessor.filterOutExtraFieldsAndAddNulls(stripped, organism)

        // Stripping removes null-valued entries and leaves all other entries untouched.
        assertThat(stripped.metadata, not(hasKey(configuredNull)))
        assertThat(stripped.metadata, not(hasKey(unconfiguredNull)))
        assertThat(stripped.metadata, hasKey(configuredPresent))
        assertThat(stripped.metadata, hasKey(unconfiguredPresent))
        // assertEquals takes (expected, actual); the input data is the expectation.
        assertEquals(testData.metadata[configuredPresent], stripped.metadata[configuredPresent])

        // Restoring keeps only configured fields and re-adds nulls for configured fields that are missing.
        assertEquals(testData.metadata[configuredPresent], restored.metadata[configuredPresent])
        assertEquals(testData.metadata[configuredNull], restored.metadata[configuredNull])
        assertThat(restored.metadata, not(hasKey(unconfiguredPresent)))
        assertThat(restored.metadata, not(hasKey(unconfiguredNull)))
    }
}
Loading