Skip to content

Commit

Permalink
feat(backend): facilitate large compressed uploads
Browse files Browse the repository at this point in the history
 * introduce two auxiliary tables to efficiently validate and merge metadata and sequence data
 * remove singleton SequenceEntriesTable and replace it with provided and cached `Table`s to facilitate compression of sequence data
 * decompress sequence strings with a custom dictionary when deserializing
 * support for zstd, gzip, xz, lzma, zip, bzip2
  • Loading branch information
TobiasKampmann committed Dec 5, 2023
1 parent bf8b8f4 commit 73c35eb
Show file tree
Hide file tree
Showing 46 changed files with 1,563 additions and 719 deletions.
8 changes: 6 additions & 2 deletions backend/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,14 @@ dependencies {
implementation "org.jetbrains.exposed:exposed-kotlin-datetime:$exposedVersion"
implementation "org.jetbrains.kotlinx:kotlinx-datetime:0.5.0"
implementation "org.hibernate.validator:hibernate-validator:8.0.1.Final"

implementation "org.springframework.boot:spring-boot-starter-oauth2-resource-server"
implementation "org.springframework.boot:spring-boot-starter-security"


implementation 'org.apache.commons:commons-compress:1.25.0'
implementation 'com.github.luben:zstd-jni:1.5.2-4'
implementation 'org.tukaani:xz:1.8'

testImplementation("org.springframework.boot:spring-boot-starter-test") {
exclude group: "org.mockito"
}
Expand Down
3 changes: 2 additions & 1 deletion backend/docs/runtime_view.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

## Initial submission

To submit new sequences, the user calls the `/submit` endpoint and sends unpreprocessed data.
To submit new sequences, the user calls the `/submit` endpoint and sends unpreprocessed data.
Data may be compressed using zstd, gzip, bzip2, xz, lzma or zip.
For each sequence, Pathoplexus creates a new row in the "sequenceEntries" table.
It generates a new accession. The version number of each newly created sequence entry is 1.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import com.fasterxml.jackson.databind.JsonDeserializer
import com.fasterxml.jackson.databind.JsonNode
import com.fasterxml.jackson.databind.annotation.JsonDeserialize
import io.swagger.v3.oas.annotations.media.Schema
import org.pathoplexus.backend.service.Accession
import org.pathoplexus.backend.service.Version
import org.pathoplexus.backend.utils.Accession
import org.pathoplexus.backend.utils.Version

interface AccessionVersionInterface {
val accession: Accession
Expand Down Expand Up @@ -154,11 +154,6 @@ data class RevisedData(
val originalData: OriginalData,
)

data class SubmittedData(
val submissionId: String,
val originalData: OriginalData,
)

data class UnprocessedData(
@Schema(example = "123") override val accession: Accession,
@Schema(example = "1") override val version: Version,
Expand All @@ -175,7 +170,7 @@ data class OriginalData(
example = "{\"segment1\": \"ACTG\", \"segment2\": \"GTCA\"}",
description = "The key is the segment name, the value is the nucleotide sequence",
)
val unalignedNucleotideSequences: Map<String, String>,
val unalignedNucleotideSequences: Map<SegmentName, NucleotideSequence>,
)

enum class Status {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
package org.pathoplexus.backend.config

import org.pathoplexus.backend.api.Organism

data class BackendConfig(
val instances: Map<String, InstanceConfig>,
)
) {
/**
 * Looks up the instance configuration stored in [instances] under the name of [organism].
 *
 * @throws IllegalArgumentException if [instances] has no entry for `organism.name`;
 * the message lists the keys that are available.
 */
fun getInstanceConfig(organism: Organism) = requireNotNull(instances[organism.name]) {
    "Organism: ${organism.name} not found in backend config. Available organisms: ${instances.keys}"
}
}

data class InstanceConfig(
val schema: Schema,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ data class ReferenceGenome(
throw IllegalArgumentException("If there is only one nucleotide sequence, it must be named 'main'")
}
}

/**
 * Returns the sequence of the first entry in [nucleotideSequences] whose name equals
 * [segmentName], or `null` when no entry carries that name.
 */
fun getNucleotideSegmentReference(segmentName: String): NucleotideSequence? {
    val matchingSegment = nucleotideSequences.firstOrNull { segment -> segment.name == segmentName }
    return matchingSegment?.sequence
}
}

data class ReferenceSequence(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ class ExceptionHandler : ResponseEntityExceptionHandler() {
return ResponseEntity.status(HttpStatus.UNAUTHORIZED).build()
}

@ExceptionHandler(UnprocessableEntityException::class, ProcessingValidationException::class)
@ExceptionHandler(
UnprocessableEntityException::class,
ProcessingValidationException::class,
DuplicateKeyException::class,
)
@ResponseStatus(HttpStatus.UNPROCESSABLE_ENTITY)
fun handleUnprocessableEntityException(e: Exception): ResponseEntity<ProblemDetail> {
log.warn(e) { "Caught unprocessable entity exception: ${e.message}" }
Expand Down Expand Up @@ -144,5 +148,6 @@ class ForbiddenException(message: String) : RuntimeException(message)
/**
 * Runtime exception carrying a caller-supplied message.
 * Listed in the `@ExceptionHandler` of `handleUnprocessableEntityException`, which is annotated
 * with `HttpStatus.UNPROCESSABLE_ENTITY`.
 */
class UnprocessableEntityException(message: String) : RuntimeException(message)

/**
 * Runtime exception carrying a caller-supplied message.
 * NOTE(review): presumably mapped to a "not found" response by the exception handler — confirm.
 */
class NotFoundException(message: String) : RuntimeException(message)

/**
 * Runtime exception carrying a caller-supplied message.
 * Listed in the `@ExceptionHandler` of `handleUnprocessableEntityException`, which is annotated
 * with `HttpStatus.UNPROCESSABLE_ENTITY`.
 */
class ProcessingValidationException(message: String) : RuntimeException(message)

/**
 * Runtime exception carrying a caller-supplied message.
 * Listed in the `@ExceptionHandler` of `handleUnprocessableEntityException`, which is annotated
 * with `HttpStatus.UNPROCESSABLE_ENTITY`.
 */
class DuplicateKeyException(message: String) : RuntimeException(message)

/**
 * Message-less runtime exception.
 * NOTE(review): judging by its name, it exists only so that an "unauthorized" response shows up
 * in Swagger UI — confirm against its usages.
 */
class DummyUnauthorizedExceptionToMakeItAppearInSwaggerUi : RuntimeException()
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ import org.pathoplexus.backend.api.SubmittedProcessedData
import org.pathoplexus.backend.api.UnprocessedData
import org.pathoplexus.backend.model.ReleasedDataModel
import org.pathoplexus.backend.model.SubmitModel
import org.pathoplexus.backend.service.Accession
import org.pathoplexus.backend.service.DatabaseService
import org.pathoplexus.backend.utils.Accession
import org.pathoplexus.backend.utils.IteratorStreamer
import org.springframework.http.HttpHeaders
import org.springframework.http.HttpStatus
Expand All @@ -39,6 +39,7 @@ import org.springframework.web.bind.annotation.ResponseStatus
import org.springframework.web.bind.annotation.RestController
import org.springframework.web.multipart.MultipartFile
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody
import java.util.UUID
import io.swagger.v3.oas.annotations.parameters.RequestBody as SwaggerRequestBody

@RestController
Expand All @@ -61,9 +62,13 @@ class SubmissionController(
@UsernameFromJwt username: String,
@Parameter(description = METADATA_FILE_DESCRIPTION) @RequestParam metadataFile: MultipartFile,
@Parameter(description = SEQUENCE_FILE_DESCRIPTION) @RequestParam sequenceFile: MultipartFile,
): List<SubmissionIdMapping> {
return submitModel.processSubmission(username, metadataFile, sequenceFile, organism)
}
): List<SubmissionIdMapping> = submitModel.processSubmission(
UUID.randomUUID().toString(),
metadataFile,
sequenceFile,
username,
organism,
)

@Operation(description = EXTRACT_UNPROCESSED_DATA_DESCRIPTION)
@ApiResponse(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ You can use this response to associate the user provided submissionId with the s
"""

const val METADATA_FILE_DESCRIPTION = """
A TSV (tab separated values) file containing the metadata of the submitted sequence entries.
A TSV (tab separated values) file containing the metadata of the submitted sequence entries.
The file may be compressed with zstd, xz, zip, gzip, lzma, bzip2 (with common extensions).
It must contain the column names.
The field 'submissionId' is required and must be unique within the provided dataset.
It is used to associate metadata to the sequences in the sequences fasta file.
"""
const val SEQUENCE_FILE_DESCRIPTION = """
A fasta file containing the unaligned nucleotide sequences of the submitted sequences.
The file may be compressed with zstd, xz, zip, gzip, lzma, bzip2 (with common extensions).
If the underlying organism has a single segment,
the headers of the fasta file must match the 'submissionId' field in the metadata file.
If the underlying organism has multiple segments,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ import mu.KotlinLogging
import org.pathoplexus.backend.api.Organism
import org.pathoplexus.backend.api.ProcessedData
import org.pathoplexus.backend.api.SiloVersionStatus
import org.pathoplexus.backend.service.Accession
import org.pathoplexus.backend.service.DatabaseService
import org.pathoplexus.backend.service.RawProcessedData
import org.pathoplexus.backend.service.Version
import org.pathoplexus.backend.utils.Accession
import org.pathoplexus.backend.utils.Version
import org.springframework.stereotype.Service
import org.springframework.transaction.annotation.Transactional

Expand Down
Loading

0 comments on commit 73c35eb

Please sign in to comment.