Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature import conversion tests #441

Merged
merged 8 commits into from
Sep 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/apollo-cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"@istanbuljs/esm-loader-hook": "^0.2.0",
"@istanbuljs/nyc-config-typescript": "^1.0.2",
"@oclif/test": "^3.1.3",
"@types/chai": "^4",
"@types/chai": "^4.3.19",
"@types/cli-progress": "^3",
"@types/inquirer": "^9.0.7",
"@types/mocha": "^10",
Expand Down
3 changes: 3 additions & 0 deletions packages/apollo-shared/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@
"devDependencies": {
"@nestjs/common": "^10.1.0",
"@nestjs/core": "^10.1.0",
"@types/chai": "^4.3.19",
"@types/node": "^18.14.2",
"@types/rimraf": "^3",
"chai": "^5.1.1",
"chai-exclude": "^3.0.0",
"glob": "^11.0.0",
"mobx": "^6.6.1",
"mobx-state-tree": "^5.1.7",
Expand Down
170 changes: 165 additions & 5 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
/* eslint-disable @typescript-eslint/no-floating-promises */
import { strict as assert } from 'node:assert'
import { describe, it } from 'node:test'
import gff from '@gmod/gff'
import { readFileSync } from 'node:fs'

import gff, { GFF3Feature } from '@gmod/gff'
import { assert, use } from 'chai'
import chaiExclude from 'chai-exclude'

import { gff3ToAnnotationFeature } from './gff3ToAnnotationFeature'
import { AnnotationFeatureSnapshot } from '@apollo-annotation/mst'

use(chaiExclude)

const testCases: [string, string, AnnotationFeatureSnapshot][] = [
[
'a feature with no children',
Expand All @@ -23,18 +28,173 @@ const testCases: [string, string, AnnotationFeatureSnapshot][] = [
},
},
],
[
'a feature with two children',
`ctgA est EST_match 1050 3202 . + . ID=Match1;Name=agt830.5;Target=agt830.5 1 654
ctgA est match_part 1050 1500 . + . Parent=Match1;Name=agt830.5;Target=agt830.5 1 451
ctgA est match_part 3000 3202 . + . Parent=Match1;Name=agt830.5;Target=agt830.5 452 654
`,
{
_id: '66cf9fbb4e947fa2c27d3d6a',
refSeq: 'ctgA',
type: 'EST_match',
min: 1049,
max: 3202,
strand: 1,
children: {
'66cf9fbb4e947fa2c27d3d68': {
_id: '66cf9fbb4e947fa2c27d3d68',
refSeq: 'ctgA',
type: 'match_part',
min: 1049,
max: 1500,
strand: 1,
attributes: {
gff_source: ['est'],
gff_name: ['agt830.5'],
gff_target: ['agt830.5 1 451'],
},
},
'66cf9fbb4e947fa2c27d3d69': {
_id: '66cf9fbb4e947fa2c27d3d69',
refSeq: 'ctgA',
type: 'match_part',
min: 2999,
max: 3202,
strand: 1,
attributes: {
gff_source: ['est'],
gff_name: ['agt830.5'],
gff_target: ['agt830.5 452 654'],
},
},
},
attributes: {
gff_source: ['est'],
gff_id: ['Match1'],
gff_name: ['agt830.5'],
gff_target: ['agt830.5 1 654'],
},
},
],
]

/**
 * Variant of `AnnotationFeatureSnapshot` whose `children` property is replaced
 * by an optional array of the same (recursive) shape. All other properties are
 * inherited unchanged via `Omit<..., 'children'>`.
 *
 * NOTE(review): presumably used so comparisons do not depend on the keys of
 * the original `children` object — confirm against the comparison helper.
 */
interface AnnotationFeatureSnapshotWithChildrenArray
extends Omit<AnnotationFeatureSnapshot, 'children'> {
children?: AnnotationFeatureSnapshotWithChildrenArray[]
}

/**
 * Recursively rewrite a feature snapshot so that every `children` object is
 * replaced by an array of its values.
 *
 * @param feature - snapshot to convert; it is not mutated.
 * @returns the very same object when `feature` has no `children`; otherwise a
 * shallow copy of `feature` whose `children` is an array of converted children
 * in `Object.values` order.
 */
function childrenToArray(
  feature: AnnotationFeatureSnapshot,
): AnnotationFeatureSnapshotWithChildrenArray {
  const childMap = feature.children
  if (childMap) {
    const converted: AnnotationFeatureSnapshotWithChildrenArray[] = []
    for (const child of Object.values(childMap)) {
      converted.push(childrenToArray(child))
    }
    return { ...feature, children: converted }
  }
  // Nothing to convert: hand the input back untouched.
  return feature as AnnotationFeatureSnapshotWithChildrenArray
}

/**
 * Assert that two feature snapshots are deeply equal while disregarding
 * `_id` values.
 *
 * NOTE(review): this span comes from a diff view, so the arguments of an
 * `assert.deepEqual(...)` call (which blank only the top-level `_id`) and an
 * `assert.deepEqualExcludingEvery(...)` call (which receives both features
 * after `childrenToArray` plus the key `'_id'`) appear back to back. Confirm
 * which of the two is the live implementation before relying on this text.
 * NOTE(review): `deepEqualExcludingEvery` presumably comes from the
 * `chai-exclude` plugin and ignores the key at every nesting level — verify
 * against the plugin documentation.
 */
function compareFeatures(
feature1: AnnotationFeatureSnapshot,
feature2: AnnotationFeatureSnapshot,
) {
assert.deepEqual(
{ ...feature1, _id: undefined },
{ ...feature2, _id: undefined },
assert.deepEqualExcludingEvery(
childrenToArray(feature1),
childrenToArray(feature2),
'_id',
)
}

/**
 * Read a GFF3 file from disk and parse it into features.
 *
 * Every line that starts with `#` is dropped before the remaining text is
 * handed to `gff.parseStringSync`.
 *
 * @param fn - path of the GFF3 file to read.
 * @returns the parser output, cast to `GFF3Feature[]`.
 */
function readFeatureFile(fn: string): GFF3Feature[] {
  const gffText = readFileSync(fn)
    .toString()
    .split('\n')
    .filter((line) => !line.startsWith('#'))
    .join('\n')
  return gff.parseStringSync(gffText) as GFF3Feature[]
}

/**
 * Load a JSON file from disk and return its parsed content typed as an
 * `AnnotationFeatureSnapshot`.
 *
 * The cast is unchecked: the parsed value is not validated against the
 * snapshot shape.
 *
 * @param fn - path of the JSON file to read.
 * @returns the parsed JSON value.
 */
function readAnnotationFeatureSnapshot(fn: string): AnnotationFeatureSnapshot {
  const json = readFileSync(fn, 'utf8')
  const snapshot = JSON.parse(json) as AnnotationFeatureSnapshot
  return snapshot
}

// Parse the shared fixture once at module load; its first four features are
// used as the inputs of the "Convert example 1-4" tests.
const geneRepresentationFeatures = readFeatureFile(
  'test_data/gene_representations.gff3',
)
const [ex1, ex2, ex3, ex4] = geneRepresentationFeatures

// Fixture-driven conversion tests: each case converts a parsed GFF3 feature
// and checks the result by counting serialized `type` entries and/or by
// comparing it with a stored JSON snapshot.
describe('gff3ToAnnotationFeature examples', () => {
  it('Convert one CDS', () => {
    const [gffFeature] = readFeatureFile('test_data/one_cds.gff3')
    const converted = gff3ToAnnotationFeature(gffFeature)
    const snapshot = readAnnotationFeatureSnapshot('test_data/one_cds.json')
    compareFeatures(converted, snapshot)
  })

  it('Convert two CDSs', () => {
    const [gffFeature] = readFeatureFile('test_data/two_cds.gff3')
    const converted = gff3ToAnnotationFeature(gffFeature)
    const snapshot = readAnnotationFeatureSnapshot('test_data/two_cds.json')
    compareFeatures(converted, snapshot)
  })

  it('Convert example 1', () => {
    const converted = gff3ToAnnotationFeature(ex1)
    const serialized = JSON.stringify(converted, null, 2)

    // The serialized result must contain four CDS entries and one
    // TF_binding_site entry.
    assert.equal(serialized.match(/"type": "CDS"/g)?.length, 4)
    assert.equal(serialized.match(/"type": "TF_binding_site"/g)?.length, 1)

    const snapshot = readAnnotationFeatureSnapshot('test_data/example01.json')
    compareFeatures(converted, snapshot)
  })

  it('Convert example 2', () => {
    const converted = gff3ToAnnotationFeature(ex2)
    const serialized = JSON.stringify(converted, null, 2)
    assert.equal(serialized.match(/"type": "CDS"/g)?.length, 4)

    const snapshot = readAnnotationFeatureSnapshot('test_data/example02.json')
    compareFeatures(converted, snapshot)
  })

  it('Convert example 3', () => {
    // NB: mRNA10003 yields two proteins in example 3, as in the other
    // examples. Elsewhere the two proteins are recognised because their CDS
    // lines share one CDS id; in example 3 every CDS has its own id, so the
    // proteins are told apart by the order in which they appear in the GFF.
    const converted = gff3ToAnnotationFeature(ex3)
    const serialized = JSON.stringify(converted, null, 2)
    assert.equal(serialized.match(/"type": "CDS"/g)?.length, 4)

    // NOTE(review): the snapshot comparison is disabled here; the reason is
    // not visible in this file.
    // const expected = readAnnotationFeatureSnapshot('test_data/example03.json')
    // compareFeatures(actual, expected)
  })

  it('Convert example 4', () => {
    // The parsed GFF input still carries explicit UTR features...
    const serializedInput = JSON.stringify(ex4, null, 2)
    assert.equal(serializedInput.match(/"type": "five_prime_UTR"/g)?.length, 6)
    assert.equal(serializedInput.match(/"type": "three_prime_UTR"/g)?.length, 3)

    // ...whereas the converted output must not mention them at all.
    const converted = gff3ToAnnotationFeature(ex4)
    const serialized = JSON.stringify(converted, null, 2)
    assert.equal(serialized.match(/"type": "CDS"/g)?.length, 4)
    assert.equal(serialized.match(/prime_UTR/g), null)

    const snapshot = readAnnotationFeatureSnapshot('test_data/example04.json')
    compareFeatures(converted, snapshot)
  })

  it('Convert braker gff', () => {
    const [gffFeature] = readFeatureFile('test_data/braker.gff')
    const converted = gff3ToAnnotationFeature(gffFeature)
    const serialized = JSON.stringify(converted, null, 2)
    // Neither "intron" nor "_codon" may occur anywhere in the output.
    assert.equal(serialized.match(/intron/g), null)
    assert.equal(serialized.match(/_codon/g), null)
  })
})

describe('gff3ToAnnotationFeature', () => {
for (const testCase of testCases) {
const [description, featureLine, convertedFeature] = testCase
Expand Down
20 changes: 10 additions & 10 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,10 @@ function convertChildren(
const [firstChildFeatureLocation] = childFeature
if (
firstChildFeatureLocation.type === 'three_prime_UTR' ||
firstChildFeatureLocation.type === 'five_prime_UTR'
firstChildFeatureLocation.type === 'five_prime_UTR' ||
firstChildFeatureLocation.type === 'intron' ||
firstChildFeatureLocation.type === 'start_codon' ||
firstChildFeatureLocation.type === 'stop_codon'
) {
continue
}
Expand Down Expand Up @@ -232,20 +235,17 @@ function processCDS(
groupedLocations.push([location])
continue
}
const lastGroupLastLocation = lastGroup.at(-1)
if (!lastGroupLastLocation) {
throw new Error('Got group with no locations')
}
if (
const overlaps = lastGroup.some((lastGroupLoc) =>
doesIntersect2(
/* eslint-disable @typescript-eslint/no-non-null-assertion */
lastGroupLastLocation.start!,
lastGroupLastLocation.end!,
lastGroupLoc.start!,
lastGroupLoc.end!,
location.start!,
location.end!,
/* eslint-enable @typescript-eslint/no-non-null-assertion */
)
) {
),
)
if (overlaps) {
groupedLocations.push([location])
} else {
lastGroup.push(location)
Expand Down
13 changes: 13 additions & 0 deletions packages/apollo-shared/test_data/braker.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##gff-version 3
CM033580.1 AUGUSTUS gene 15529 16566 0.92 - . ID=g1;
CM033580.1 AUGUSTUS mRNA 15529 16566 0.92 - . ID=g1.t1;Parent=g1;
CM033580.1 AUGUSTUS stop_codon 15529 15531 . - 0 ID=g1.t1.stop1;Parent=g1.t1;
CM033580.1 AUGUSTUS CDS 15529 15659 0.92 - 2 ID=g1.t1.CDS1;Parent=g1.t1;
CM033580.1 AUGUSTUS exon 15529 15659 . - . ID=g1.t1.exon1;Parent=g1.t1;
CM033580.1 AUGUSTUS intron 15660 16112 0.96 - . ID=g1.t1.intron1;Parent=g1.t1;
CM033580.1 AUGUSTUS CDS 16113 16314 0.96 - 0 ID=g1.t1.CDS2;Parent=g1.t1;
CM033580.1 AUGUSTUS exon 16113 16314 . - . ID=g1.t1.exon2;Parent=g1.t1;
CM033580.1 AUGUSTUS intron 16315 16536 0.96 - . ID=g1.t1.intron2;Parent=g1.t1;
CM033580.1 AUGUSTUS CDS 16537 16566 0.99 - 0 ID=g1.t1.CDS3;Parent=g1.t1;
CM033580.1 AUGUSTUS exon 16537 16566 . - . ID=g1.t1.exon3;Parent=g1.t1;
CM033580.1 AUGUSTUS start_codon 16564 16566 . - 0 ID=g1.t1.start1;Parent=g1.t1;
26 changes: 26 additions & 0 deletions packages/apollo-shared/test_data/example01.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
##gff-version 3
##sequence-region chr1 1000 9000
#example01
chr1 . gene 1000 9000 . + . ID=gene10001;Name=EDEN
chr1 . TF_binding_site 1000 1012 . + . ID=tfbs10001;Parent=gene10001
chr1 . mRNA 1050 9000 . + . ID=mRNA10001;Parent=gene10001;Name=EDEN.1
chr1 . mRNA 1050 9000 . + . ID=mRNA10002;Parent=gene10001;Name=EDEN.2
chr1 . mRNA 1300 9000 . + . ID=mRNA10003;Parent=gene10001;Name=EDEN.3
chr1 . exon 1050 1500 . + . ID=exon10001;Parent=mRNA10001,mRNA10002
chr1 . exon 1300 1500 . + . ID=exon10002;Parent=mRNA10003
chr1 . exon 3000 3902 . + . ID=exon10003;Parent=mRNA10001,mRNA10003
chr1 . exon 5000 5500 . + . ID=exon10004;Parent=mRNA10001,mRNA10002,mRNA10003
chr1 . exon 7000 9000 . + . ID=exon10005;Parent=mRNA10001,mRNA10002,mRNA10003
chr1 . CDS 1201 1500 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 3000 3902 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 5000 5500 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 7000 7600 . + 0 ID=cds10001;Parent=mRNA10001;Name=edenprotein.1
chr1 . CDS 1201 1500 . + 0 ID=cds10002;Parent=mRNA10002;Name=edenprotein.2
chr1 . CDS 5000 5500 . + 0 ID=cds10002;Parent=mRNA10002;Name=edenprotein.2
chr1 . CDS 7000 7600 . + 0 ID=cds10002;Parent=mRNA10002;Name=edenprotein.2
chr1 . CDS 3301 3902 . + 0 ID=cds10003;Parent=mRNA10003;Name=edenprotein.3
chr1 . CDS 5000 5500 . + 1 ID=cds10003;Parent=mRNA10003;Name=edenprotein.3
chr1 . CDS 7000 7600 . + 1 ID=cds10003;Parent=mRNA10003;Name=edenprotein.3
chr1 . CDS 3391 3902 . + 0 ID=cds10004;Parent=mRNA10003;Name=edenprotein.4
chr1 . CDS 5000 5500 . + 1 ID=cds10004;Parent=mRNA10003;Name=edenprotein.4
chr1 . CDS 7000 7600 . + 1 ID=cds10004;Parent=mRNA10003;Name=edenprotein.4
Loading