// @gmod/gff
// Version:
// read and write GFF3 data as streams
// 305 lines (280 loc) • 8.7 kB
// text/typescript
import { createReadStream } from 'fs'
import { readFile } from 'fs/promises'
import { Readable } from 'stream'
import { TransformStream } from 'stream/web'
import { GFFTransformer, parseStringSync } from './api'
import { formatFeature } from './util'
import type {
GFF3Feature,
GFF3Comment,
GFF3Directive,
GFF3Sequence,
} from './util'
/**
 * Items collected from one parsed GFF3 file, both in stream order and
 * grouped by kind.
 */
interface ReadAllResults {
  /** Items that carry none of the directive/comment/sequence markers. */
  features: Array<GFF3Feature>
  /** Items that have a `comment` property. */
  comments: Array<GFF3Comment>
  /** Items that have a `directive` property. */
  directives: Array<GFF3Directive>
  /** Items that have a `sequence` property. */
  sequences: Array<GFF3Sequence>
  /** Every item, in the order the stream produced it. */
  all: Array<GFF3Feature | GFF3Comment | GFF3Directive | GFF3Sequence>
}
/**
 * Stream a GFF3 file through a `GFFTransformer` and gather everything it emits.
 *
 * @param filename - Path handed to `require.resolve` to locate the file
 *   before it is opened for reading.
 * @param args - Extra transformer options. They are spread after the defaults
 *   listed in the body, so a key given here overrides the default.
 * @returns Every emitted item in stream order (`all`), plus the same items
 *   sorted into `directives`, `comments`, `sequences`, and `features`.
 */
async function readAll(
  filename: string,
  args: Record<string, unknown> = {},
): Promise<ReadAllResults> {
  const features: GFF3Feature[] = []
  const comments: GFF3Comment[] = []
  const directives: GFF3Directive[] = []
  const sequences: GFF3Sequence[] = []
  const all: ReadAllResults['all'] = []

  // Node file stream -> web ReadableStream -> GFF3 transform.
  const parsedItems = Readable.toWeb(
    createReadStream(require.resolve(filename)),
  ).pipeThrough(
    new TransformStream(
      new GFFTransformer({
        parseFeatures: true,
        parseDirectives: true,
        parseComments: true,
        parseSequences: true,
        bufferSize: 10,
        ...args,
      }),
    ),
  )

  for await (const item of parsedItems) {
    all.push(item)

    // Classify by marker property; an item with none of them is a feature.
    if ('directive' in item) {
      directives.push(item)
      continue
    }
    if ('comment' in item) {
      comments.push(item)
      continue
    }
    if ('sequence' in item) {
      sequences.push(item)
      continue
    }
    features.push(item)
  }

  return { features, comments, directives, sequences, all }
}
describe('GFF3 parser', () => {
it('can parse gff3_with_syncs.gff3', async () => {
  const { all: parsedItems } = await readAll(
    '../test/data/gff3_with_syncs.gff3',
  )

  // The expected output is a JSON file kept in the same data directory.
  const snapshotPath = require.resolve(
    '../test/data/gff3_with_syncs.result.json',
  )
  const snapshot: unknown = JSON.parse(await readFile(snapshotPath, 'utf8'))

  expect(parsedItems).toEqual(snapshot)
})
// Smoke tests: each fixture must yield exactly the listed number of items.
for (const [count, filename] of [
  [1010, 'messy_protein_domains.gff3'],
  [4, 'gff3_with_syncs.gff3'],
  [51, 'au9_scaffold_subset.gff3'],
  [14, 'tomato_chr4_head.gff3'],
  [5, 'directives.gff3'],
  [6, 'hybrid1.gff3'],
  [3, 'hybrid2.gff3'],
  [6, 'knownGene.gff3'],
  [6, 'knownGene2.gff3'],
  [16, 'tomato_test.gff3'],
  [3, 'spec_eden.gff3'],
  [1, 'spec_match.gff3'],
  [8, 'quantitative.gff3'],
]) {
  it(`can cursorily parse ${filename}`, async () => {
    const { all } = await readAll(`../test/data/${filename}`)
    expect(all.length).toEqual(count)
  })
}
it('supports children before parents, and Derives_from', async () => {
  const { all: parsedItems } = await readAll(
    '../test/data/knownGene_out_of_order.gff3',
  )
  // NOTE(review): a leftover `$p->max_lookback(2);` comment sat here; it looks
  // like a remnant of an earlier non-TypeScript version of this test —
  // confirm whether an equivalent option is still needed.
  const snapshotPath = require.resolve(
    '../test/data/knownGene_out_of_order.result.json',
  )
  const expectedOutput: unknown = JSON.parse(
    await readFile(snapshotPath, 'utf8'),
  )

  expect(parsedItems).toEqual(expectedOutput)
})
it('can parse the EDEN gene from the gff3 spec', async () => {
  const { all } = await readAll('../test/data/spec_eden.gff3')

  // The third emitted item is the single-line EDEN gene feature.
  const edenFeature = all[2]
  expect(edenFeature).toHaveLength(1)
  const [eden] = edenFeature as GFF3Feature
  const edenChildren = eden.child_features
  expect(edenChildren).toHaveLength(4)
  expect(edenChildren[0][0].type).toEqual('TF_binding_site')

  // Children 1..3 are the mRNAs, each made of exactly one line.
  const mrnaFeatures = edenChildren.slice(1, 4)
  expect(mrnaFeatures.filter((feature) => feature.length === 1)).toHaveLength(
    3,
  )
  const mrnaLines = mrnaFeatures.map((feature) => {
    expect(feature).toHaveLength(1)
    const [line] = feature
    return line
  })
  for (const line of mrnaLines) {
    expect(line.type).toEqual('mRNA')
  }

  // The final exon must be the very same object under every mRNA.
  const lastExon = mrnaLines[2].child_features[3]
  for (const line of mrnaLines) {
    expect(line.child_features).toContain(lastExon)
  }

  const expectedChildCounts = [5, 4, 6]
  expectedChildCounts.forEach((expectedCount, index) => {
    expect(mrnaLines[index].child_features).toHaveLength(expectedCount)
  })

  // Finally compare the complete output with the stored JSON result.
  const snapshotPath = require.resolve('../test/data/spec_eden.result.json')
  const referenceResult: unknown = JSON.parse(
    await readFile(snapshotPath, 'utf8'),
  )
  expect(all).toEqual(referenceResult)
})
it('can parse an excerpt of the refGene gff3', async () => {
  // Reaching the assertion already proves the file parsed without rejecting;
  // the former `expect(true).toBeTruthy()` could never fail and was removed.
  const stuff = await readAll('../test/data/refGene_excerpt.gff3')
  expect(stuff.all).toHaveLength(2)
})
it('can parse an excerpt of the TAIR10 gff3', async () => {
  // Only the number of emitted items is checked for this fixture.
  const { all } = await readAll('../test/data/tair10.gff3')
  expect(all).toHaveLength(3)
})
it(
  'can parse chr1 TAIR10 gff3',
  async () => {
    // This fixture is read with Derives_from reference handling disabled.
    const { all } = await readAll('../test/data/tair10_chr1.gff', {
      disableDerivesFromReferences: true,
    })
    expect(all).toHaveLength(17697)
  },
  // Per-test timeout in milliseconds.
  10000,
)
// These fixtures must be rejected with an "inconsistent types" parse error.
// NOTE(review): `toMatch` only accepts a string, so this relies on the
// rejection value being a string rather than an Error object — confirm.
for (const errorFile of [
  'mm9_sample_ensembl.gff3',
  'Saccharomyces_cerevisiae_EF3_e64.gff3',
]) {
  it(`throws an error when parsing ${errorFile}`, async () => {
    const parsing = readAll(`../test/data/${errorFile}`)
    await expect(parsing).rejects.toMatch(/inconsistent types/)
  })
}
it('can parse a string synchronously', async () => {
  // Load the whole fixture into memory and parse it in a single call.
  const fixturePath = require.resolve('../test/data/spec_eden.gff3')
  const gff3 = await readFile(fixturePath, 'utf8')
  const result = parseStringSync(gff3, {
    parseFeatures: true,
    parseDirectives: true,
    parseComments: true,
  })
  expect(result).toHaveLength(3)

  // The synchronous result is compared with the same stored JSON result that
  // the streaming EDEN test uses.
  const snapshotPath = require.resolve('../test/data/spec_eden.result.json')
  const referenceResult: unknown = JSON.parse(
    await readFile(snapshotPath, 'utf8'),
  )
  expect(result).toEqual(referenceResult)
})
it('can parse some whitespace', () => {
  // GFF3 records have nine tab-separated columns. The separators are written
  // as explicit '\t' joins so they cannot be silently turned into spaces by an
  // editor or formatter; note that the source column itself contains a space,
  // so space-separated columns could never produce the expected result below.
  const columns = [
    'SL2.40%25ch01',
    'IT%25AG eugene',
    'g%25e;ne',
    '80999140',
    '81004317',
    '.',
    '+',
    '.',
    // Whitespace around '=' and after ',' is what this test exercises.
    'multivalue = val1,val2, val3;testing = blah',
  ]
  const gff3 = `\n${columns.join('\t')}\n`

  const result = parseStringSync(gff3, {
    parseFeatures: true,
    parseDirectives: true,
    parseComments: true,
  })
  expect(result).toHaveLength(1)

  // Percent-escapes are decoded and attribute names/values are trimmed.
  const referenceResult = [
    [
      {
        seq_id: 'SL2.40%ch01',
        source: 'IT%AG eugene',
        type: 'g%e;ne',
        start: 80999140,
        end: 81004317,
        score: null,
        strand: '+',
        phase: null,
        attributes: {
          multivalue: ['val1', 'val2', 'val3'],
          testing: ['blah'],
        },
        child_features: [],
        derived_features: [],
      },
    ],
  ]
  expect(result).toEqual(referenceResult)
})
it('can parse another string synchronously', () => {
  // GFF3 records have nine tab-separated columns. The separators are written
  // as explicit '\t' joins so they cannot be silently turned into spaces by an
  // editor or formatter; the round-trip comparison at the end of this test
  // needs the input to match the formatted output character for character.
  const columns = [
    'SL2.40%25ch01',
    'IT%25AG eugene',
    'g%25e;ne',
    '80999140',
    '81004317',
    '.',
    '+',
    '.',
    'Alias=Solyc01g098840;ID=gene:Solyc01g098840.2;Name=Solyc01g098840.2;from_BOGAS=1;length=5178',
  ]
  const gff3 = `\n${columns.join('\t')}\n`

  const result = parseStringSync(gff3, {
    parseFeatures: true,
    parseDirectives: true,
    parseComments: true,
  })
  expect(result).toHaveLength(1)

  const referenceResult = [
    [
      {
        seq_id: 'SL2.40%ch01',
        source: 'IT%AG eugene',
        type: 'g%e;ne',
        start: 80999140,
        end: 81004317,
        score: null,
        strand: '+',
        phase: null,
        attributes: {
          Alias: ['Solyc01g098840'],
          ID: ['gene:Solyc01g098840.2'],
          Name: ['Solyc01g098840.2'],
          from_BOGAS: ['1'],
          length: ['5178'],
        },
        child_features: [],
        derived_features: [],
      },
    ],
  ]
  expect(result).toEqual(referenceResult)

  // Round trip: formatting the parsed feature must reproduce the input text
  // (the leading newline is added back here to match the literal above).
  expect(`\n${formatFeature(referenceResult[0])}`).toEqual(gff3)
})
// Hybrid fixtures: check the sequence items collected from each file.
for (const [filename, expectedOutput] of [
  [
    'hybrid1.gff3',
    [
      {
        id: 'A00469',
        sequence: 'GATTACAGATTACA',
      },
      {
        id: 'zonker',
        sequence:
          'AAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCCAAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCC',
      },
      {
        id: 'zeebo',
        description: 'this is a test description',
        sequence:
          'AAAAACTAGTAGCTAGCTAGCTGATCATAGATCGATGCATGGCATACTGACTGATCGACCCCCC',
      },
    ],
  ],
  [
    'hybrid2.gff3',
    [
      {
        id: 'A00469',
        sequence: 'GATTACAWATTACABATTACAGATTACA',
      },
    ],
  ],
] as const) {
  it(`can parse FASTA sections in hybrid ${filename} file`, async () => {
    const { sequences } = await readAll(`../test/data/${filename}`)
    expect(sequences).toEqual(expectedOutput)
  })
}
})