UNPKG

@gmod/gff

Version:

read and write GFF3 data as streams

305 lines (280 loc) 8.7 kB
import { createReadStream } from 'fs'
import { readFile } from 'fs/promises'
import { Readable } from 'stream'
import { TransformStream } from 'stream/web'

import { GFFTransformer, parseStringSync } from './api'
import { formatFeature } from './util'
import type {
  GFF3Feature,
  GFF3Comment,
  GFF3Directive,
  GFF3Sequence,
} from './util'

/** Everything emitted while parsing one GFF3 file, bucketed by item kind. */
interface ReadAllResults {
  features: GFF3Feature[]
  comments: GFF3Comment[]
  directives: GFF3Directive[]
  sequences: GFF3Sequence[]
  /** Every emitted item, in the order the stream produced it. */
  all: (GFF3Feature | GFF3Comment | GFF3Directive | GFF3Sequence)[]
}

/**
 * Streams a GFF3 file through a `GFFTransformer` and collects every item it
 * emits.
 *
 * @param filename - Module-style path to the file; it is resolved with
 *   `require.resolve`, so relative paths are relative to this test file.
 * @param args - Extra transformer options; these override the defaults below
 *   (all four `parse*` flags enabled, `bufferSize` of 10).
 * @returns The emitted items, both in emission order (`all`) and sorted into
 *   per-kind arrays.
 */
async function readAll(
  filename: string,
  args: Record<string, unknown> = {},
): Promise<ReadAllResults> {
  const stuff: ReadAllResults = {
    features: [],
    comments: [],
    directives: [],
    sequences: [],
    all: [],
  }

  const stream = Readable.toWeb(createReadStream(require.resolve(filename)))
  const transformer = new GFFTransformer({
    parseFeatures: true,
    parseDirectives: true,
    parseComments: true,
    parseSequences: true,
    bufferSize: 10,
    ...args,
  })
  const transformStream = new TransformStream(transformer)
  const gffStream = stream.pipeThrough(transformStream)

  for await (const value of gffStream) {
    stuff.all.push(value)
    // Items are told apart by their distinguishing key; anything without one
    // of these keys is treated as a feature.
    if ('directive' in value) {
      stuff.directives.push(value)
    } else if ('comment' in value) {
      stuff.comments.push(value)
    } else if ('sequence' in value) {
      stuff.sequences.push(value)
    } else {
      stuff.features.push(value)
    }
  }
  return stuff
}

/**
 * Reads and parses a JSON fixture.
 *
 * @param modulePath - Module-style path, resolved with `require.resolve`
 *   (relative paths are relative to this test file).
 * @returns The parsed value, typed `unknown` because it is only ever compared
 *   with `toEqual`.
 */
async function readJSON(modulePath: string): Promise<unknown> {
  const parsed: unknown = JSON.parse(
    await readFile(require.resolve(modulePath), 'utf8'),
  )
  return parsed
}

/**
 * Builds a one-line GFF3 document: a leading newline, the given columns
 * joined by tabs, and a trailing newline. Joining with an explicit `'\t'`
 * keeps the column delimiters visible and safe from whitespace-mangling
 * editors or tooling.
 */
function singleLineGFF3(columns: readonly string[]): string {
  return `\n${columns.join('\t')}\n`
}

describe('GFF3 parser', () => {
  it('can parse gff3_with_syncs.gff3', async () => {
    const stuff = await readAll('../test/data/gff3_with_syncs.gff3')
    const referenceResult = await readJSON(
      '../test/data/gff3_with_syncs.result.json',
    )
    expect(stuff.all).toEqual(referenceResult)
  })

  // [expected number of emitted items, fixture file name]
  const expectedItemCounts = [
    [1010, 'messy_protein_domains.gff3'],
    [4, 'gff3_with_syncs.gff3'],
    [51, 'au9_scaffold_subset.gff3'],
    [14, 'tomato_chr4_head.gff3'],
    [5, 'directives.gff3'],
    [6, 'hybrid1.gff3'],
    [3, 'hybrid2.gff3'],
    [6, 'knownGene.gff3'],
    [6, 'knownGene2.gff3'],
    [16, 'tomato_test.gff3'],
    [3, 'spec_eden.gff3'],
    [1, 'spec_match.gff3'],
    [8, 'quantitative.gff3'],
  ] as const

  for (const [count, filename] of expectedItemCounts) {
    it(`can cursorily parse ${filename}`, async () => {
      const stuff = await readAll(`../test/data/${filename}`)
      expect(stuff.all).toHaveLength(count)
    })
  }

  it('supports children before parents, and Derives_from', async () => {
    const stuff = await readAll('../test/data/knownGene_out_of_order.gff3')
    // NOTE(review): the original carried the comment `$p->max_lookback(2);`
    // here, which looks like a leftover Perl call with no TypeScript
    // equivalent in this test — confirm and drop.
    const expectedOutput = await readJSON(
      '../test/data/knownGene_out_of_order.result.json',
    )
    expect(stuff.all).toEqual(expectedOutput)
  })

  it('can parse the EDEN gene from the gff3 spec', async () => {
    const stuff = await readAll('../test/data/spec_eden.gff3')
    expect(stuff.all[2]).toHaveLength(1)
    const [eden] = stuff.all[2] as GFF3Feature

    expect(eden.child_features).toHaveLength(4)
    expect(eden.child_features[0][0].type).toEqual('TF_binding_site')

    // all the rest are mRNAs
    const mrnas = eden.child_features.slice(1, 4)
    expect(mrnas.filter((m) => m.length === 1)).toHaveLength(3)
    const mrnaLines = mrnas.map((m) => {
      expect(m).toHaveLength(1)
      return m[0]
    })
    mrnaLines.forEach((m) => {
      expect(m.type).toEqual('mRNA')
    })

    // check that all the mRNAs share the last exon
    const lastExon = mrnaLines[2].child_features[3]
    expect(mrnaLines[0].child_features).toContain(lastExon)
    expect(mrnaLines[1].child_features).toContain(lastExon)
    expect(mrnaLines[2].child_features).toContain(lastExon)
    expect(mrnaLines[0].child_features).toHaveLength(5)
    expect(mrnaLines[1].child_features).toHaveLength(4)
    expect(mrnaLines[2].child_features).toHaveLength(6)

    const referenceResult = await readJSON(
      '../test/data/spec_eden.result.json',
    )
    expect(stuff.all).toEqual(referenceResult)
  })

  it('can parse an excerpt of the refGene gff3', async () => {
    const stuff = await readAll('../test/data/refGene_excerpt.gff3')
    expect(stuff.all).toHaveLength(2)
  })

  it('can parse an excerpt of the TAIR10 gff3', async () => {
    const stuff = await readAll('../test/data/tair10.gff3')
    expect(stuff.all).toHaveLength(3)
  })

  // Large fixture: the per-test timeout is raised to 10 seconds.
  it('can parse chr1 TAIR10 gff3', async () => {
    const stuff = await readAll('../test/data/tair10_chr1.gff', {
      disableDerivesFromReferences: true,
    })
    expect(stuff.all).toHaveLength(17697)
  }, 10000)

  // check that some files throw a parse error
  const errorFiles = [
    'mm9_sample_ensembl.gff3',
    'Saccharomyces_cerevisiae_EF3_e64.gff3',
  ] as const

  for (const errorFile of errorFiles) {
    it(`throws an error when parsing ${errorFile}`, async () => {
      // `toMatch` only accepts strings, so this relies on the stream
      // rejecting with a string message rather than an Error object.
      await expect(readAll(`../test/data/${errorFile}`)).rejects.toMatch(
        /inconsistent types/,
      )
    })
  }

  it('can parse a string synchronously', async () => {
    const gff3 = await readFile(
      require.resolve('../test/data/spec_eden.gff3'),
      'utf8',
    )
    const result = parseStringSync(gff3, {
      parseFeatures: true,
      parseDirectives: true,
      parseComments: true,
    })
    expect(result).toHaveLength(3)

    const referenceResult = await readJSON(
      '../test/data/spec_eden.result.json',
    )
    expect(result).toEqual(referenceResult)
  })

  it('can parse some whitespace', () => {
    // The attribute column deliberately has stray spaces around `=` and after
    // `,`; the expected result below shows them stripped.
    // NOTE(review): the column delimiters are reconstructed as single tabs —
    // confirm against the upstream fixture if exact whitespace matters.
    const gff3 = singleLineGFF3([
      'SL2.40%25ch01',
      'IT%25AG eugene',
      'g%25e;ne',
      '80999140',
      '81004317',
      '.',
      '+',
      '.',
      'multivalue = val1,val2, val3;testing = blah',
    ])
    const result = parseStringSync(gff3, {
      parseFeatures: true,
      parseDirectives: true,
      parseComments: true,
    })
    expect(result).toHaveLength(1)

    const referenceResult = [
      [
        {
          seq_id: 'SL2.40%ch01',
          source: 'IT%AG eugene',
          type: 'g%e;ne',
          start: 80999140,
          end: 81004317,
          score: null,
          strand: '+',
          phase: null,
          attributes: {
            multivalue: ['val1', 'val2', 'val3'],
            testing: ['blah'],
          },
          child_features: [],
          derived_features: [],
        },
      ],
    ]
    expect(result).toEqual(referenceResult)
  })

  it('can parse another string synchronously', () => {
    const gff3 = singleLineGFF3([
      'SL2.40%25ch01',
      'IT%25AG eugene',
      'g%25e;ne',
      '80999140',
      '81004317',
      '.',
      '+',
      '.',
      'Alias=Solyc01g098840;ID=gene:Solyc01g098840.2;Name=Solyc01g098840.2;from_BOGAS=1;length=5178',
    ])
    const result = parseStringSync(gff3, {
      parseFeatures: true,
      parseDirectives: true,
      parseComments: true,
    })
    expect(result).toHaveLength(1)

    const referenceResult = [
      [
        {
          seq_id: 'SL2.40%ch01',
          source: 'IT%AG eugene',
          type: 'g%e;ne',
          start: 80999140,
          end: 81004317,
          score: null,
          strand: '+',
          phase: null,
          attributes: {
            Alias: ['Solyc01g098840'],
            ID: ['gene:Solyc01g098840.2'],
            Name: ['Solyc01g098840.2'],
            from_BOGAS: ['1'],
            length: ['5178'],
          },
          child_features: [],
          derived_features: [],
        },
      ],
    ]
    expect(result).toEqual(referenceResult)
    // Round trip: formatting the parsed feature must reproduce the input text.
    expect(`\n${formatFeature(referenceResult[0])}`).toEqual(gff3)
  })

  // [fixture file name, sequences expected from its FASTA section]
  const hybridFastaCases = [
    [
      'hybrid1.gff3',
      [
        {
          id: 'A00469',
          sequence: 'GATTACAGATTACA',
        },
        {
          id: 'zonker',
          sequence:
            'AAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCCAAAAAACTAGCATGATCGATCGATCGATCGATATTAGCATGCATGCATGATGATGATAGCTATGATCGATCCCCCCC',
        },
        {
          id: 'zeebo',
          description: 'this is a test description',
          sequence:
            'AAAAACTAGTAGCTAGCTAGCTGATCATAGATCGATGCATGGCATACTGACTGATCGACCCCCC',
        },
      ],
    ],
    [
      'hybrid2.gff3',
      [
        {
          id: 'A00469',
          sequence: 'GATTACAWATTACABATTACAGATTACA',
        },
      ],
    ],
  ] as const

  for (const [filename, expectedOutput] of hybridFastaCases) {
    it(`can parse FASTA sections in hybrid ${filename} file`, async () => {
      const stuff = await readAll(`../test/data/${filename}`)
      expect(stuff.sequences).toEqual(expectedOutput)
    })
  }
})