UNPKG

@gmod/gff

Version:

read and write GFF3 data as streams

437 lines (394 loc) 12 kB
// Fast, low-level functions for parsing and formatting GFF3.
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.

/** Characters that are percent-encoded inside a GFF3 attribute tag or value. */
const ATTRIBUTE_RESERVED_CHARS = /[\n;\r\t=%&,\u0000-\u001f\u007f]/g

/** Characters that are percent-encoded inside GFF3 columns 1-8. */
const COLUMN_RESERVED_CHARS = /[\n\r\t%\u0000-\u001f\u007f]/g

/** A single trailing line terminator (CRLF, CR or LF) at the very end of a string. */
const TRAILING_LINE_TERMINATOR = /(?:\r\n|[\r\n])$/

/** One or more consecutive, syntactically valid percent-escapes (`%XX`). */
const PERCENT_ESCAPE_RUN = /(?:%[0-9A-Fa-f]{2})+/g

/**
 * Unescape a string value used in a GFF3 attribute.
 *
 * Well-formed input is decoded with `decodeURIComponent`. If the input
 * contains a stray or malformed `%` (for example `"100%"`), the valid
 * percent-escapes are still decoded and the malformed parts are returned
 * verbatim instead of throwing a `URIError`.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns An unescaped string value
 */
export function unescape(stringVal: string): string {
  try {
    return decodeURIComponent(stringVal)
  } catch {
    // Decode each run of valid %XX escapes on its own so that one bad
    // sequence does not prevent the rest of the value from being decoded.
    return stringVal.replace(PERCENT_ESCAPE_RUN, (run) => {
      try {
        return decodeURIComponent(run)
      } catch {
        // Valid hex digits but not valid UTF-8: keep the text untouched.
        return run
      }
    })
  }
}

/**
 * Percent-encode every character of `s` matched by `regex`.
 *
 * @param regex - Global regular expression matching the characters to encode
 * @param s - Value to escape; numbers are converted with `String()`
 * @returns The escaped string
 */
function _escape(regex: RegExp, s: string | number) {
  // `replace` with a global regex replaces every match, like `replaceAll`,
  // and resets `lastIndex` first, so sharing the regex objects is safe.
  return String(s).replace(regex, (ch) => encodeURIComponent(ch).toUpperCase())
}

/**
 * Escape a value for use in a GFF3 attribute value.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns An escaped string value
 */
export function escape(rawVal: string | number): string {
  return _escape(ATTRIBUTE_RESERVED_CHARS, rawVal)
}

/**
 * Escape a value for use in a GFF3 column value.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns An escaped column value
 */
export function escapeColumn(rawVal: string | number): string {
  return _escape(COLUMN_RESERVED_CHARS, rawVal)
}

/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * Each `tag=value` pair is split on its FIRST `=` only, so values that
 * contain an unescaped `=` are kept intact. Pairs without `=` or with an
 * empty value are skipped. Values are split on `,`, trimmed and unescaped;
 * repeated tags accumulate their values in order of appearance.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes
 */
export function parseAttributes(attrString: string): GFF3Attributes {
  if (!attrString?.length || attrString === '.') {
    return {}
  }

  // Collect into a Map so that tags such as "constructor" or "__proto__"
  // cannot collide with members inherited from Object.prototype.
  const collected = new Map<string, string[]>()

  const pairs = attrString.replace(TRAILING_LINE_TERMINATOR, '').split(';')
  for (const pair of pairs) {
    const separator = pair.indexOf('=')
    if (separator === -1) {
      continue
    }
    const rawValue = pair.slice(separator + 1)
    if (!rawValue.length) {
      continue
    }
    const tag = pair.slice(0, separator).trim()

    let values = collected.get(tag)
    if (!values) {
      values = []
      collected.set(tag, values)
    }
    for (const piece of rawValue.split(',')) {
      values.push(unescape(piece.trim()))
    }
  }

  return Object.fromEntries(collected)
}

/**
 * Parse a GFF3 feature line
 *
 * Columns that are empty, `.`, or missing altogether (a line with fewer
 * than nine columns) are returned as `null`. Only the seq_id, source and
 * type columns are unescaped.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature
 */
export function parseFeature(line: string): GFF3FeatureLine {
  const rawColumns = line.trim().split('\t')

  // Normalise "no value" (absent column, empty string, '.') to null.
  const column = (index: number): string | null => {
    const value = rawColumns[index]
    return value === undefined || value === '' || value === '.' ? null : value
  }
  const unescapedColumn = (index: number): string | null => {
    const value = column(index)
    return value === null ? null : unescape(value)
  }

  const start = column(3)
  const end = column(4)
  const score = column(5)
  const attributes = column(8)

  return {
    seq_id: unescapedColumn(0),
    source: unescapedColumn(1),
    type: unescapedColumn(2),
    start: start === null ? null : parseInt(start, 10),
    end: end === null ? null : parseInt(end, 10),
    score: score === null ? null : parseFloat(score),
    strand: column(6),
    phase: column(7),
    attributes: attributes === null ? null : parseAttributes(attributes),
  }
}

/**
 * Parse a GFF3 directive line.
 *
 * `sequence-region` and `genome-build` directives get their value parsed
 * into additional fields; every other directive only carries `directive`
 * and, when present, `value`.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` if the line is not a directive
 */
export function parseDirective(
  line: string,
):
  | GFF3Directive
  | GFF3SequenceRegionDirective
  | GFF3GenomeBuildDirective
  | null {
  // `.` never matches line terminators, so the captured contents can never
  // include the trailing newline of the line.
  const match = /^\s*##\s*(\S+)\s*(.*)/.exec(line)
  if (!match) {
    return null
  }

  const [, name, contents] = match

  const parsed: GFF3Directive = { directive: name }
  if (contents.length) {
    parsed.value = contents
  }

  // do a little additional parsing for sequence-region and genome-build directives
  if (name === 'sequence-region') {
    const [seqId, start, end] = contents.split(/\s+/, 3)
    return {
      ...parsed,
      seq_id: seqId,
      // keep only the digits of the coordinates
      start: start?.replace(/\D/g, ''),
      end: end?.replace(/\D/g, ''),
    } as GFF3SequenceRegionDirective
  } else if (name === 'genome-build') {
    const [source, buildName] = contents.split(/\s+/, 2)
    return {
      ...parsed,
      source,
      buildName,
    } as GFF3GenomeBuildDirective
  }

  return parsed
}

/**
 * Format an attributes object into a string suitable for the 9th column of GFF3.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string, or `.` when there are no attributes
 */
export function formatAttributes(attrs: GFF3Attributes): string {
  const pairs = Object.entries(attrs).map(([tag, values]) => {
    const valueString = values.map((value) => escape(value)).join(',')
    return `${escape(tag)}=${valueString}`
  })
  return pairs.length ? pairs.join(';') : '.'
}

/**
 * Format one of the first eight GFF3 columns.
 *
 * @param value - Column value; `null` or `undefined` means "no value"
 * @returns The escaped column text, or `.` when there is no value
 */
function _formatColumn(value: string | number | null | undefined): string {
  return value === null || value === undefined ? '.' : escapeColumn(value)
}

/**
 * Format a single feature line, without its child or derived features.
 *
 * @param f - The feature line to format
 * @param seenFeature - Lines already emitted, keyed by their formatted text;
 *   updated in place
 * @returns The formatted line, or an empty string if the identical line has
 *   already been emitted
 */
function _formatSingleFeature(
  f: GFF3FeatureLine | GFF3FeatureLineWithRefs,
  seenFeature: Record<string, boolean | undefined>,
) {
  const attrString =
    f.attributes === null || f.attributes === undefined
      ? '.'
      : formatAttributes(f.attributes)

  const fields = [
    _formatColumn(f.seq_id),
    _formatColumn(f.source),
    _formatColumn(f.type),
    _formatColumn(f.start),
    _formatColumn(f.end),
    _formatColumn(f.score),
    _formatColumn(f.strand),
    _formatColumn(f.phase),
    attrString,
  ]

  const formattedString = `${fields.join('\t')}\n`

  // if we have already output this exact feature, skip it
  if (seenFeature[formattedString]) {
    return ''
  }

  seenFeature[formattedString] = true
  return formattedString
}

/**
 * Recursively format a feature line (or array of them) followed by its
 * child features and then its derived features.
 *
 * @param feature - A feature line or an array of feature lines
 * @param seenFeature - Lines already emitted; shared across the recursion so
 *   that identical lines are only written once
 * @returns The concatenated GFF3 lines
 */
function _formatFeature(
  feature:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
  seenFeature: Record<string, boolean | undefined>,
): string {
  if (Array.isArray(feature)) {
    return feature.map((f) => _formatFeature(f, seenFeature)).join('')
  }

  const strings = [_formatSingleFeature(feature, seenFeature)]
  if (_isFeatureLineWithRefs(feature)) {
    for (const child of feature.child_features) {
      strings.push(_formatFeature(child, seenFeature))
    }
    for (const derived of feature.derived_features) {
      strings.push(_formatFeature(derived, seenFeature))
    }
  }
  return strings.join('')
}

/**
 * Format a feature object or array of feature objects into one or more lines of
 * GFF3.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
export function formatFeature(
  featureOrFeatures:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
): string {
  const seen: Record<string, boolean | undefined> = {}
  return _formatFeature(featureOrFeatures, seen)
}

/**
 * Format a directive into a line of GFF3.
 *
 * @param directive - A directive object
 * @returns A directive line string
 */
export function formatDirective(directive: GFF3Directive): string {
  let str = `##${directive.directive}`
  if (directive.value) {
    str += ` ${directive.value}`
  }
  str += '\n'
  return str
}

/**
 * Format a comment into a GFF3 comment.
 * Yes I know this is just adding a # and a newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
export function formatComment(comment: GFF3Comment): string {
  return `# ${comment.comment}\n`
}

/**
 * Format a sequence object as FASTA
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
export function formatSequence(seq: GFF3Sequence): string {
  const header = `>${seq.id}${seq.description ? ` ${seq.description}` : ''}\n`

  // split sequence chunks into lines of length 80 for embedded FASTA
  const lineLength = 80
  const numChunks = Math.ceil(seq.sequence.length / lineLength)
  const chunks: string[] = new Array(numChunks)
  for (let i = 0; i < numChunks; i += 1) {
    const start = i * lineLength
    chunks[i] = seq.sequence.slice(start, start + lineLength)
  }

  return `${header}${chunks.join('\n')}\n`
}

/**
 * Format one item by dispatching on the property that identifies its kind
 * (`attributes`, `directive`, `sequence` or `comment`).
 *
 * @param item - A feature line, directive, comment or sequence
 * @returns The formatted text, or a placeholder comment line for
 *   unrecognised items
 */
function formatSingleItem(
  item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence,
) {
  if ('attributes' in item) {
    return formatFeature(item)
  }
  if ('directive' in item) {
    return formatDirective(item)
  }
  if ('sequence' in item) {
    return formatSequence(item)
  }
  if ('comment' in item) {
    return formatComment(item)
  }
  return '# (invalid item found during format)\n'
}

/**
 * Format a directive, comment, sequence, or feature, or array of such items,
 * into one or more lines of GFF3.
 *
 * @param itemOrItems - A comment, sequence, or feature, or array of such items
 * @returns A formatted string or array of strings
 */
export function formatItem(
  item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence,
): string
export function formatItem(
  items: (
    | GFF3FeatureLineWithRefs
    | GFF3Directive
    | GFF3Comment
    | GFF3Sequence
  )[],
): string[]
export function formatItem(
  itemOrItems:
    | GFF3FeatureLineWithRefs
    | GFF3Directive
    | GFF3Comment
    | GFF3Sequence
    | (GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence)[],
) {
  if (Array.isArray(itemOrItems)) {
    return itemOrItems.map((item) => formatSingleItem(item))
  }
  return formatSingleItem(itemOrItems)
}

/** A record of GFF3 attribute identifiers and the values of those identifiers */
export type GFF3Attributes = Record<string, string[]>

/** A representation of a single line of a GFF3 file */
export interface GFF3FeatureLine {
  /** The ID of the landmark used to establish the coordinate system for the current feature */
  seq_id: string | null
  /** A free text qualifier intended to describe the algorithm or operating procedure that generated this feature */
  source: string | null
  /** The type of the feature */
  type: string | null
  /** The start coordinates of the feature */
  start: number | null
  /** The end coordinates of the feature */
  end: number | null
  /** The score of the feature */
  score: number | null
  /** The strand of the feature */
  strand: string | null
  /** For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end of the current CDS feature */
  phase: string | null
  /** Feature attributes */
  attributes: GFF3Attributes | null
}

/**
 * A GFF3 Feature line that includes references to other features defined in
 * their "Parent" or "Derives_from" attributes
 */
export interface GFF3FeatureLineWithRefs extends GFF3FeatureLine {
  /** An array of child features */
  child_features: GFF3Feature[]
  /** An array of features derived from this feature */
  derived_features: GFF3Feature[]
}

/**
 * Type guard: true when the feature line carries both the `child_features`
 * and `derived_features` properties.
 *
 * @param featureLine - The feature line to test
 * @returns Whether the line is a `GFF3FeatureLineWithRefs`
 */
function _isFeatureLineWithRefs(
  featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
): featureLine is GFF3FeatureLineWithRefs {
  return (
    (featureLine as GFF3FeatureLineWithRefs).child_features !== undefined &&
    (featureLine as GFF3FeatureLineWithRefs).derived_features !== undefined
  )
}

/**
 * A GFF3 feature, which may include multiple individual feature lines
 */
export type GFF3Feature = GFF3FeatureLineWithRefs[]

/** A GFF3 directive */
export interface BaseGFF3Directive {
  /** The name of the directive */
  directive: string
  /** The string value of the directive */
  value?: string
}

/** A GFF3 sequence-region directive */
export interface GFF3SequenceRegionDirective extends BaseGFF3Directive {
  /** The string value of the directive */
  value: string
  /** The sequence ID parsed from the directive */
  seq_id: string
  /** The sequence start parsed from the directive */
  start: string
  /** The sequence end parsed from the directive */
  end: string
}

/** A GFF3 genome-build directive */
export interface GFF3GenomeBuildDirective extends BaseGFF3Directive {
  /** The string value of the directive */
  value: string
  /** The genome build source parsed from the directive */
  source: string
  /** The genome build name parsed from the directive */
  buildName: string
}

export type GFF3Directive =
  | BaseGFF3Directive
  | GFF3SequenceRegionDirective
  | GFF3GenomeBuildDirective

/** A GFF3 comment */
export interface GFF3Comment {
  /** The text of the comment */
  comment: string
}

/** A GFF3 FASTA single sequence */
export interface GFF3Sequence {
  /** The ID of the sequence */
  id: string
  /** The description of the sequence */
  description?: string
  /** The sequence */
  sequence: string
}

export type GFF3Item = GFF3Feature | GFF3Directive | GFF3Comment | GFF3Sequence