// @gmod/gff — read and write GFF3 data as streams
// (listing metadata: 437 lines, 394 loc, 12 kB, text/typescript)
// Fast, low-level functions for parsing and formatting GFF3.
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
/**
 * Unescape a string value used in a GFF3 attribute.
 *
 * Percent-encoded sequences are decoded with `decodeURIComponent`. That
 * function throws a `URIError` when the input contains a malformed escape
 * (for example a literal `%` that is not followed by two hex digits). In that
 * case each well-formed run of `%XX` escapes is decoded on its own and all
 * other text is returned untouched, so a single stray `%` does not abort
 * parsing of the whole line.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns An unescaped string value
 */
export function unescape(stringVal: string): string {
  try {
    return decodeURIComponent(stringVal)
  } catch {
    // Malformed input: decode only the runs that look like valid escapes.
    return stringVal.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run)
      } catch {
        // the run is syntactically fine but does not decode; keep it verbatim
        return run
      }
    })
  }
}
/**
 * Percent-encode every part of a value that matches a pattern.
 *
 * @param regex - Pattern selecting the characters to encode; it is handed to
 * `String.prototype.replaceAll`, so it must carry the `g` flag
 * @param s - Value to escape; converted with `String()` before matching
 * @returns The string form of `s` in which each match has been replaced by its
 * upper-cased `encodeURIComponent` encoding
 */
function _escape(regex: RegExp, s: string | number) {
  const percentEncode = (ch: string) => encodeURIComponent(ch).toUpperCase()
  const text = String(s)
  return text.replaceAll(regex, percentEncode)
}
/**
 * Escape a value for use in a GFF3 attribute value.
 *
 * The characters `;`, `=`, `%`, `&`, `,`, tab, newline, carriage return, and
 * the control characters U+0000–U+001F and U+007F are percent-encoded; all
 * other characters pass through unchanged.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns An escaped string value
 */
export function escape(rawVal: string | number): string {
  const attributeUnsafe = /[\n;\r\t=%&,\u0000-\u001f\u007f]/g
  return _escape(attributeUnsafe, rawVal)
}
/**
 * Escape a value for use in a GFF3 column value.
 *
 * Only `%`, tab, newline, carriage return, and the control characters
 * U+0000–U+001F and U+007F are percent-encoded; all other characters pass
 * through unchanged.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns An escaped column value
 */
export function escapeColumn(rawVal: string | number): string {
  const columnUnsafe = /[\n\r\t%\u0000-\u001f\u007f]/g
  return _escape(columnUnsafe, rawVal)
}
/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * The column is split on `;` into `tag=value` pairs. Each pair is split on
 * its FIRST `=` only, so a value that itself contains `=` is kept whole.
 * Values are split on `,`, trimmed, and unescaped; tags are trimmed but not
 * unescaped. Pairs without `=` or with an empty value are skipped, and a tag
 * that occurs more than once accumulates all of its values in order.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes; an empty object when the column is missing,
 * empty, or `.`
 */
export function parseAttributes(attrString: string): GFF3Attributes {
  if (!attrString?.length || attrString === '.') {
    return {}
  }

  // Accumulate in a Map so that tags named like Object.prototype members
  // ("constructor", "toString", "__proto__", ...) are ordinary keys rather
  // than lookups of inherited properties.
  const collected = new Map<string, string[]>()

  // drop one trailing line terminator, if present
  const body = attrString.replace(/(?:\r\n|\r|\n)$/, '')

  for (const pair of body.split(';')) {
    const separator = pair.indexOf('=')
    if (separator === -1) {
      continue
    }
    const rawValue = pair.slice(separator + 1)
    if (!rawValue.length) {
      continue
    }
    const tag = pair.slice(0, separator).trim()
    let values = collected.get(tag)
    if (!values) {
      values = []
      collected.set(tag, values)
    }
    for (const piece of rawValue.split(',')) {
      values.push(unescape(piece.trim()))
    }
  }

  // Object.fromEntries defines own properties, so even "__proto__" is stored
  // as data instead of replacing the result's prototype.
  return Object.fromEntries(collected)
}
/**
 * Parse a GFF3 feature line
 *
 * The line is trimmed and split on tabs. A column that is `.`, empty, or
 * missing altogether (a line with fewer than nine columns) is reported as
 * `null`. The seq_id, source, and type columns are unescaped; start and end
 * are parsed as base-10 integers and score as a float; strand and phase are
 * kept as their raw column text; the 9th column is parsed with
 * `parseAttributes`.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature
 */
export function parseFeature(line: string): GFF3FeatureLine {
  const columns = line.trim().split('\t')

  // A column that is absent, empty, or '.' carries no value.
  const column = (index: number): string | null => {
    const raw = columns[index]
    return raw === undefined || raw === '' || raw === '.' ? null : raw
  }
  // only the seq_id, source, and type columns are unescaped
  const decoded = (index: number): string | null => {
    const raw = column(index)
    return raw === null ? null : unescape(raw)
  }
  const integer = (index: number): number | null => {
    const raw = column(index)
    return raw === null ? null : parseInt(raw, 10)
  }

  const score = column(5)
  const attributes = column(8)
  return {
    seq_id: decoded(0),
    source: decoded(1),
    type: decoded(2),
    start: integer(3),
    end: integer(4),
    score: score === null ? null : parseFloat(score),
    strand: column(6),
    phase: column(7),
    attributes: attributes === null ? null : parseAttributes(attributes),
  }
}
/**
 * Parse a GFF3 directive line.
 *
 * A directive is `##`, optionally surrounded by whitespace, followed by a
 * name and optional contents. When contents are present they are stored as
 * `value`. `sequence-region` directives additionally get `seq_id`, `start`,
 * and `end` (start/end with every non-digit character removed), and
 * `genome-build` directives get `source` and `buildName`, all taken from the
 * whitespace-separated contents.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` when the line is not a directive
 */
export function parseDirective(
  line: string,
):
  | GFF3Directive
  | GFF3SequenceRegionDirective
  | GFF3GenomeBuildDirective
  | null {
  const found = /^\s*##\s*(\S+)\s*(.*)/.exec(line)
  if (found === null) {
    return null
  }

  const name = found[1]
  let contents = found[2]
  const base: GFF3Directive = { directive: name }
  if (contents.length > 0) {
    // defensive strip of a line ending; `.` in the pattern above already
    // stops at line terminators
    contents = contents.replace(/\r\n|[\r\n]$/, '')
    base.value = contents
  }

  // sequence-region and genome-build get a little additional parsing
  switch (name) {
    case 'sequence-region': {
      const [seqId, rawStart, rawEnd] = contents.split(/\s+/, 3)
      return {
        ...base,
        seq_id: seqId,
        start: rawStart?.replace(/\D/g, ''),
        end: rawEnd?.replace(/\D/g, ''),
      } as GFF3SequenceRegionDirective
    }
    case 'genome-build': {
      const [source, buildName] = contents.split(/\s+/, 2)
      return { ...base, source, buildName } as GFF3GenomeBuildDirective
    }
    default:
      return base
  }
}
/**
 * Format an attributes object into a string suitable for the 9th column of GFF3.
 *
 * Every tag becomes `tag=value1,value2,...` with the tag and each value
 * escaped, and the pairs are joined with `;` in the object's own enumeration
 * order.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string, or `.` when there are no attributes
 */
export function formatAttributes(attrs: GFF3Attributes): string {
  const pairs = Object.entries(attrs).map(([tag, values]) => {
    const joinedValues = values.map(escape).join(',')
    return `${escape(tag)}=${joinedValues}`
  })
  return pairs.length > 0 ? pairs.join(';') : '.'
}
/**
 * Format one feature line (without its children) as a line of GFF3.
 *
 * A column whose value is `null` or `undefined` is written as `.`; every
 * other column value is passed through `escapeColumn`. The attributes column
 * follows the same rule and is otherwise produced by `formatAttributes`.
 *
 * @param f - The feature line to format
 * @param seenFeature - Registry of lines already emitted, keyed by the
 * formatted line; updated in place
 * @returns The formatted line ending in a newline, or an empty string when an
 * identical line is already recorded in `seenFeature`
 */
function _formatSingleFeature(
  f: GFF3FeatureLine | GFF3FeatureLineWithRefs,
  seenFeature: Record<string, boolean | undefined>,
) {
  // Same missing-value rule for every column, so an absent field can never be
  // stringified as "undefined".
  const column = (value: string | number | null | undefined): string =>
    value === null || value === undefined ? '.' : escapeColumn(value)

  const attrString =
    f.attributes === null || f.attributes === undefined
      ? '.'
      : formatAttributes(f.attributes)

  const fields = [
    column(f.seq_id),
    column(f.source),
    column(f.type),
    column(f.start),
    column(f.end),
    column(f.score),
    column(f.strand),
    column(f.phase),
    attrString,
  ]
  const formattedString = `${fields.join('\t')}\n`

  // if we have already output this exact feature, skip it
  if (seenFeature[formattedString]) {
    return ''
  }
  seenFeature[formattedString] = true
  return formattedString
}
/**
 * Recursively format a feature line, or an array of them, together with any
 * child and derived features.
 *
 * Output order is: the line itself, then its `child_features`, then its
 * `derived_features`, each formatted recursively.
 *
 * @param feature - A feature line or an array of feature lines
 * @param seenFeature - Registry of lines already emitted, shared across the
 * whole recursion and passed on to `_formatSingleFeature`
 * @returns The concatenated GFF3 lines
 */
function _formatFeature(
  feature:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
  seenFeature: Record<string, boolean | undefined>,
): string {
  if (Array.isArray(feature)) {
    return feature.reduce(
      (text, member) => text + _formatFeature(member, seenFeature),
      '',
    )
  }

  let output = _formatSingleFeature(feature, seenFeature)
  if (_isFeatureLineWithRefs(feature)) {
    output = feature.child_features.reduce(
      (text, child) => text + _formatFeature(child, seenFeature),
      output,
    )
    output = feature.derived_features.reduce(
      (text, derived) => text + _formatFeature(derived, seenFeature),
      output,
    )
  }
  return output
}
/**
 * Format a feature object or array of feature objects into one or more lines of
 * GFF3.
 *
 * Each call starts with an empty registry of emitted lines, which is handed to
 * the recursive formatter together with the input.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
export function formatFeature(
  featureOrFeatures:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
): string {
  const alreadyEmitted: Record<string, boolean | undefined> = {}
  return _formatFeature(featureOrFeatures, alreadyEmitted)
}
/**
 * Format a directive into a line of GFF3.
 *
 * The line is `##` plus the directive name; when `value` is non-empty it is
 * appended after a single space.
 *
 * @param directive - A directive object
 * @returns A directive line string, terminated by a newline
 */
export function formatDirective(directive: GFF3Directive): string {
  const head = `##${directive.directive}`
  const tail = directive.value ? ` ${directive.value}` : ''
  return `${head}${tail}\n`
}
/**
 * Format a comment into a GFF3 comment line.
 *
 * The comment text is prefixed with `# ` and terminated with a newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
export function formatComment(comment: GFF3Comment): string {
  const { comment: text } = comment
  return `# ${text}\n`
}
/**
 * Format a sequence object as FASTA
 *
 * The header line is `>` plus the id, followed by a space and the description
 * when one is given. The sequence is wrapped into lines of at most 80
 * characters for embedded FASTA.
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
export function formatSequence(seq: GFF3Sequence): string {
  const lineLength = 80
  const descriptionPart = seq.description ? ` ${seq.description}` : ''

  const lines: string[] = []
  for (let offset = 0; offset < seq.sequence.length; offset += lineLength) {
    lines.push(seq.sequence.slice(offset, offset + lineLength))
  }

  return `>${seq.id}${descriptionPart}\n${lines.join('\n')}\n`
}
/**
 * Format one item by dispatching on which identifying property it carries.
 *
 * The properties are checked in this order: `attributes` (feature),
 * `directive`, `sequence`, `comment`. An item with none of them is rendered
 * as a placeholder comment line.
 *
 * @param item - A feature line, directive, comment, or sequence
 * @returns The formatted GFF3 text for the item
 */
function formatSingleItem(
  item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence,
) {
  let formatted: string
  if ('attributes' in item) {
    formatted = formatFeature(item)
  } else if ('directive' in item) {
    formatted = formatDirective(item)
  } else if ('sequence' in item) {
    formatted = formatSequence(item)
  } else if ('comment' in item) {
    formatted = formatComment(item)
  } else {
    formatted = '# (invalid item found during format)\n'
  }
  return formatted
}
/**
 * Format a directive, comment, sequence, or feature, or array of such items,
 * into one or more lines of GFF3.
 *
 * @param itemOrItems - A directive, comment, sequence, or feature, or array of
 * such items
 * @returns A formatted string for a single item, or an array with one
 * formatted string per element when given an array
 */
export function formatItem(
  item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence,
): string
export function formatItem(
  items: (
    | GFF3FeatureLineWithRefs
    | GFF3Directive
    | GFF3Comment
    | GFF3Sequence
  )[],
): string[]
export function formatItem(
  itemOrItems:
    | GFF3FeatureLineWithRefs
    | GFF3Directive
    | GFF3Comment
    | GFF3Sequence
    | (GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence)[],
) {
  return Array.isArray(itemOrItems)
    ? itemOrItems.map(formatSingleItem)
    : formatSingleItem(itemOrItems)
}
/**
 * A record of GFF3 attribute identifiers and the values of those identifiers.
 *
 * Every tag maps to an array of values: `parseAttributes` splits a value on
 * `,` and appends to the same array when a tag occurs more than once, and
 * `formatAttributes` joins the array back together with `,`.
 */
export type GFF3Attributes = Record<string, string[]>
/**
 * A representation of a single line of a GFF3 file.
 *
 * A field is `null` when `parseFeature` finds the corresponding column to be
 * `.` or empty; when formatting, a `null` field is written back as `.`.
 */
export interface GFF3FeatureLine {
/** The ID of the landmark used to establish the coordinate system for the current feature (unescaped by `parseFeature`) */
seq_id: string | null
/** A free text qualifier intended to describe the algorithm or operating procedure that generated this feature (unescaped by `parseFeature`) */
source: string | null
/** The type of the feature (unescaped by `parseFeature`) */
type: string | null
/** The start coordinates of the feature, parsed as a base-10 integer */
start: number | null
/** The end coordinates of the feature, parsed as a base-10 integer */
end: number | null
/** The score of the feature, parsed as a floating point number */
score: number | null
/** The strand of the feature, kept as the raw column text */
strand: string | null
/** For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end of the current CDS feature; kept as the raw column text */
phase: string | null
/** Feature attributes parsed from the 9th column */
attributes: GFF3Attributes | null
}
/**
 * A GFF3 Feature line that includes references to other features defined in
 * their "Parent" or "Derives_from" attributes.
 *
 * When formatted, the line itself is written first, followed by everything in
 * `child_features` and then everything in `derived_features`.
 */
export interface GFF3FeatureLineWithRefs extends GFF3FeatureLine {
/** An array of child features; each entry is a GFF3Feature, i.e. an array of feature lines */
child_features: GFF3Feature[]
/** An array of features derived from this feature; each entry is a GFF3Feature, i.e. an array of feature lines */
derived_features: GFF3Feature[]
}
/**
 * Type guard telling a feature line with references apart from a plain one.
 *
 * @param featureLine - The feature line to inspect
 * @returns `true` only when both `child_features` and `derived_features` are
 * not `undefined`
 */
function _isFeatureLineWithRefs(
  featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
): featureLine is GFF3FeatureLineWithRefs {
  const candidate = featureLine as GFF3FeatureLineWithRefs
  if (candidate.child_features === undefined) {
    return false
  }
  return candidate.derived_features !== undefined
}
/**
 * A GFF3 feature, which may include multiple individual feature lines.
 * It is represented as an array holding one GFF3FeatureLineWithRefs per line.
 */
export type GFF3Feature = GFF3FeatureLineWithRefs[]
/** A GFF3 directive: the fields shared by every kind of directive */
export interface BaseGFF3Directive {
/** The name of the directive, i.e. the first non-whitespace token after `##` */
directive: string
/** The string value of the directive; left unset by `parseDirective` when nothing follows the name */
value?: string
}
/**
 * A GFF3 sequence-region directive.
 *
 * `parseDirective` fills `seq_id`, `start`, and `end` from the first three
 * whitespace-separated tokens of the directive's contents.
 */
export interface GFF3SequenceRegionDirective extends BaseGFF3Directive {
/** The string value of the directive */
value: string
/** The sequence ID parsed from the directive */
seq_id: string
/** The sequence start parsed from the directive, kept as a string with all non-digit characters removed */
start: string
/** The sequence end parsed from the directive, kept as a string with all non-digit characters removed */
end: string
}
/**
 * A GFF3 genome-build directive.
 *
 * `parseDirective` fills `source` and `buildName` from the first two
 * whitespace-separated tokens of the directive's contents.
 */
export interface GFF3GenomeBuildDirective extends BaseGFF3Directive {
/** The string value of the directive */
value: string
/** The genome build source parsed from the directive */
source: string
/** The genome build name parsed from the directive */
buildName: string
}
/**
 * Any GFF3 directive: a generic directive, or one of the two kinds
 * (sequence-region, genome-build) that carry additional parsed fields.
 */
export type GFF3Directive =
| BaseGFF3Directive
| GFF3SequenceRegionDirective
| GFF3GenomeBuildDirective
/** A GFF3 comment */
export interface GFF3Comment {
/** The text of the comment; `formatComment` writes it after a leading `# ` */
comment: string
}
/** A GFF3 FASTA single sequence */
export interface GFF3Sequence {
/** The ID of the sequence, written directly after `>` in the FASTA header */
id: string
/** The description of the sequence, written after the ID and a space when present */
description?: string
/** The sequence; `formatSequence` wraps it into lines of 80 characters */
sequence: string
}
/** Any single item of GFF3 data: a feature, a directive, a comment, or a FASTA sequence */
export type GFF3Item = GFF3Feature | GFF3Directive | GFF3Comment | GFF3Sequence