// @gmod/bam
// Version:
// Parser for BAM and BAM index (bai) files
// 269 lines (244 loc) • 7.19 kB
// text/typescript
import Chunk from './chunk.ts'
import { longFromBytesToUnsigned } from './long.ts'
import type { Offset, VirtualOffset } from './virtualOffset.ts'
// Tag-based record filter: names a tag and, optionally, the value it is
// compared against.
export interface TagFilter {
// Key used to look the value up in a record's `tags` map (see applyFilters).
tag: string
// Value the tag is compared against after both sides are stringified (see
// filterTagValue). The literal '*' is special-cased there: a record is then
// rejected only when its tag value is undefined.
value?: string
}
// Record filtering criteria consumed by applyFilters and filterCacheKey.
export interface FilterBy {
// Bitmask of flag bits that must ALL be set for a record to be kept; treated
// as 0 when omitted.
flagInclude?: number
// Bitmask of flag bits of which NONE may be set for a record to be kept;
// treated as 0 when omitted.
flagExclude?: number
// Optional tag-based criterion, applied in addition to the flag masks.
tagFilter?: TagFilter
}
// Options bag for BAM reads. None of these fields are consumed in this file.
// NOTE(review): the per-field notes below are inferred from the field names
// and types only — confirm against the code that reads them.
export interface BamOpts {
// presumably: also return mates of the reads found — TODO confirm
viewAsPairs?: boolean
// presumably: allow mate lookup on a different reference — TODO confirm
pairAcrossChr?: boolean
// presumably: upper bound on insert size used for pairing — TODO confirm
maxInsertSize?: number
// presumably used to cancel in-flight work — TODO confirm
signal?: AbortSignal
// Record filtering criteria (see FilterBy).
filterBy?: FilterBy
}
// Minimal options bag carrying only an optional AbortSignal. Not consumed in
// this file.
export interface BaseOpts {
// presumably used to cancel in-flight work — TODO confirm against callers
signal?: AbortSignal
}
// Order chunks by their starting virtual offset, optionally discard those
// that end at or before `lowest`, and coalesce neighbouring chunks into
// larger spans.
//
// `chunks` is never reordered and the Chunk objects in it are never modified:
// sorting happens on a private copy, and growing a merged span allocates a
// new Chunk. An empty input is returned as-is.
//
// `lowest`, when given, drops every chunk whose `maxv` is not strictly
// greater than it (block position compared first, then data position).
//
// Returns the coalesced chunks in ascending `minv` order. Entries that were
// not extended by a merge are the original Chunk instances.
export function optimizeChunks(chunks: Chunk[], lowest?: Offset) {
  // Largest gap (in block positions) between the running span's end and the
  // next chunk's start that still allows a merge.
  const maxMergeGap = 65000
  // Upper bound (in block positions) on the extent of a merged span.
  const maxMergedSpan = 5000000

  const n = chunks.length
  if (n === 0) {
    return chunks
  }

  // Pre-filter chunks below lowest threshold before sorting
  let filtered: Chunk[]
  if (lowest) {
    const lowestBlock = lowest.blockPosition
    const lowestData = lowest.dataPosition
    filtered = []
    for (let i = 0; i < n; i++) {
      const chunk = chunks[i]!
      const maxv = chunk.maxv
      const cmp =
        maxv.blockPosition - lowestBlock || maxv.dataPosition - lowestData
      if (cmp > 0) {
        filtered.push(chunk)
      }
    }
    if (filtered.length === 0) {
      return filtered
    }
  } else {
    // Copy first: Array.prototype.sort works in place, and the caller's array
    // must keep its original order.
    filtered = chunks.slice()
  }

  filtered.sort((c0, c1) => {
    const dif = c0.minv.blockPosition - c1.minv.blockPosition
    return dif !== 0 ? dif : c0.minv.dataPosition - c1.minv.dataPosition
  })

  // Source chunks are shared with the index's per-refId cache, so we never
  // mutate them — extending a merged span produces a new Chunk instance.
  const mergedChunks: Chunk[] = [filtered[0]!]
  let lastMinBlock = filtered[0]!.minv.blockPosition
  let lastMaxBlock = filtered[0]!.maxv.blockPosition

  for (let i = 1; i < filtered.length; i++) {
    const chunk = filtered[i]!
    const chunkMinBlock = chunk.minv.blockPosition
    const chunkMaxBlock = chunk.maxv.blockPosition

    // Merge if chunks are close enough: small gap between them, and the
    // combined span is bounded so we don't grow a single chunk indefinitely.
    if (
      chunkMinBlock - lastMaxBlock < maxMergeGap &&
      chunkMaxBlock - lastMinBlock < maxMergedSpan
    ) {
      const lastChunk = mergedChunks[mergedChunks.length - 1]!
      const cmp =
        chunkMaxBlock - lastMaxBlock ||
        chunk.maxv.dataPosition - lastChunk.maxv.dataPosition
      // Only extend when this chunk ends after the current span; a chunk
      // fully contained in the span is simply absorbed.
      if (cmp > 0) {
        mergedChunks[mergedChunks.length - 1] = new Chunk(
          lastChunk.minv,
          chunk.maxv,
          lastChunk.bin,
        )
        lastMaxBlock = chunkMaxBlock
      }
    } else {
      mergedChunks.push(chunk)
      lastMinBlock = chunkMinBlock
      lastMaxBlock = chunkMaxBlock
    }
  }

  return mergedChunks
}
// Decode the pseudo-bin record located at `offset` in `bytes`. The only field
// extracted is `lineCount`, read with longFromBytesToUnsigned.
export function parsePseudoBin(bytes: Uint8Array, offset: number) {
  const lineCount = longFromBytesToUnsigned(bytes, offset)
  return { lineCount }
}
// Parse the BAM reference-sequence table (SAMv1.pdf §4.2). Returns undefined
// if `uncba` doesn't yet contain the full table — caller fetches more bytes
// and retries.
//
// `start` is the offset within `uncba` of the little-endian int32 reference
// count. Each entry that follows is read as: int32 name length, the name
// bytes (the last one is dropped as the terminator), then an int32 sequence
// length. Every name is passed through `renameRefSeq` before being recorded.
//
// On success returns `chrToIndex` (renamed name → position in the table) and
// `indexToChr` (position → { refName, length }).
export function parseRefSeqs(
  uncba: Uint8Array,
  start: number,
  renameRefSeq: (s: string) => string,
) {
  if (start + 4 > uncba.length) {
    return undefined
  }

  // `uncba` may be a window into a larger ArrayBuffer (a subarray, or a
  // pooled Node Buffer), so the view has to be anchored at its byteOffset for
  // offsets into `uncba` and offsets into the DataView to agree.
  const dataView = new DataView(
    uncba.buffer,
    uncba.byteOffset,
    uncba.byteLength,
  )
  const nRef = dataView.getInt32(start, true)
  const chrToIndex: Record<string, number> = {}
  const indexToChr: { refName: string; length: number }[] = []
  const decoder = new TextDecoder('utf8')

  let p = start + 4
  for (let i = 0; i < nRef; i++) {
    // Need the 4-byte name length plus the 4-byte sequence length at minimum.
    if (p + 8 > uncba.length) {
      return undefined
    }
    const lName = dataView.getInt32(p, true)
    if (p + 8 + lName > uncba.length) {
      return undefined
    }
    const refName = renameRefSeq(
      decoder.decode(uncba.subarray(p + 4, p + 4 + lName - 1)),
    )
    const lRef = dataView.getInt32(p + lName + 4, true)
    chrToIndex[refName] = i
    indexToChr.push({ refName, length: lRef })
    p += 8 + lName
  }

  return { chrToIndex, indexToChr }
}
// Pick the earlier of two virtual offsets. When no `firstDataLine` has been
// recorded yet, `virtualOffset` wins outright; on a tie (compareTo === 0) the
// existing `firstDataLine` is kept.
export function findFirstData(
  firstDataLine: VirtualOffset | undefined,
  virtualOffset: VirtualOffset,
) {
  if (!firstDataLine) {
    return virtualOffset
  }
  const candidateIsEarlier = firstDataLine.compareTo(virtualOffset) > 0
  return candidateIsEarlier ? virtualOffset : firstDataLine
}
// SYNC: ~/src/gmod/tabix-js/src/util.ts parseNameBytes uses indexOf(0) instead of byte scan
// Split a buffer of zero-byte-terminated names into lookup tables.
//
// Every zero byte ends one slot and advances the reference id, including
// when the slot is empty — empty slots are left unassigned in `refIdToName`.
// Bytes after the final zero byte are ignored. Each non-empty name is decoded
// with TextDecoder's default encoding and passed through `renameRefSeq`
// (identity by default) before being stored.
export function parseNameBytes(
  namesBytes: Uint8Array,
  renameRefSeq: (arg: string) => string = s => s,
) {
  const refIdToName: string[] = []
  const refNameToId: Record<string, number> = {}
  const textDecoder = new TextDecoder()

  let nameStart = 0
  let refId = 0
  for (let pos = 0; pos < namesBytes.length; pos++) {
    if (namesBytes[pos]) {
      // still inside a name
      continue
    }
    if (pos > nameStart) {
      const rawName = textDecoder.decode(namesBytes.subarray(nameStart, pos))
      const refName = renameRefSeq(rawName)
      refIdToName[refId] = refName
      refNameToId[refName] = refId
    }
    nameStart = pos + 1
    refId += 1
  }

  return { refNameToId, refIdToName }
}
// Join byte arrays end-to-end, in order, into one freshly allocated
// Uint8Array. The inputs are not modified.
export function concatUint8Array(args: Uint8Array[]) {
  const partCount = args.length

  // First pass: size the destination.
  let byteTotal = 0
  for (let i = 0; i < partCount; i++) {
    byteTotal += args[i]!.length
  }

  // Second pass: copy each part in behind the previous one.
  const joined = new Uint8Array(byteTotal)
  for (let i = 0, writeAt = 0; i < partCount; i++) {
    const part = args[i]!
    joined.set(part, writeAt)
    writeAt += part.length
  }
  return joined
}
// Decide whether a record should be REJECTED based on its flag bits.
// Returns true when `flags` lacks any bit of `flagInclude`, or carries any
// bit of `flagExclude`; false means the record passes.
export function filterReadFlag(
  flags: number,
  flagInclude: number,
  flagExclude: number,
) {
  const lacksRequiredBits = (flags & flagInclude) !== flagInclude
  if (lacksRequiredBits) {
    return true
  }
  const hasForbiddenBits = (flags & flagExclude) !== 0
  return hasForbiddenBits
}
// Decide whether a record should be REJECTED based on one tag value.
// With the wildcard filter '*', a record is rejected only when the tag value
// is undefined. Otherwise both sides are stringified and the record is
// rejected when the strings differ.
export function filterTagValue(readVal: unknown, filterVal?: string) {
  if (filterVal === '*') {
    return readVal === undefined
  }
  const actual = `${readVal}`
  const wanted = `${filterVal}`
  return actual !== wanted
}
// Serialize a filter configuration into a short string (per the name, meant
// to be embedded in cache keys). Returns '' when no filter is given.
//
// Layout: `:f<flagInclude>x<flagExclude>`, followed by `:<tag>=<value>` when
// a tag filter with a value is present, or `:<tag>` when the tag filter has
// no value. Omitted flag masks are written as 0.
export function filterCacheKey(filterBy?: FilterBy) {
  if (!filterBy) {
    return ''
  }
  const { flagInclude = 0, flagExclude = 0, tagFilter } = filterBy

  let tagPart = ''
  if (tagFilter) {
    // An omitted value must not share a key with the '*' wildcard:
    // filterTagValue treats the two differently, so results computed under
    // one would be wrong for the other.
    tagPart =
      tagFilter.value === undefined
        ? `:${tagFilter.tag}`
        : `:${tagFilter.tag}=${tagFilter.value}`
  }
  return `:f${flagInclude}x${flagExclude}${tagPart}`
}
// Minimal record shape applyFilters needs.
interface Filterable {
// Bitfield tested against flagInclude/flagExclude via filterReadFlag.
flags: number
// Tag map; applyFilters reads the entry keyed by `tagFilter.tag`.
tags: Record<string, unknown>
}
// Return the records that pass every criterion in `filterBy`, in their
// original order, as a new array. A record is dropped when filterReadFlag
// rejects its flags, or — if a tagFilter is configured — when filterTagValue
// rejects the value stored under that tag. Missing flag masks default to 0.
export function applyFilters<T extends Filterable>(
  records: T[],
  filterBy: FilterBy,
): T[] {
  const { tagFilter, flagInclude = 0, flagExclude = 0 } = filterBy
  const kept: T[] = []
  for (const record of records) {
    if (filterReadFlag(record.flags, flagInclude, flagExclude)) {
      continue
    }
    if (
      tagFilter &&
      filterTagValue(record.tags[tagFilter.tag], tagFilter.value)
    ) {
      continue
    }
    kept.push(record)
  }
  return kept
}
// Minimal record shape appendInRange needs.
interface Positioned {
// Reference id; compared for equality with the requested `chrId`.
ref_id: number
// Start coordinate; the scan stops at the first matching record whose start
// is >= the requested `max`.
start: number
// End coordinate; a record is collected when its end is >= the requested
// `min`.
end: number
}
// Collect the records on `chrId` that fall in the requested window and append
// them to `out` (a new array when the caller passes none), returning `out`.
//
// Records on other references are skipped. For a record on `chrId`, the scan
// ends as soon as its start is >= `max` — this relies on the records being
// sorted by start — and it is collected when its end is >= `min`.
export function appendInRange<T extends Positioned>(
  records: T[],
  chrId: number,
  min: number,
  max: number,
  out: T[] = [],
): T[] {
  const total = records.length
  let idx = 0
  while (idx < total) {
    const rec = records[idx++]!
    if (rec.ref_id !== chrId) {
      continue
    }
    if (rec.start >= max) {
      // sorted input: nothing further on this reference can be in range
      break
    }
    if (rec.end >= min) {
      out.push(rec)
    }
  }
  return out
}