/*
 * apr144-bam
 * Parser for BAM and BAM index (bai) files
 * Listing metadata: 1,124 lines (1,076 loc), 35.1 kB, text/typescript
 */
import { Buffer } from 'buffer'
import crc32 from 'buffer-crc32'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
import AbortablePromiseCache from 'abortable-promise-cache'
import QuickLRU from 'quick-lru'
// locals
import BAI from './bai'
import CSI from './csi'
import Chunk from './chunk'
import BAMFeature from './record'
import { parseHeaderText } from './sam'
import { checkAbortSignal, timeout, makeOpts, BamOpts, BaseOpts } from './util'
// Expected value of the first little-endian int32 of the decompressed BAM
// stream (21840194 === 0x014d4142, i.e. the bytes 'B', 'A', 'M', 0x01).
export const BAM_MAGIC = 21840194
// 65536 bytes; added as padding to header/refseq read sizes.
// NOTE(review): presumably the maximum size of one BGZF block — confirm.
const blockLen = 1 << 16
/**
 * Drains an async iterable of arrays into a single flat array.
 *
 * @param gen - async source that yields batches of items
 * @returns every item from every batch, in the order they were yielded
 */
async function gen2array<T>(gen: AsyncIterable<T[]>): Promise<T[]> {
  const out: T[] = []
  for await (const batch of gen) {
    // append in place: re-concatenating would copy everything gathered so far
    // on every batch, which is quadratic in the number of batches
    for (const item of batch) {
      out.push(item)
    }
  }
  return out
}
// Argument handed to the featureCache fill callback.
interface Args {
// chunk whose records should be read and parsed
chunk: Chunk
// request options; the fill callback adds its own abort signal to these
opts: BaseOpts
}
/**
 * Stand-in filehandle whose every operation throws synchronously with the
 * message 'never called'. It exists so a filehandle-typed slot can be filled
 * when no real BAM source is available.
 */
class NullFilehandle {
  /** Single place that raises the "never called" error for all operations. */
  private static unreachable(): never {
    throw new Error('never called')
  }

  public read(): Promise<any> {
    return NullFilehandle.unreachable()
  }

  public stat(): Promise<any> {
    return NullFilehandle.unreachable()
  }

  public readFile(): Promise<any> {
    return NullFilehandle.unreachable()
  }

  public close(): Promise<any> {
    return NullFilehandle.unreachable()
  }
}
export default class BamFile {
// applied to every reference sequence name read from the BAM header
public renameRefSeq: (a: string) => string
// filehandle the BAM data is read from
public bam: GenericFilehandle
// raw header text; set by getHeaderPre once the header has been read
public header?: string
// reference name -> numeric reference index
public chrToIndex?: Record<string, number>
// numeric reference index -> reference name and length
public indexToChr?: { refName: string; length: number }[]
// readBamFeatures pauses (await timeout(1)) once more than this many
// milliseconds have elapsed since its last pause; a falsy value disables it
public yieldThreadTime: number
// BAI or CSI index; left undefined when constructed in htsget mode without one
public index?: BAI | CSI
// set by the constructor when it falls back to the htsget option
public htsget = false
// memoized promise of the in-flight or completed header load (see getHeader)
public headerP?: ReturnType<BamFile['getHeaderPre']>
// Abortable cache of parsed features, backed by an LRU holding at most 50
// entries. On a miss the chunk is read with the cache-supplied abort signal
// and its records are parsed.
private featureCache = new AbortablePromiseCache<Args, BAMFeature[]>({
  cache: new QuickLRU({ maxSize: 50 }),
  fill: async ({ chunk, opts }: Args, signal) => {
    const loaded = await this._readChunk({
      chunk,
      opts: { ...opts, signal },
    })
    return this.readBamFeatures(
      loaded.data,
      loaded.cpositions,
      loaded.dpositions,
      chunk,
    )
  },
})
/**
 * Opens a remote file, moving any `user:password@` credentials embedded in
 * the URL into a Basic `Authorization` header.
 *
 * Without both a username and a password the URL is used as given, with
 * `suffix` appended to the whole string. With credentials, the request URL is
 * rebuilt without them and `suffix` is inserted after the path, before the
 * query string.
 *
 * @param url - absolute URL (an invalid URL makes `new URL` throw)
 * @param suffix - text appended to locate a sibling file, e.g. '.bai'
 */
private static openRemote(url: string, suffix = ''): RemoteFile {
  const parsed = new URL(url)
  const { username, password } = parsed
  if (!username || !password) {
    return new RemoteFile(`${url}${suffix}`)
  }
  // URL exposes credentials percent-encoded; decode them so characters such
  // as '@' or ':' reach the server as typed. Fall back to the raw text when
  // it is not valid percent-encoding.
  const decode = (part: string) => {
    try {
      return decodeURIComponent(part)
    } catch {
      return part
    }
  }
  // Buffer (unlike btoa) also copes with decoded non-Latin1 characters
  const token = Buffer.from(
    `${decode(username)}:${decode(password)}`,
    'utf8',
  ).toString('base64')
  const bareUrl = `${parsed.protocol}//${parsed.host}${parsed.pathname}${suffix}${parsed.search}`
  return new RemoteFile(bareUrl, {
    overrides: {
      credentials: 'include',
      headers: {
        Authorization: `Basic ${token}`,
      },
    },
  })
}

/**
 * Creates a BAM reader.
 *
 * The BAM source is the first of bamFilehandle, bamPath, bamUrl that is
 * given; with none of them, `htsget: true` installs a placeholder filehandle.
 * The index is the first of csiFilehandle, csiPath, csiUrl, baiFilehandle,
 * baiPath, baiUrl that is given; otherwise a `.bai` next to bamPath/bamUrl is
 * assumed, and failing that htsget mode leaves the index undefined.
 *
 * Credentials embedded in bamUrl/baiUrl are sent as a Basic Authorization
 * header, including for the `.bai` inferred from a credentialed bamUrl.
 *
 * @throws Error 'unable to initialize bam' when no BAM source is given
 * @throws Error 'unable to infer index format' when no index can be chosen
 */
constructor({
  bamFilehandle,
  bamPath,
  bamUrl,
  baiPath,
  baiFilehandle,
  baiUrl,
  csiPath,
  csiFilehandle,
  csiUrl,
  htsget,
  yieldThreadTime = 100,
  renameRefSeqs = n => n,
}: {
  bamFilehandle?: GenericFilehandle
  bamPath?: string
  bamUrl?: string
  baiPath?: string
  baiFilehandle?: GenericFilehandle
  baiUrl?: string
  csiPath?: string
  csiFilehandle?: GenericFilehandle
  csiUrl?: string
  renameRefSeqs?: (a: string) => string
  yieldThreadTime?: number
  htsget?: boolean
}) {
  this.renameRefSeq = renameRefSeqs

  if (bamFilehandle) {
    this.bam = bamFilehandle
  } else if (bamPath) {
    this.bam = new LocalFile(bamPath)
  } else if (bamUrl) {
    // bamUrl itself is left untouched so the index inference below still
    // sees any credentials it carries
    this.bam = BamFile.openRemote(bamUrl)
  } else if (htsget) {
    this.htsget = true
    this.bam = new NullFilehandle()
  } else {
    throw new Error('unable to initialize bam')
  }

  if (csiFilehandle) {
    this.index = new CSI({ filehandle: csiFilehandle })
  } else if (csiPath) {
    this.index = new CSI({ filehandle: new LocalFile(csiPath) })
  } else if (csiUrl) {
    this.index = new CSI({ filehandle: new RemoteFile(csiUrl) })
  } else if (baiFilehandle) {
    this.index = new BAI({ filehandle: baiFilehandle })
  } else if (baiPath) {
    this.index = new BAI({ filehandle: new LocalFile(baiPath) })
  } else if (baiUrl) {
    this.index = new BAI({ filehandle: BamFile.openRemote(baiUrl) })
  } else if (bamPath) {
    this.index = new BAI({ filehandle: new LocalFile(`${bamPath}.bai`) })
  } else if (bamUrl) {
    this.index = new BAI({ filehandle: BamFile.openRemote(bamUrl, '.bai') })
  } else if (htsget) {
    this.htsget = true
  } else {
    throw new Error('unable to infer index format')
  }

  this.yieldThreadTime = yieldThreadTime
}
/**
 * Loads the BAM header and the reference sequence tables.
 *
 * When `opts.assemblyName` is 'hg38', `chrToIndex`/`indexToChr` are first
 * seeded from a built-in table. If an index is present, the tables are then
 * replaced by what the file's own header declares.
 *
 * @param origOpts - request options, normalized through makeOpts
 * @returns the parsed header (result of parseHeaderText), or undefined when
 *   there is no index
 * @throws Error 'Error reading header' when no bytes could be read
 * @throws Error 'Not a BAM file' when the magic number does not match
 */
async getHeaderPre(origOpts?: BaseOpts) {
  const opts = makeOpts(origOpts)

  if (opts.assemblyName === 'hg38') {
    const hg38Refs: { refName: string; length: number }[] = [
      { refName: 'chr1', length: 248956422 },
      { refName: 'chr2', length: 242193529 },
      { refName: 'chr3', length: 198295559 },
      { refName: 'chr4', length: 190214555 },
      { refName: 'chr5', length: 181538259 },
      { refName: 'chr6', length: 170805979 },
      { refName: 'chr7', length: 159345973 },
      { refName: 'chr8', length: 145138636 },
      { refName: 'chr9', length: 138394717 },
      { refName: 'chr10', length: 133797422 },
      { refName: 'chr11', length: 135086622 },
      { refName: 'chr12', length: 133275309 },
      { refName: 'chr13', length: 114364328 },
      { refName: 'chr14', length: 107043718 },
      { refName: 'chr15', length: 101991189 },
      { refName: 'chr16', length: 90338345 },
      { refName: 'chr17', length: 83257441 },
      { refName: 'chr18', length: 80373285 },
      { refName: 'chr19', length: 58617616 },
      { refName: 'chr20', length: 64444167 },
      { refName: 'chr21', length: 46709983 },
      { refName: 'chr22', length: 50818468 },
      { refName: 'chrX', length: 156040895 },
      { refName: 'chrY', length: 57227415 },
      { refName: 'chrM', length: 16569 },
      { refName: 'GL000008.2', length: 209709 },
      { refName: 'GL000009.2', length: 201709 },
      { refName: 'GL000194.1', length: 191469 },
      { refName: 'GL000195.1', length: 182896 },
      { refName: 'GL000205.2', length: 185591 },
      { refName: 'GL000208.1', length: 92689 },
      { refName: 'GL000213.1', length: 164239 },
      { refName: 'GL000214.1', length: 137718 },
      { refName: 'GL000216.2', length: 176608 },
      { refName: 'GL000218.1', length: 161147 },
      { refName: 'GL000219.1', length: 179198 },
      { refName: 'GL000220.1', length: 161802 },
      { refName: 'GL000221.1', length: 155397 },
      { refName: 'GL000224.1', length: 179693 },
      { refName: 'GL000225.1', length: 211173 },
      { refName: 'GL000226.1', length: 15008 },
      { refName: 'KI270302.1', length: 2274 },
      { refName: 'KI270303.1', length: 1942 },
      { refName: 'KI270304.1', length: 2165 },
      { refName: 'KI270305.1', length: 1472 },
      { refName: 'KI270310.1', length: 1201 },
      { refName: 'KI270311.1', length: 12399 },
      { refName: 'KI270312.1', length: 998 },
      { refName: 'KI270315.1', length: 2276 },
      { refName: 'KI270316.1', length: 1444 },
      { refName: 'KI270317.1', length: 37690 },
      { refName: 'KI270320.1', length: 4416 },
      { refName: 'KI270322.1', length: 21476 },
      { refName: 'KI270329.1', length: 1040 },
      { refName: 'KI270330.1', length: 1652 },
      { refName: 'KI270333.1', length: 2699 },
      { refName: 'KI270334.1', length: 1368 },
      { refName: 'KI270335.1', length: 1048 },
      { refName: 'KI270336.1', length: 1026 },
      { refName: 'KI270337.1', length: 1121 },
      { refName: 'KI270338.1', length: 1428 },
      { refName: 'KI270340.1', length: 1428 },
      { refName: 'KI270362.1', length: 3530 },
      { refName: 'KI270363.1', length: 1803 },
      { refName: 'KI270364.1', length: 2855 },
      { refName: 'KI270366.1', length: 8320 },
      { refName: 'KI270371.1', length: 2805 },
      { refName: 'KI270372.1', length: 1650 },
      { refName: 'KI270373.1', length: 1451 },
      { refName: 'KI270374.1', length: 2656 },
      { refName: 'KI270375.1', length: 2378 },
      { refName: 'KI270376.1', length: 1136 },
      { refName: 'KI270378.1', length: 1048 },
      { refName: 'KI270379.1', length: 1045 },
      { refName: 'KI270381.1', length: 1930 },
      { refName: 'KI270382.1', length: 4215 },
      { refName: 'KI270383.1', length: 1750 },
      { refName: 'KI270384.1', length: 1658 },
      { refName: 'KI270385.1', length: 990 },
      { refName: 'KI270386.1', length: 1788 },
      { refName: 'KI270387.1', length: 1537 },
      { refName: 'KI270388.1', length: 1216 },
      { refName: 'KI270389.1', length: 1298 },
      { refName: 'KI270390.1', length: 2387 },
      { refName: 'KI270391.1', length: 1484 },
      { refName: 'KI270392.1', length: 971 },
      { refName: 'KI270393.1', length: 1308 },
      { refName: 'KI270394.1', length: 970 },
      { refName: 'KI270395.1', length: 1143 },
      { refName: 'KI270396.1', length: 1880 },
      { refName: 'KI270411.1', length: 2646 },
      { refName: 'KI270412.1', length: 1179 },
      { refName: 'KI270414.1', length: 2489 },
      { refName: 'KI270417.1', length: 2043 },
      { refName: 'KI270418.1', length: 2145 },
      { refName: 'KI270419.1', length: 1029 },
      { refName: 'KI270420.1', length: 2321 },
      { refName: 'KI270422.1', length: 1445 },
      { refName: 'KI270423.1', length: 981 },
      { refName: 'KI270424.1', length: 2140 },
      { refName: 'KI270425.1', length: 1884 },
      { refName: 'KI270429.1', length: 1361 },
      { refName: 'KI270435.1', length: 92983 },
      { refName: 'KI270438.1', length: 112505 },
      { refName: 'KI270442.1', length: 392061 },
      { refName: 'KI270448.1', length: 7992 },
      { refName: 'KI270465.1', length: 1774 },
      { refName: 'KI270466.1', length: 1233 },
      { refName: 'KI270467.1', length: 3920 },
      { refName: 'KI270468.1', length: 4055 },
      { refName: 'KI270507.1', length: 5353 },
      { refName: 'KI270508.1', length: 1951 },
      { refName: 'KI270509.1', length: 2318 },
      { refName: 'KI270510.1', length: 2415 },
      { refName: 'KI270511.1', length: 8127 },
      { refName: 'KI270512.1', length: 22689 },
      { refName: 'KI270515.1', length: 6361 },
      { refName: 'KI270516.1', length: 1300 },
      { refName: 'KI270517.1', length: 3253 },
      { refName: 'KI270518.1', length: 2186 },
      { refName: 'KI270519.1', length: 138126 },
      { refName: 'KI270521.1', length: 7642 },
      { refName: 'KI270522.1', length: 5674 },
      { refName: 'KI270528.1', length: 2983 },
      { refName: 'KI270529.1', length: 1899 },
      { refName: 'KI270530.1', length: 2168 },
      { refName: 'KI270538.1', length: 91309 },
      { refName: 'KI270539.1', length: 993 },
      { refName: 'KI270544.1', length: 1202 },
      { refName: 'KI270548.1', length: 1599 },
      { refName: 'KI270579.1', length: 31033 },
      { refName: 'KI270580.1', length: 1553 },
      { refName: 'KI270581.1', length: 7046 },
      { refName: 'KI270582.1', length: 6504 },
      { refName: 'KI270583.1', length: 1400 },
      { refName: 'KI270584.1', length: 4513 },
      { refName: 'KI270587.1', length: 2969 },
      { refName: 'KI270588.1', length: 6158 },
      { refName: 'KI270589.1', length: 44474 },
      { refName: 'KI270590.1', length: 4685 },
      { refName: 'KI270591.1', length: 5796 },
      { refName: 'KI270593.1', length: 3041 },
      { refName: 'KI270706.1', length: 175055 },
      { refName: 'KI270707.1', length: 32032 },
      { refName: 'KI270708.1', length: 127682 },
      { refName: 'KI270709.1', length: 66860 },
      { refName: 'KI270710.1', length: 40176 },
      { refName: 'KI270711.1', length: 42210 },
      { refName: 'KI270712.1', length: 176043 },
      { refName: 'KI270713.1', length: 40745 },
      { refName: 'KI270714.1', length: 41717 },
      { refName: 'KI270715.1', length: 161471 },
      { refName: 'KI270716.1', length: 153799 },
      { refName: 'KI270717.1', length: 40062 },
      { refName: 'KI270718.1', length: 38054 },
      { refName: 'KI270719.1', length: 176845 },
      { refName: 'KI270720.1', length: 39050 },
      { refName: 'KI270721.1', length: 100316 },
      { refName: 'KI270722.1', length: 194050 },
      { refName: 'KI270723.1', length: 38115 },
      { refName: 'KI270724.1', length: 39555 },
      { refName: 'KI270725.1', length: 172810 },
      { refName: 'KI270726.1', length: 43739 },
      { refName: 'KI270727.1', length: 448248 },
      { refName: 'KI270728.1', length: 1872759 },
      { refName: 'KI270729.1', length: 280839 },
      { refName: 'KI270730.1', length: 112551 },
      { refName: 'KI270731.1', length: 150754 },
      { refName: 'KI270732.1', length: 41543 },
      { refName: 'KI270733.1', length: 179772 },
      { refName: 'KI270734.1', length: 165050 },
      { refName: 'KI270735.1', length: 42811 },
      { refName: 'KI270736.1', length: 181920 },
      { refName: 'KI270737.1', length: 103838 },
      { refName: 'KI270738.1', length: 99375 },
      { refName: 'KI270739.1', length: 73985 },
      { refName: 'KI270740.1', length: 37240 },
      { refName: 'KI270741.1', length: 157432 },
      { refName: 'KI270742.1', length: 186739 },
      { refName: 'KI270743.1', length: 210658 },
      { refName: 'KI270744.1', length: 168472 },
      { refName: 'KI270745.1', length: 41891 },
      { refName: 'KI270746.1', length: 66486 },
      { refName: 'KI270747.1', length: 198735 },
      { refName: 'KI270748.1', length: 93321 },
      { refName: 'KI270749.1', length: 158759 },
      { refName: 'KI270750.1', length: 148850 },
      { refName: 'KI270751.1', length: 150742 },
      { refName: 'KI270752.1', length: 27745 },
      { refName: 'KI270753.1', length: 62944 },
      { refName: 'KI270754.1', length: 40191 },
      { refName: 'KI270755.1', length: 36723 },
      { refName: 'KI270756.1', length: 79590 },
      { refName: 'KI270757.1', length: 71251 },
    ]
    // each reference's index is simply its position in the table above
    const hg38Index: Record<string, number> = {}
    for (let i = 0; i < hg38Refs.length; i += 1) {
      hg38Index[hg38Refs[i].refName] = i
    }
    this.chrToIndex = hg38Index
    this.indexToChr = hg38Refs
  }

  if (!this.index) {
    return
  }

  const indexData = await this.index.parse(opts)
  const firstData = indexData.firstDataLine
  // upper bound on the compressed bytes that hold the header, when the index
  // records where the first data line starts
  const headerLimit = firstData ? firstData.blockPosition + 65535 : undefined

  let compressed
  if (headerLimit) {
    const readLen = headerLimit + blockLen
    const res = await this.bam.read(Buffer.alloc(readLen), 0, readLen, 0, opts)
    if (!res.bytesRead) {
      throw new Error('Error reading header')
    }
    compressed = res.buffer.subarray(0, Math.min(res.bytesRead, headerLimit))
  } else {
    // no hint from the index: read the entire file
    compressed = await this.bam.readFile(opts)
  }

  const uncba = await unzip(compressed)
  if (uncba.readInt32LE(0) !== BAM_MAGIC) {
    throw new Error('Not a BAM file')
  }

  // int32 at offset 4 is the header text length; the text itself follows
  const headLen = uncba.readInt32LE(4)
  this.header = uncba.toString('utf8', 8, 8 + headLen)

  const refSeqs = await this._readRefSeqs(headLen + 8, 65535, opts)
  this.chrToIndex = refSeqs.chrToIndex
  this.indexToChr = refSeqs.indexToChr

  return parseHeaderText(this.header)
}
/**
 * Returns the memoized header load, starting it on first use.
 *
 * A failed load clears the memo so a later call retries; the failure is
 * still propagated to the current caller.
 *
 * @param opts - request options forwarded to getHeaderPre on the first call
 */
getHeader(opts?: BaseOpts) {
  if (this.headerP) {
    return this.headerP
  }
  const pending = this.getHeaderPre(opts).catch(e => {
    this.headerP = undefined
    throw e
  })
  this.headerP = pending
  return pending
}
/**
 * Ensures the header has been loaded, then returns its raw text.
 *
 * @param opts - request options forwarded to getHeader
 * @returns the header text, or undefined if none was read
 */
async getHeaderText(opts: BaseOpts = {}) {
  return this.getHeader(opts).then(() => this.header)
}
// the full length of the refseq block is not given in advance so this grabs
// a chunk and doubles it if all refseqs haven't been processed
/**
 * Reads the reference sequence dictionary from the start of the BAM file.
 *
 * The decompressed data at `start` holds an int32 count followed, per
 * reference, by an int32 name length, the name (its last byte is dropped)
 * and an int32 sequence length; all integers are little-endian.
 *
 * @param start - offset of the reference count in the decompressed data
 * @param refSeqBytes - number of compressed bytes to fetch on this attempt
 * @param opts - request options forwarded to the filehandle
 * @returns lookup tables from name to index and from index to name/length
 * @throws Error 'Error reading refseqs from header' when nothing could be
 *   read, or when the file ends before the dictionary is complete
 */
async _readRefSeqs(
  start: number,
  refSeqBytes: number,
  opts?: BaseOpts,
): Promise<{
  chrToIndex: Record<string, number>
  indexToChr: { refName: string; length: number }[]
}> {
  if (start > refSeqBytes) {
    return this._readRefSeqs(start, refSeqBytes * 2, opts)
  }
  const size = refSeqBytes + blockLen
  const { bytesRead, buffer } = await this.bam.read(
    Buffer.alloc(size),
    0,
    refSeqBytes,
    0,
    opts,
  )
  if (!bytesRead) {
    throw new Error('Error reading refseqs from header')
  }
  const uncba = await unzip(
    buffer.subarray(0, Math.min(bytesRead, refSeqBytes)),
  )

  // retry with twice as many bytes, unless the previous read already reached
  // the end of the file (more bytes cannot help then)
  const refetch = () => {
    if (bytesRead < refSeqBytes) {
      throw new Error('Error reading refseqs from header')
    }
    console.warn(
      `BAM header is very big. Re-fetching ${refSeqBytes} bytes.`,
    )
    return this._readRefSeqs(start, refSeqBytes * 2, opts)
  }

  // every bounds check happens BEFORE the corresponding read: reading past
  // the end of the buffer throws instead of letting us re-fetch
  if (start + 4 > uncba.length) {
    return refetch()
  }
  const nRef = uncba.readInt32LE(start)
  let p = start + 4
  const chrToIndex: Record<string, number> = {}
  const indexToChr: { refName: string; length: number }[] = []
  for (let i = 0; i < nRef; i += 1) {
    if (p + 4 > uncba.length) {
      return refetch()
    }
    const lName = uncba.readInt32LE(p)
    // the entry is 4 (name length) + lName (name) + 4 (sequence length) bytes
    if (p + 8 + lName > uncba.length) {
      return refetch()
    }
    const refName = this.renameRefSeq(
      uncba.toString('utf8', p + 4, p + 4 + lName - 1),
    )
    const lRef = uncba.readInt32LE(p + lName + 4)
    chrToIndex[refName] = i
    indexToChr.push({ refName, length: lRef })
    p = p + 8 + lName
  }
  return { chrToIndex, indexToChr }
}
/**
 * Collects every record streamed by streamRecordsForRange into one array.
 *
 * @param chr - reference sequence name
 * @param min - range start
 * @param max - range end
 * @param opts - options forwarded to streamRecordsForRange
 */
async getRecordsForRange(
  chr: string,
  min: number,
  max: number,
  opts?: BamOpts,
) {
  const batches = this.streamRecordsForRange(chr, min, max, opts)
  return gen2array(batches)
}
/**
 * Streams the records overlapping a range, one array per index chunk.
 *
 * The header is loaded first so reference names can be resolved. The load is
 * skipped only when `opts.assemblyName` is 'hg38' and the reference table is
 * already populated.
 *
 * Yields a single empty array when the reference name is unknown or there is
 * no index.
 *
 * @param chr - reference sequence name
 * @param min - range start
 * @param max - range end
 * @param opts - request options; `viewAsPairs` adds a final batch of mates
 */
async *streamRecordsForRange(
  chr: string,
  min: number,
  max: number,
  opts?: BamOpts,
) {
  // Without this the reference table is never filled in for callers that
  // pass no assemblyName (or pass 'hg38' before ever calling getHeader), and
  // every query silently comes back empty. getHeader is memoized, so asking
  // again is cheap.
  if (opts?.assemblyName !== 'hg38' || !this.chrToIndex) {
    await this.getHeader(opts)
  }
  const chrId = this.chrToIndex?.[chr]
  if (chrId === undefined || !this.index) {
    yield []
  } else {
    const chunks = await this.index.blocksForRange(chrId, min - 1, max, opts)
    yield* this._fetchChunkFeatures(chunks, chrId, min, max, opts)
  }
}
/**
 * Yields, chunk by chunk, the cached records that lie on `chrId` and overlap
 * the requested range. Iteration stops after the first chunk containing a
 * record that starts at or beyond `max`.
 *
 * After the chunks, the abort signal is checked and, when
 * `opts.viewAsPairs` is set, one more batch with the result of fetchPairs is
 * yielded.
 *
 * @param chunks - index chunks to read, in order
 * @param chrId - numeric id of the reference sequence
 * @param min - range start
 * @param max - range end
 * @param opts - request options (signal, viewAsPairs, pairing settings)
 */
async *_fetchChunkFeatures(
  chunks: Chunk[],
  chrId: number,
  min: number,
  max: number,
  opts: BamOpts = {},
) {
  const { viewAsPairs } = opts
  const yielded: BAMFeature[][] = []

  for (const chunk of chunks) {
    const records = await this.featureCache.get(
      chunk.toString(),
      { chunk, opts },
      opts.signal,
    )

    const inRange: BAMFeature[] = []
    let pastEnd = false
    for (const record of records) {
      if (record.seq_id() !== chrId) {
        continue
      }
      if (record.get('start') >= max) {
        // past end of range, can stop iterating
        pastEnd = true
        break
      }
      if (record.get('end') >= min) {
        // must be in range
        inRange.push(record)
      }
    }

    yielded.push(inRange)
    yield inRange
    if (pastEnd) {
      break
    }
  }

  checkAbortSignal(opts.signal)
  if (viewAsPairs) {
    yield this.fetchPairs(chrId, yielded, opts)
  }
}
/**
 * Fetches mates for reads whose name occurs exactly once within a batch.
 *
 * A mate is looked up when `pairAcrossChr` is set, or when it is on the same
 * reference and closer than `maxInsertSize` (default 200000). Records already
 * present in `feats` are not returned again.
 *
 * @param chrId - numeric id of the reference being viewed
 * @param feats - batches of records already fetched
 * @param opts - request options (pairAcrossChr, maxInsertSize, signal, ...)
 * @returns the mate records found
 */
async fetchPairs(chrId: number, feats: BAMFeature[][], opts: BamOpts) {
  const { pairAcrossChr, maxInsertSize = 200000 } = opts
  const unmatedPairs: Record<string, boolean> = {}
  const readIds: Record<string, number> = {}

  // pass 1: remember every record id, and flag names seen once per batch
  for (const batch of feats) {
    const readNames: Record<string, number> = {}
    for (const record of batch) {
      const name = record.name()
      const id = record.id()
      if (!readNames[name]) {
        readNames[name] = 0
      }
      readNames[name]++
      readIds[id] = 1
    }
    for (const name of Object.keys(readNames)) {
      if (readNames[name] === 1) {
        unmatedPairs[name] = true
      }
    }
  }

  // pass 2: ask the index where each eligible mate lives
  const matePromises: Promise<Chunk[]>[] = []
  for (const batch of feats) {
    for (const record of batch) {
      const name = record.name()
      const start = record.get('start')
      const pnext = record._next_pos()
      const rnext = record._next_refid()
      const closeEnough =
        pairAcrossChr ||
        (rnext === chrId && Math.abs(start - pnext) < maxInsertSize)
      if (this.index && unmatedPairs[name] && closeEnough) {
        matePromises.push(
          this.index.blocksForRange(rnext, pnext, pnext + 1, opts),
        )
      }
    }
  }

  // filter out duplicate chunks (the blocks are lists of chunks, blocks are
  // concatenated, then filter dup chunks)
  const uniqueChunks = new Map<string, Chunk>()
  const chunkLists = await Promise.all(matePromises)
  for (const candidate of chunkLists.flat()) {
    const key = candidate.toString()
    if (!uniqueChunks.has(key)) {
      uniqueChunks.set(key, candidate)
    }
  }

  const mateBatches = await Promise.all(
    [...uniqueChunks.values()].map(async c => {
      const loaded = await this._readChunk({ chunk: c, opts })
      const parsed = await this.readBamFeatures(
        loaded.data,
        loaded.cpositions,
        loaded.dpositions,
        loaded.chunk,
      )
      const mateRecs: BAMFeature[] = []
      for (const record of parsed) {
        // keep only wanted mates that were not part of the original batches
        if (unmatedPairs[record.get('name')] && !readIds[record.id()]) {
          mateRecs.push(record)
        }
      }
      return mateRecs
    }),
  )
  return mateBatches.flat()
}
/**
 * Reads up to `size` bytes of the BAM file starting at `position`.
 *
 * @param position - byte offset in the file
 * @param size - number of bytes requested
 * @param opts - request options forwarded to the filehandle
 * @returns the bytes actually read (may be fewer than `size`)
 */
async _readRegion(position: number, size: number, opts: BaseOpts = {}) {
  const target = Buffer.alloc(size)
  const result = await this.bam.read(target, 0, size, position, opts)
  const usable = Math.min(result.bytesRead, size)
  return result.buffer.subarray(0, usable)
}
/**
 * Reads the compressed bytes of a chunk and decompresses the slice it covers.
 *
 * @returns the decompressed data, the position arrays reported by
 *   unzipChunkSlice, and the chunk itself
 */
async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
  const compressed = await this._readRegion(
    chunk.minv.blockPosition,
    chunk.fetchedSize(),
    opts,
  )
  const unzipped = await unzipChunkSlice(compressed, chunk)
  return {
    data: unzipped.buffer,
    cpositions: unzipped.cpositions,
    dpositions: unzipped.dpositions,
    chunk,
  }
}
/**
 * Parses the decompressed bytes of one chunk into BAMFeature records.
 *
 * The data is walked as a sequence of records, each a little-endian int32
 * size followed by that many bytes. A record that is not completely contained
 * in `ba` is skipped.
 *
 * @param ba - decompressed chunk data
 * @param cpositions - positions used as the block part of each record's
 *   fileOffset; when empty, a crc32 of the record bytes is used instead
 * @param dpositions - decompressed positions matching `cpositions`
 * @param chunk - chunk the data came from (supplies minv.dataPosition)
 * @returns the parsed records, in file order
 */
async readBamFeatures(
ba: Buffer,
cpositions: number[],
dpositions: number[],
chunk: Chunk,
) {
let blockStart = 0
const sink = [] as BAMFeature[]
// cursor into cpositions/dpositions, advanced as blockStart moves forward
let pos = 0
// time of the last pause, used for the yieldThreadTime check below
let last = +Date.now()
while (blockStart + 4 < ba.length) {
const blockSize = ba.readInt32LE(blockStart)
// index of the record's last byte (4-byte size prefix + blockSize bytes)
const blockEnd = blockStart + 4 + blockSize - 1
// increment position to the current decompressed status
if (dpositions) {
// the empty-bodied loop overshoots by one, hence the pos-- that follows
while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]) {}
pos--
}
// only try to read the feature if we have all the bytes for it
if (blockEnd < ba.length) {
const feature = new BAMFeature({
bytes: {
byteArray: ba,
start: blockStart,
end: blockEnd,
},
// the below results in an automatically calculated file-offset based
// ID if the info for that is available, otherwise crc32 of the
// features
//
// cpositions[pos] refers to actual file offset of a bgzip block
// boundaries
//
// we multiply by (1 <<8) in order to make sure each block has a
// "unique" address space so that data in that block could never
// overlap
//
// then the blockStart-dpositions is an uncompressed file offset from
// that bgzip block boundary, and since the cpositions are multiplied
// by (1 << 8) these uncompressed offsets get a unique space
//
// this has an extra chunk.minv.dataPosition added on because it
// blockStart starts at 0 instead of chunk.minv.dataPosition
//
// the +1 is just to avoid any possible uniqueId 0 but this does not
// realistically happen
//
// NOTE(review): slice's end is exclusive, so the crc32 fallback does not
// cover the record's final byte (index blockEnd) — confirm this is intended
fileOffset:
cpositions.length > 0
? cpositions[pos] * (1 << 8) +
(blockStart - dpositions[pos]) +
chunk.minv.dataPosition +
1
: // must be slice, not subarray for buffer polyfill on web
crc32.signed(ba.slice(blockStart, blockEnd)),
})
sink.push(feature)
// once more than yieldThreadTime ms have passed since the last pause,
// await timeout(1) — presumably to let other work run — and reset the clock
if (this.yieldThreadTime && +Date.now() - last > this.yieldThreadTime) {
await timeout(1)
last = +Date.now()
}
}
// move to the first byte after this record
blockStart = blockEnd + 1
}
return sink
}
/**
 * Reports whether the index has data for the named reference sequence.
 *
 * @param seqName - reference sequence name
 * @returns false when the name is not in the reference table; otherwise the
 *   index's answer (undefined when there is no index)
 */
async hasRefSeq(seqName: string) {
  const refId = this.chrToIndex?.[seqName]
  if (refId === undefined) {
    return false
  }
  return this.index?.hasRefSeq(refId)
}
/**
 * Returns the index's line count for the named reference sequence.
 *
 * @param seqName - reference sequence name
 * @returns 0 when the name is unknown or there is no index
 */
async lineCount(seqName: string) {
  const refId = this.chrToIndex?.[seqName]
  if (refId === undefined || !this.index) {
    return 0
  }
  return this.index.lineCount(refId)
}
/**
 * Returns the index's coverage data for a reference sequence.
 *
 * @param seqName - reference sequence name
 * @param start - optional range start
 * @param end - optional range end
 * @returns an empty array when there is no index or the name is unknown
 */
async indexCov(seqName: string, start?: number, end?: number) {
  if (!this.index) {
    return []
  }
  // make sure the index is parsed before querying it
  await this.index.parse()
  const refId = this.chrToIndex?.[seqName]
  if (refId === undefined) {
    return []
  }
  return this.index.indexCov(refId, start, end)
}
/**
 * Returns the index chunks that cover a range of a reference sequence.
 *
 * @param seqName - reference sequence name
 * @param start - range start
 * @param end - range end
 * @param opts - request options forwarded to the index lookup
 * @returns an empty array when there is no index or the name is unknown
 */
async blocksForRange(
  seqName: string,
  start: number,
  end: number,
  opts?: BaseOpts,
) {
  if (!this.index) {
    return []
  }
  // make sure the index is parsed before querying it
  await this.index.parse()
  const refId = this.chrToIndex?.[seqName]
  if (refId === undefined) {
    return []
  }
  return this.index.blocksForRange(refId, start, end, opts)
}
}