Adaptation to use generic-filehandle2 #150

Merged: 1 commit, Dec 12, 2024
4 changes: 0 additions & 4 deletions CHANGELOG.md
@@ -1,11 +1,7 @@
## [1.6.1](https://github.com/GMOD/tabix-js/compare/v1.6.0...v1.6.1) (2024-12-07)



# [1.6.0](https://github.com/GMOD/tabix-js/compare/v1.5.15...v1.6.0) (2024-11-30)



## [1.5.15](https://github.com/GMOD/tabix-js/compare/v1.5.14...v1.5.15) (2024-08-30)

## [1.5.14](https://github.com/GMOD/tabix-js/compare/v1.5.13...v1.5.14) (2024-07-23)
9 changes: 3 additions & 6 deletions package.json
@@ -42,8 +42,8 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/bgzf-filehandle": "^1.3.3",
"generic-filehandle": "^3.0.0",
"@gmod/bgzf-filehandle": "^2.0.0",
"generic-filehandle2": "^0.0.1",
"long": "^4.0.0",
"quick-lru": "^4.0.0"
},
@@ -55,16 +55,13 @@
"@typescript-eslint/eslint-plugin": "^8.0.1",
"@typescript-eslint/parser": "^8.0.1",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"documentation": "^14.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-unicorn": "^56.0.0",
"prettier": "^3.3.3",
"rimraf": "^6.0.1",
"standard-changelog": "^6.0.0",
"typescript": "~5.6.0",
"typescript": "^5.7.0",
"typescript-eslint": "^8.0.1",
"vitest": "^2.0.5",
"webpack": "^5.93.0",
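Note: the dependency swap above is the heart of this PR. generic-filehandle2 hands back a fresh Uint8Array from its read calls instead of filling a caller-allocated Node Buffer, which is why the buffer polyfill can be dropped from devDependencies. A minimal sketch of the read shape implied by the call sites later in this diff (the signature is inferred from usage in this PR, not from documented API):

import { LocalFile } from 'generic-filehandle2'

// Sketch only: read(length, position, opts?) resolving to a Uint8Array is
// what the call sites in this PR imply.
async function readSlice(path: string, length: number, position: number) {
  const fh = new LocalFile(path)
  const bytes: Uint8Array = await fh.read(length, position)
  return bytes
}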
89 changes: 49 additions & 40 deletions src/csi.ts
@@ -1,5 +1,4 @@
import Long from 'long'
import { Buffer } from 'buffer'
import { unzip } from '@gmod/bgzf-filehandle'

import VirtualOffset, { fromBytes } from './virtualOffset'
@@ -11,6 +10,12 @@ import IndexFile, { Options } from './indexFile'
const CSI1_MAGIC = 21582659 // CSI\1
const CSI2_MAGIC = 38359875 // CSI\2

const formats = {
0: 'generic',
1: 'SAM',
2: 'VCF',
}

function lshift(num: number, bits: number) {
return num * 2 ** bits
}
@@ -49,26 +54,27 @@ export default class CSI extends IndexFile {
throw new Error('CSI indexes do not support indexcov')
}

parseAuxData(bytes: Buffer, offset: number) {
const formatFlags = bytes.readInt32LE(offset)
parseAuxData(bytes: Uint8Array, offset: number) {
const dataView = new DataView(bytes.buffer)
const formatFlags = dataView.getInt32(offset, true)
const coordinateType =
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const format = { 0: 'generic', 1: 'SAM', 2: 'VCF' }[formatFlags & 0xf]
const format = formats[(formatFlags & 0xf) as 0 | 1 | 2]
if (!format) {
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
}
const columnNumbers = {
ref: bytes.readInt32LE(offset + 4),
start: bytes.readInt32LE(offset + 8),
end: bytes.readInt32LE(offset + 12),
ref: dataView.getInt32(offset + 4, true),
start: dataView.getInt32(offset + 8, true),
end: dataView.getInt32(offset + 12, true),
}
const metaValue = bytes.readInt32LE(offset + 16)
const metaValue = dataView.getInt32(offset + 16, true)
const metaChar = metaValue ? String.fromCharCode(metaValue) : null
const skipLines = bytes.readInt32LE(offset + 20)
const nameSectionLength = bytes.readInt32LE(offset + 24)
const skipLines = dataView.getInt32(offset + 20, true)
const nameSectionLength = dataView.getInt32(offset + 24, true)

const { refIdToName, refNameToId } = this._parseNameBytes(
bytes.slice(offset + 28, offset + 28 + nameSectionLength),
bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
)

return {
@@ -82,47 +88,52 @@
}
}

_parseNameBytes(namesBytes: Buffer) {
_parseNameBytes(namesBytes: Uint8Array) {
let currRefId = 0
let currNameStart = 0
const refIdToName = []
const refNameToId: Record<string, number> = {}
const decoder = new TextDecoder('utf8')
for (let i = 0; i < namesBytes.length; i += 1) {
if (!namesBytes[i]) {
if (currNameStart < i) {
let refName = namesBytes.toString('utf8', currNameStart, i)
refName = this.renameRefSeq(refName)
const refName = this.renameRefSeq(
decoder.decode(namesBytes.subarray(currNameStart, i)),
)
refIdToName[currRefId] = refName
refNameToId[refName] = currRefId
}
currNameStart = i + 1
currRefId += 1
}
}
return { refNameToId, refIdToName }
return {
refNameToId,
refIdToName,
}
}

// fetch and parse the index

async _parse(opts: Options = {}) {
const bytes = await unzip(await this.filehandle.readFile(opts))
const dataView = new DataView(bytes.buffer)

// check CSI magic numbers
let csiVersion
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
if (dataView.getUint32(0, true) === CSI1_MAGIC) {
csiVersion = 1
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
} else if (dataView.getUint32(0, true) === CSI2_MAGIC) {
csiVersion = 2
} else {
throw new Error('Not a CSI file')
// TODO: do we need to support big-endian CSI files?
}

this.minShift = bytes.readInt32LE(4)
this.depth = bytes.readInt32LE(8)
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const maxRefLength = 2 ** (this.minShift + this.depth * 3)
const auxLength = bytes.readInt32LE(12)
const auxLength = dataView.getInt32(12, true)
const aux =
auxLength && auxLength >= 30
? this.parseAuxData(bytes, 16)
@@ -134,35 +145,33 @@
coordinateType: 'zero-based-half-open',
format: 'generic',
}
const refCount = bytes.readInt32LE(16 + auxLength)
const refCount = dataView.getInt32(16 + auxLength, true)

// read the indexes for each reference sequence
let firstDataLine: VirtualOffset | undefined
let currOffset = 16 + auxLength + 4
const indices = new Array(refCount).fill(0).map(() => {
// the binning index
const binCount = bytes.readInt32LE(currOffset)
const binCount = dataView.getInt32(currOffset, true)
currOffset += 4
const binIndex: Record<string, Chunk[]> = {}
let stats // < provided by parsing a pseudo-bin, if present
let stats
for (let j = 0; j < binCount; j += 1) {
const bin = bytes.readUInt32LE(currOffset)
const bin = dataView.getUint32(currOffset, true)
if (bin > this.maxBinNumber) {
// this is a fake bin that actually has stats information
// about the reference sequence in it
// this is a fake bin that actually has stats information about the
// reference sequence in it
stats = this.parsePseudoBin(bytes, currOffset + 4)
currOffset += 4 + 8 + 4 + 16 + 16
} else {
const loffset = fromBytes(bytes, currOffset + 4)
firstDataLine = this._findFirstData(firstDataLine, loffset)
const chunkCount = bytes.readInt32LE(currOffset + 12)
const chunkCount = dataView.getInt32(currOffset + 12, true)
currOffset += 16
const chunks = new Array(chunkCount)
for (let k = 0; k < chunkCount; k += 1) {
const u = fromBytes(bytes, currOffset)
const v = fromBytes(bytes, currOffset + 8)
currOffset += 16
// this._findFirstData(data, u)
chunks[k] = new Chunk(u, v, bin)
}
binIndex[bin] = chunks
@@ -186,14 +195,15 @@
}
}

parsePseudoBin(bytes: Buffer, offset: number) {
const lineCount = longToNumber(
Long.fromBytesLE(
bytes.slice(offset + 28, offset + 36) as unknown as number[],
true,
parsePseudoBin(bytes: Uint8Array, offset: number) {
return {
lineCount: longToNumber(
Long.fromBytesLE(
bytes.subarray(offset + 28, offset + 36) as unknown as number[],
true,
),
),
)
return { lineCount }
}
}

async blocksForRange(
@@ -216,9 +226,8 @@
return []
}

// const { linearIndex, binIndex } = indexes

const overlappingBins = this.reg2bins(min, max) // List of bin #s that overlap min, max
// List of bin #s that overlap min, max
const overlappingBins = this.reg2bins(min, max)
const chunks: Chunk[] = []

// Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned
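The recurring change in csi.ts is mechanical: Buffer#readInt32LE(offset) becomes DataView#getInt32(offset, true), Buffer#slice becomes Uint8Array#subarray, and Buffer#toString('utf8', start, end) becomes TextDecoder#decode. A standalone sketch of both conversions; the byteOffset/byteLength arguments to the DataView constructor are a defensive addition of mine for subarray views, whereas the diff constructs its DataView from bytes.buffer alone:

// Equivalent of Buffer#readInt32LE(offset); `true` selects little-endian.
function readInt32LE(bytes: Uint8Array, offset: number): number {
  // byteOffset/byteLength keep this correct when `bytes` is a subarray view
  const dv = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  return dv.getInt32(offset, true)
}

// Equivalent of Buffer#toString('utf8', start, end)
function utf8Slice(bytes: Uint8Array, start: number, end: number): string {
  return new TextDecoder('utf8').decode(bytes.subarray(start, end))
}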
2 changes: 1 addition & 1 deletion src/indexFile.ts
@@ -1,4 +1,4 @@
import { GenericFilehandle } from 'generic-filehandle'
import { GenericFilehandle } from 'generic-filehandle2'
import VirtualOffset from './virtualOffset'
import Chunk from './chunk'

48 changes: 15 additions & 33 deletions src/tabixIndexedFile.ts
@@ -1,7 +1,6 @@
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import LRU from 'quick-lru'
import { Buffer } from 'buffer'
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle'
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle2'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { checkAbortSignal } from './util'
import IndexFile, { Options, IndexData } from './indexFile'
Expand All @@ -17,17 +16,14 @@ function isASCII(str: string) {

type GetLinesCallback = (line: string, fileOffset: number) => void

const decoder =
typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined

interface GetLinesOpts {
[key: string]: unknown
signal?: AbortSignal
lineCallback: GetLinesCallback
}

interface ReadChunk {
buffer: Buffer
buffer: Uint8Array
cpositions: number[]
dpositions: number[]
}
@@ -196,6 +192,7 @@ export default class TabixIndexedFile {

const chunks = await this.index.blocksForRange(refName, start, end, options)
checkAbortSignal(signal)
const decoder = new TextDecoder('utf8')

// now go through each chunk and parse and filter the lines out of it
for (const c of chunks) {
Expand All @@ -209,11 +206,11 @@ export default class TabixIndexedFile {
let blockStart = 0
let pos = 0

const str = decoder?.decode(buffer) ?? buffer.toString()
// fast path, Buffer is just ASCII chars and not gigantor, can be
// converted to string and processed directly. if it is not ASCII or
// gigantic (chrome max str len is 512Mb), we have to decode line by line
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
const str = decoder.decode(buffer)
const strIsASCII = isASCII(str)
while (blockStart < str.length) {
let line: string
let n: number
@@ -224,12 +221,12 @@
}
line = str.slice(blockStart, n)
} else {
n = buffer.indexOf('\n', blockStart)
n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
if (n === -1) {
break
}
const b = buffer.slice(blockStart, n)
line = decoder?.decode(b) ?? b.toString()
line = decoder.decode(b)
}

// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
@@ -292,10 +289,10 @@
checkAbortSignal(opts.signal)

const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
// TODO: what if we don't have a firstDataLine, and the header
// actually takes up more than one block? this case is not covered here
// TODO: what if we don't have a firstDataLine, and the header actually
// takes up more than one block? this case is not covered here

const buf = await this._readRegion(0, maxFetch, opts)
const buf = await this.filehandle.read(maxFetch, 0, opts)
const bytes = await unzip(buf)

// trim off lines after the last non-meta line
@@ -324,8 +321,9 @@
* @returns {Promise} for a string
*/
async getHeader(opts: Options = {}) {
const decoder = new TextDecoder('utf8')
const bytes = await this.getHeaderBuffer(opts)
return bytes.toString('utf8')
return decoder.decode(bytes)
}

/**
@@ -492,32 +490,16 @@
return this.index.lineCount(refName, opts)
}

async _readRegion(pos: number, size: number, opts: Options = {}) {
const b = Buffer.alloc(size)
const { bytesRead, buffer } = await this.filehandle.read(
b,
0,
size,
pos,
opts,
)

return buffer.subarray(0, bytesRead)
}

/**
* read and uncompress the data in a chunk (composed of one or more
* contiguous bgzip blocks) of the file
*/
async readChunk(c: Chunk, opts: Options = {}) {
// fetch the uncompressed data, uncompress carefully a block at a time, and
// stop when done

const data = await this._readRegion(
c.minv.blockPosition,
const ret = await this.filehandle.read(
c.fetchedSize(),
c.minv.blockPosition,
opts,
)
return unzipChunkSlice(data, c)
return unzipChunkSlice(ret, c)
}
}
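The getLines hot loop in tabixIndexedFile.ts keeps two paths: a fast path that decodes the whole uncompressed chunk to one string and scans it with String#indexOf('\n'), and a byte-level fallback that searches the Uint8Array for the newline byte (10) and decodes each line individually. A self-contained sketch of the fallback; the names here are illustrative, not the PR's:

const NEWLINE = '\n'.charCodeAt(0) // 10

// Yields complete lines from a Uint8Array; an unterminated trailing line is
// skipped, matching the `break` in the loop above.
function* lines(buffer: Uint8Array): Generator<string> {
  const decoder = new TextDecoder('utf8')
  let blockStart = 0
  while (blockStart < buffer.length) {
    const n = buffer.indexOf(NEWLINE, blockStart)
    if (n === -1) {
      break
    }
    yield decoder.decode(buffer.subarray(blockStart, n))
    blockStart = n + 1
  }
}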