// iterparse
// Version:
// Delightful data parsing
// 200 lines (168 loc) • 6.23 kB
// text/typescript
import { AsyncIterable } from "ix"
import * as P from 'ts-prime'
import { GroupedItems } from "./fileGroupBy"
import { formatBytes } from "./helpers"
import { AnyIterable } from "./types"
/**
 * Options accepted by `trailingGroupBy`.
 *
 * @typeParam T - Type of the items being grouped.
 */
export interface TrailingGroupByArgs<T> {
    /**
     * Computes the group key of an item. The key may be returned directly or
     * through a promise; it is converted to a string before use.
     */
    groupBy:
        | ((data: T) => string | number)
        | ((data: T) => Promise<string | number>)

    /** A group is emitted as soon as it holds this many items. */
    maxGroupSize: number

    /**
     * Threshold on the running count of buffered items. When the count reaches
     * this value, the largest buffered group is emitted before the next item
     * is stored.
     */
    totalItemsInMemory: number
}
/**
 * Groups a stream of items by key while bounding how many items are buffered.
 *
 * A `{ key, items }` record is emitted when:
 *  - a group reaches `args.maxGroupSize` items;
 *  - the number of buffered items reaches `args.totalItemsInMemory`, in which
 *    case the largest buffered group is emitted to free room;
 *  - the source is exhausted, at which point every remaining group is flushed
 *    in the order its key was first buffered.
 *
 * Because groups are emitted early, the same key may appear in several
 * emitted records.
 *
 * @param data - Source items.
 * @param args - Key function and buffering limits.
 * @returns An `ix` async iterable of `{ key, items }` records.
 */
function _trailingGroupBy<T>(data: globalThis.AsyncIterable<T>, args: TrailingGroupByArgs<T>) {
    async function* iter() {
        // A Map keeps arbitrary keys (e.g. "__proto__") safe and lets emitted
        // groups be removed so the key set does not grow without bound.
        const groups = new Map<string, T[]>()
        let buffered = 0

        for await (const item of data) {
            // Await the key first, then stringify: calling toString() on the
            // pending promise would produce "[object Promise]" for every item.
            const id = (await args.groupBy(item)).toString()

            if (buffered >= args.totalItemsInMemory) {
                // Memory pressure: emit the largest group to make room.
                let largestKey: string | undefined
                let largestItems: T[] = []
                for (const [key, items] of groups) {
                    if (items.length > largestItems.length) {
                        largestKey = key
                        largestItems = items
                    }
                }
                if (largestKey !== undefined) {
                    groups.delete(largestKey)
                    // Keep the counter in sync with what is actually buffered,
                    // otherwise every later item would trigger an eviction.
                    buffered -= largestItems.length
                    yield {
                        key: largestKey,
                        items: largestItems
                    }
                }
            }

            // Looked up after the eviction because the evicted group may be
            // the one this item belongs to.
            let bucket = groups.get(id)
            if (bucket === undefined) {
                bucket = []
                groups.set(id, bucket)
            }
            bucket.push(item)
            buffered += 1

            if (bucket.length >= args.maxGroupSize) {
                groups.delete(id)
                buffered -= bucket.length
                yield {
                    key: id,
                    items: bucket
                }
            }
        }

        // Flush whatever is still buffered so no items are lost at the end.
        for (const [key, items] of groups) {
            yield {
                key,
                items
            }
        }
    }
    return AsyncIterable.from(iter())
}
/**
 * Public entry point for grouping items by key with bounded buffering; the
 * work is done by `_trailingGroupBy`.
 *
 * Two call forms are declared:
 *  - `trailingGroupBy(args)` returns a function that takes the data;
 *  - `trailingGroupBy(data, args)` returns the grouped async iterable directly.
 *
 * NOTE(review): the choice between the two forms is delegated to `P.purry`,
 * presumably based on the number of arguments — confirm against ts-prime.
 */
export function trailingGroupBy<T>(args: TrailingGroupByArgs<T>): (data: AnyIterable<T>) => AsyncIterable<GroupedItems<T>>
export function trailingGroupBy<T>(data: AnyIterable<T>, args: TrailingGroupByArgs<T>): AsyncIterable<GroupedItems<T>>
export function trailingGroupBy() {
return P.purry(_trailingGroupBy, arguments)
}
/**
 * Options accepted by `trailingMap`.
 *
 * @typeParam T - Type of the source items.
 * @typeParam R - Type produced by `mapFunc`.
 */
export interface TrailingMapArgs<T, R> {
    /** Asynchronous transform applied to each source item. */
    mapFunc: (data: T) => Promise<R>

    /**
     * Number of in-flight `mapFunc` calls at which the mapper waits for one of
     * them to settle before starting more.
     */
    maxConcurrency: number
}
/**
 * Maps every item of `data` through `args.mapFunc`, keeping at most
 * `args.maxConcurrency` calls in flight, and yields each result as soon as
 * its call settles.
 *
 * Results are yielded in completion order, which is not necessarily the order
 * of the source items. Source items are pulled one at a time; mapping of
 * already-pulled items continues while the next item is being fetched.
 *
 * A rejection from `mapFunc` is rethrown to the consumer when that call wins
 * the race for the next result.
 *
 * @param data - Source items.
 * @param args - Mapping function and concurrency limit. Limits that are not a
 *   positive integer are floored and clamped to 1.
 * @returns An `ix` async iterable of mapped results.
 */
function _trailingMap<T, R>(data: AnyIterable<T>, args: TrailingMapArgs<T, R>) {
    async function* iter() {
        // 0, negative, fractional or NaN limits would otherwise never match
        // the "queue is full" condition and the loop would never wait.
        const limit = Math.max(1, Math.floor(args.maxConcurrency) || 1)
        const source = AsyncIterable.from(data)[Symbol.asyncIterator]()
        const inFlight = new Map<number, Promise<{ id: number, result: R }>>()
        // A counter gives every call a unique id; timestamps collide when
        // several calls start within the same millisecond.
        let nextId = 0
        let exhausted = false

        try {
            while (true) {
                // Top up the pool until it is full or the source runs dry.
                while (!exhausted && inFlight.size < limit) {
                    const step = await source.next()
                    if (step.done) {
                        // The terminal step carries no item: do not map it.
                        exhausted = true
                        break
                    }
                    const id = nextId++
                    const value = step.value
                    const task = (async () => ({ id, result: await args.mapFunc(value) }))()
                    // Mark the promise as handled so a rejection that happens
                    // while we are not racing is not reported as unhandled;
                    // the error still surfaces through Promise.race below.
                    task.catch(() => undefined)
                    inFlight.set(id, task)
                }

                if (inFlight.size === 0) {
                    break
                }

                const settled = await Promise.race(inFlight.values())
                inFlight.delete(settled.id)
                yield settled.result
            }
        } finally {
            // Release the source when the consumer stops early or an error
            // aborts the loop.
            if (typeof source.return === "function") {
                await source.return()
            }
        }
    }
    return AsyncIterable.from(iter())
}
/**
 * Public entry point for mapping items through an asynchronous function with
 * a concurrency limit; the work is done by `_trailingMap`.
 *
 * Two call forms are declared:
 *  - `trailingMap(args)` returns a function that takes the data;
 *  - `trailingMap(data, args)` returns the async iterable directly.
 *
 * NOTE(review): the choice between the two forms is delegated to `P.purry`,
 * presumably based on the number of arguments — confirm against ts-prime.
 */
export function trailingMap<T, R>(args: TrailingMapArgs<T, R>): (data: AnyIterable<T>) => AsyncIterable<R>
export function trailingMap<T, R>(data: AnyIterable<T>, args: TrailingMapArgs<T, R>): AsyncIterable<R>
export function trailingMap() {
return P.purry(_trailingMap, arguments)
}
/**
 * Forwards every item of `data` unchanged and invokes `callback` once the
 * source has been iterated to its end.
 *
 * The call sits after the loop and outside any `finally`, so it is skipped
 * when iteration throws or the generator is closed before the source ends.
 *
 * @param data - Source items.
 * @param callback - Invoked once, after the last item has been forwarded.
 * @returns An `ix` async iterable yielding the same items as `data`.
 */
function _onDone<T>(data: AnyIterable<T>, callback: () => void) {
    const passThrough = async function* () {
        for await (const entry of data) {
            yield entry
        }
        // Only reached when the source completed normally.
        callback()
    }

    return AsyncIterable.from(passThrough())
}
/**
 * Public entry point that passes items through and runs a callback after the
 * source has been fully iterated; the work is done by `_onDone`.
 *
 * Two call forms are declared:
 *  - `onDone(callback)` returns a function that takes the data;
 *  - `onDone(data, callback)` returns the async iterable directly.
 *
 * NOTE(review): the choice between the two forms is delegated to `P.purry`,
 * presumably based on the number of arguments — confirm against ts-prime.
 */
export function onDone<T>(callback: () => void): (data: AnyIterable<T>) => AsyncIterable<T>
export function onDone<T>(data: AnyIterable<T>, callback: () => void): AsyncIterable<T>
export function onDone() {
return P.purry(_onDone, arguments)
}
/**
 * Mutable counters behind progress reports: how many items were seen and how
 * long the most recent gaps between consecutive items lasted.
 */
class ProgressTrack {
    /** Total number of `addItem` calls so far. */
    items = 0
    /** Set to `true` by the first `addItem` call. */
    isRunning = false
    /** `Date.now()` of the latest `addItem` call; 0 means "no item yet". */
    startTime = 0
    /** Milliseconds between consecutive `addItem` calls, most recent last. */
    rollingDurations: number[] = []

    /** Records one more item and the time elapsed since the previous one. */
    addItem() {
        const hasPreviousItem = this.startTime !== 0
        if (hasPreviousItem) {
            const elapsed = Date.now() - this.startTime
            this.rollingDurations.push(elapsed)
            // Drop the oldest sample once the window reaches 20 entries, so
            // at most 19 samples are retained between calls.
            if (this.rollingDurations.length >= 20) {
                this.rollingDurations.shift()
            }
        }

        this.isRunning = true
        this.items += 1
        this.startTime = Date.now()
    }

    /**
     * Arithmetic mean of the recorded gaps, in milliseconds, as computed by
     * `P.stats`.
     * NOTE(review): the result for an empty window depends on `P.stats` —
     * confirm against ts-prime.
     */
    get average() {
        return P.stats(this.rollingDurations, (duration) => duration).arithmetic_mean
    }
}
/**
 * Snapshot-style view over a `ProgressTrack`, handed to progress callbacks
 * for rendering as text or JSON.
 */
class Progress {
    constructor(private progress: ProgressTrack) { }

    /**
     * Human-readable summary: item count, speed in items per second (only
     * once more than one item has been seen), and current heap usage as
     * formatted by `formatBytes`.
     */
    toString() {
        const { items } = this.progress

        let speed = ""
        if (items > 1) {
            // average is in milliseconds per item; invert it to items/second.
            const perSecond = 1 / (this.progress.average / 1000)
            speed = ` Speed: ${perSecond.toFixed(2)} items/s,`
        }

        const memory = formatBytes(process.memoryUsage().heapUsed)
        return `Items: ${items.toLocaleString()},${speed} Memory: ${memory}`
    }

    /** Machine-readable summary: `speed` in items per second and `items`. */
    toJSON() {
        const { average, items } = this.progress
        return {
            speed: 1 / (average / 1000),
            items
        }
    }
}
/** Options accepted by `onProgress`. */
interface OnProgressArgs {
    /** Receives a `Progress` view each time a report is produced. */
    progress: (data: Progress) => void

    /**
     * Delay between periodic reports, passed to `setInterval`.
     * A missing or falsy value falls back to 2000.
     */
    progressFrequency?: number
}
/**
 * Passes every item of `data` through unchanged while reporting progress.
 *
 * `args.progress` is called once immediately, when the pipeline is built, and
 * then periodically (every `args.progressFrequency` milliseconds, 2000 when
 * missing or falsy) for as long as the returned iterable is being iterated.
 *
 * The timer is created when iteration starts and cleared in a `finally`
 * block, so it is released when the source ends, when iteration throws, and
 * when the generator is closed early. An iterable that is built but never
 * consumed no longer leaves a timer running.
 *
 * @param data - Source items.
 * @param args - Progress callback and reporting frequency.
 * @returns An `ix` async iterable yielding the same items as `data`.
 */
function _onProgress<T>(data: AnyIterable<T>, args: OnProgressArgs) {
    const progressInstance = new ProgressTrack()

    async function* iter() {
        // Started lazily: an eagerly created interval would keep firing (and
        // keep the process alive) if the iterable were never consumed.
        const interval = setInterval(() => {
            args.progress(new Progress(progressInstance))
        }, args.progressFrequency || 2000)
        try {
            for await (const item of data) {
                progressInstance.addItem()
                yield item
            }
        } finally {
            clearInterval(interval)
        }
    }

    // Initial report, emitted before any item has been pulled.
    args.progress(new Progress(progressInstance))
    return AsyncIterable.from(iter())
}
/**
 * Public entry point that passes items through while reporting progress to
 * `args.progress`; the work is done by `_onProgress`.
 *
 * Two call forms are declared:
 *  - `onProgress(args)` returns a function that takes the data;
 *  - `onProgress(data, args)` returns the async iterable directly.
 *
 * NOTE(review): the choice between the two forms is delegated to `P.purry`,
 * presumably based on the number of arguments — confirm against ts-prime.
 */
export function onProgress<T>(args: OnProgressArgs): (data: AnyIterable<T>) => AsyncIterable<T>
export function onProgress<T>(data: AnyIterable<T>, args: OnProgressArgs): AsyncIterable<T>
export function onProgress() {
return P.purry(_onProgress, arguments)
}