UNPKG

@syngrisi/syngrisi

Version:
464 lines (406 loc) 21.1 kB
/* eslint-disable @typescript-eslint/no-explicit-any */
import { promises as fsp } from 'fs';
import path from 'path';
import mongoose from 'mongoose';
import { config } from '@config';
import { subDays, dateToISO8601 } from '@utils';
import { createTable } from '@utils/stringTable';
import { IOutputWriter } from '../lib/output-writer';
import {
    Snapshot,
    Check,
    Baseline,
} from '../lib';

/**
 * Converts a `process.hrtime()` tuple (`[seconds, nanoseconds]`) into a
 * seconds string with three decimal places.
 */
function parseHrtimeToSeconds(hrtime: [number, number]): string {
    return (hrtime[0] + (hrtime[1] / 1e9)).toFixed(3);
}

/**
 * Turns an identifier of unknown shape into its string form.
 * Returns `null` for falsy input or when `toString()` throws.
 */
const normalizeId = (id: unknown): string | null => {
    if (!id) {
        return null;
    }
    try {
        return id.toString();
    } catch (error) {
        return null;
    }
};

/**
 * Maps aggregation results of the form `{ _id }` to string ids and drops
 * every entry that `normalizeId` could not convert.
 */
const normalizeIdDocs = (docs: { _id: unknown }[]): string[] => docs
    .map((doc) => normalizeId(doc._id))
    .filter((id): id is string => Boolean(id));

/**
 * Counts the non-directory entries whose name ends with `.png` directly inside
 * `dirPath` (no recursion). Entries are streamed through `fsp.opendir` rather
 * than read into one array.
 */
async function countPngFiles(dirPath: string): Promise<number> {
    const dir = await fsp.opendir(dirPath);
    let count = 0;
    try {
        for await (const dirent of dir) {
            if (!dirent.isDirectory() && dirent.name.endsWith('.png')) {
                count += 1;
            }
        }
    } finally {
        try {
            await dir.close();
        } catch (error) {
            // The handle may already be closed once iteration has finished;
            // only that specific error code is ignored.
            const err = error as NodeJS.ErrnoException;
            if (err?.code !== 'ERR_DIR_CLOSED') {
                throw error;
            }
        }
    }
    return count;
}

/** Collects the unique, non-null `snapshootId` values of all Baseline documents. */
async function collectBaselineSnapshotIds(): Promise<string[]> {
    const results = await Baseline.aggregate([
        { $match: { snapshootId: { $ne: null } } },
        { $group: { _id: '$snapshootId' } },
        { $project: { _id: 1 } }
    ]).exec();
    return normalizeIdDocs(results);
}

type CheckSnapshotMatch = Record<string, unknown>;

/**
 * Collects the unique, non-null `baselineId`, `actualSnapshotId` and `diffId`
 * values of the Check documents matching `matchFilter` (all checks by default).
 * The three aggregations are started together and awaited with `Promise.all`.
 */
async function collectCheckSnapshotIds(matchFilter: CheckSnapshotMatch = {}) {
    const buildPipeline = (field: 'baselineId' | 'actualSnapshotId' | 'diffId') => ([
        { $match: { ...matchFilter, [field]: { $ne: null } } },
        { $group: { _id: `$${field}` } },
        { $project: { _id: 1 } }
    ]);

    const [
        baselineResults,
        actualResults,
        diffResults,
    ] = await Promise.all([
        Check.aggregate(buildPipeline('baselineId')).exec(),
        Check.aggregate(buildPipeline('actualSnapshotId')).exec(),
        Check.aggregate(buildPipeline('diffId')).exec(),
    ]);

    return {
        baselineIds: normalizeIdDocs(baselineResults),
        actualIds: normalizeIdDocs(actualResults),
        diffIds: normalizeIdDocs(diffResults),
    };
}

/** Collects the set of unique, non-null `filename` values of all Snapshot documents. */
async function collectSnapshotFilenames(): Promise<Set<string>> {
    const results = await Snapshot.aggregate([
        { $match: { filename: { $ne: null } } },
        { $group: { _id: '$filename' } },
        { $project: { _id: 1 } }
    ]).exec();
    return new Set(results.map((doc) => doc._id as string));
}

/**
 * Deletes `files` (names relative to `config.defaultImagesPath`) using at most
 * `limit` concurrent workers.
 *
 * Every entry of `files` is attempted exactly once; a failing entry is recorded
 * in `failures` and never stops the remaining deletions. A `limit` that is not
 * a finite number >= 1 falls back to a single worker instead of silently
 * deleting nothing.
 *
 * @param files - File names to delete
 * @param limit - Maximum number of concurrent unlink operations
 * @returns Number of deleted files and the list of failed entries
 */
async function deleteFilesWithLimit(files: string[], limit: number) {
    let index = 0;
    let success = 0;
    const failures: { file: string; reason: unknown }[] = [];

    async function worker() {
        // `index` is shared between workers; reading and advancing it happens
        // synchronously (no await in between), so no entry is taken twice.
        while (index < files.length) {
            const current = files[index];
            index += 1;
            if (current === undefined) {
                continue;
            }
            if (current === '') {
                // An empty name would resolve to the images directory itself.
                failures.push({ file: current, reason: new Error('empty filename') });
                continue;
            }
            try {
                await fsp.unlink(path.join(config.defaultImagesPath, current));
                success += 1;
            } catch (error) {
                failures.push({ file: current, reason: error });
            }
        }
    }

    const concurrency = Number.isFinite(limit) && limit >= 1 ? Math.floor(limit) : 1;
    const workers = Array.from({ length: Math.min(concurrency, files.length) }, () => worker());
    await Promise.all(workers);
    return { success, failures };
}

/** Options of the "handle old checks" task. */
export interface HandleOldChecksOptions {
    /** Checks whose `createdDate` is older than this many days are treated as old. */
    days: number;
    /** When `false` only statistics are printed; when `true` old items are removed. */
    remove: boolean;
}

/**
 * Handle old checks task
 * Removes checks and related items that are older than the specified number of days.
 *
 * IMPORTANT: Baseline records are NEVER removed automatically by this task.
 * Baselines represent the reference/golden images and should not be lost.
 * Only Checks and their associated Snapshots are removed, and a Snapshot is
 * kept whenever it is still referenced by a retained Check (as baseline,
 * actual or diff) or by any Baseline.
 *
 * Stages:
 * 1. Statistics: counts checks, snapshots and png files, and collects the
 *    snapshot ids / filenames / total file size that belong to old checks.
 * 2. Removal (only when `options.remove` is set): deletes old checks and their
 *    unreferenced snapshots (inside a transaction when a replica set is
 *    detected), then deletes image files no current snapshot points to.
 * 3. Statistics after removal.
 *
 * PERFORMANCE NOTES:
 * - Uses countDocuments() instead of loading full collections for statistics
 * - Uses aggregation pipelines with $group to collect unique IDs
 * - Only loads required fields (projections) when full documents are needed
 * - Processes file operations in batches to limit concurrent operations
 * - Reuses the filenames computed in stage 1 during removal
 *
 * RECOMMENDED DATABASE INDEXES for optimal performance:
 * - Check.createdDate (for date-based queries)
 * - Check.baselineId, Check.actualSnapshotId, Check.diffId
 * - Snapshot.filename
 * - Baseline.snapshootId (for baseline preservation checks)
 *
 * Any error is written to `output` and re-thrown; `output.end()` is always called.
 *
 * @param options - Task options
 * @param output - Output writer for streaming results
 */
export async function handleOldChecksTask(
    options: HandleOldChecksOptions,
    output: IOutputWriter
): Promise<void> {
    try {
        const startTime = process.hrtime();
        output.write('- starting...\n');

        // Validate that the images directory exists
        try {
            await fsp.access(config.defaultImagesPath);
            output.write(`> validated images directory: ${config.defaultImagesPath}`);
        } catch (error) {
            throw new Error(`Images directory does not exist or is not accessible: ${config.defaultImagesPath}`);
        }

        output.write('STAGE #1 Calculate common stats');
        const thresholdDate = subDays(new Date(), options.days);

        output.write('> count all checks');
        const allChecksCountBefore = await Check.countDocuments().exec();
        output.write('> count snapshots');
        const allSnapshotsCountBefore = await Snapshot.countDocuments().exec();
        output.write('> get files data');
        const allFilesBefore = await countPngFiles(config.defaultImagesPath);
        output.write('> count old checks');
        const oldChecksCount = await Check.countDocuments({ createdDate: { $lt: thresholdDate } }).exec();

        // The three id collections share one helper; its aggregations run
        // concurrently, so all three progress lines are written up front.
        output.write('>>> collect all baselineId snapshot IDs from old Checks ');
        output.write('>>> collect all actualSnapshotId from old Checks ');
        output.write('>>> collect all diffId snapshot IDs from old Checks ');
        const {
            baselineIds: oldChecksBaselineSnapshotIds,
            actualIds: oldChecksActualSnapshotIds,
            diffIds: oldChecksDiffSnapshotIds,
        } = await collectCheckSnapshotIds({ createdDate: { $lt: thresholdDate } });

        output.write('>>> calculate all unique snapshots ids for old Checks ');
        const allOldSnapshotsUniqueIds = Array.from(new Set([
            ...oldChecksBaselineSnapshotIds,
            ...oldChecksActualSnapshotIds,
            ...oldChecksDiffSnapshotIds,
        ]));

        output.write('>>> collect filenames from old snapshots');
        // Only load filenames, not entire snapshot documents
        const oldSnapshotsData = await Snapshot.find(
            { _id: { $in: allOldSnapshotsUniqueIds } },
            { filename: 1 }
        ).lean().exec() as { filename?: string }[];

        // Calculate total size of old snapshot files
        output.write('>>> calculate total size of old snapshot files');
        const oldSnapshotsFilenames = Array.from(
            new Set(oldSnapshotsData.map((x) => x.filename).filter((f): f is string => !!f))
        );
        let totalOldFilesSize = 0;

        // Process files in batches to avoid too many concurrent operations
        const BATCH_SIZE = 100;
        for (let i = 0; i < oldSnapshotsFilenames.length; i += BATCH_SIZE) {
            const batch = oldSnapshotsFilenames.slice(i, i + BATCH_SIZE);
            const batchResults = await Promise.allSettled(
                batch.map(async (filename) => {
                    const filePath = path.join(config.defaultImagesPath, filename);
                    const stats = await fsp.stat(filePath);
                    return stats.size;
                })
            );

            for (const result of batchResults) {
                if (result.status === 'fulfilled') {
                    totalOldFilesSize += result.value;
                }
                // Files that cannot be stat-ed (e.g. missing) are skipped on purpose
            }
        }

        const totalOldFilesSizeGB = (totalOldFilesSize / (1024 * 1024 * 1024)).toFixed(3);

        const outTable = createTable([
            { item: 'all checks', count: allChecksCountBefore },
            { item: 'all snapshots', count: allSnapshotsCountBefore },
            { item: 'all files', count: allFilesBefore },
            { item: `checks older than: '${options.days}' days`, count: oldChecksCount },
            { item: 'old checks baseline snapshot ids', count: oldChecksBaselineSnapshotIds.length },
            { item: 'old checks actual snapshot ids', count: oldChecksActualSnapshotIds.length },
            { item: 'old checks diff snapshot ids', count: oldChecksDiffSnapshotIds.length },
            { item: 'all old snapshots unique Ids', count: allOldSnapshotsUniqueIds.length },
            { item: 'old snapshot filenames', count: oldSnapshotsFilenames.length },
            { item: 'total size of old files', count: `${totalOldFilesSizeGB} GB` },
        ]);

        output.write(outTable);

        if (options.remove) {
            output.write(`STAGE #2 Remove checks that older that: '${options.days}' days, '${dateToISO8601(thresholdDate)}'\n`);

            // Transactions are only attempted when a replica set is detected.
            let session: mongoose.ClientSession | null = null;
            // True only between startTransaction() and a successful commit.
            let transactionOpen = false;

            try {
                // Check MongoDB topology to determine if transactions are supported
                const admin = mongoose.connection.db?.admin();
                const serverInfo = await admin?.serverStatus();
                const isReplicaSet = serverInfo?.repl?.setName !== undefined;

                if (isReplicaSet) {
                    session = await mongoose.startSession();
                    session.startTransaction();
                    transactionOpen = true;
                    output.write('> using transactions for data consistency (replica set detected)');
                } else {
                    output.write('> standalone MongoDB detected, proceeding without transactions');
                }
            } catch (error) {
                output.write('> could not determine MongoDB topology, proceeding without transactions');
                // A session may already exist if the failure happened after
                // startSession(); release it instead of dropping the reference.
                if (session) {
                    if (transactionOpen) {
                        try {
                            await session.abortTransaction();
                        } catch (abortError) {
                            // best effort: the task continues without transactions
                        }
                    }
                    try {
                        await session.endSession();
                    } catch (endError) {
                        // best effort: the task continues without transactions
                    }
                }
                session = null;
                transactionOpen = false;
            }

            const activeSession = session;

            try {
                let retainedCheckSnapshotIds: Awaited<ReturnType<typeof collectCheckSnapshotIds>>;
                let retainedBaselineSnapshotIds: string[];
                try {
                    output.write('> collect current snapshot references');
                    retainedCheckSnapshotIds = await collectCheckSnapshotIds({ createdDate: { $gte: thresholdDate } });
                    retainedBaselineSnapshotIds = await collectBaselineSnapshotIds();
                    output.write('>>> snapshot references collected');
                } catch (collectError) {
                    output.write('>>> failed to collect snapshot references');
                    throw collectError;
                }

                output.write('> remove checks');
                const oldChecksFilter = { createdDate: { $lt: thresholdDate } };
                const checkRemovingResult = activeSession
                    ? await Check.deleteMany(oldChecksFilter, { session: activeSession })
                    : await Check.deleteMany(oldChecksFilter);
                output.write(`>>> removed: '${checkRemovingResult.deletedCount}'`);

                output.write('> remove snapshots');

                // Every snapshot id that a retained Check or any Baseline still points to.
                const stillReferencedIds = new Set<string>([
                    ...retainedCheckSnapshotIds.baselineIds,
                    ...retainedCheckSnapshotIds.actualIds,
                    ...retainedCheckSnapshotIds.diffIds,
                    ...retainedBaselineSnapshotIds,
                ]);
                const onlyUnreferenced = (ids: string[]) => ids.filter((id) => !stillReferencedIds.has(id));
                const removeSnapshots = (ids: string[]) => (
                    activeSession
                        ? Snapshot.deleteMany({ _id: { $in: ids } }, { session: activeSession })
                        : Snapshot.deleteMany({ _id: { $in: ids } })
                );

                output.write('>> remove baselines snapshots');
                output.write('>> remove all old snapshots that not related to new baseline and check items');
                const removedByBaselineSnapshotsResult = await removeSnapshots(
                    onlyUnreferenced(oldChecksBaselineSnapshotIds)
                );
                output.write(`>>> removed: '${removedByBaselineSnapshotsResult.deletedCount}'`);

                output.write('>> remove actual snapshots');
                output.write('>> remove all old snapshots that not related to new baseline and check items');
                const removedByActualSnapshotsResult = await removeSnapshots(
                    onlyUnreferenced(oldChecksActualSnapshotIds)
                );
                output.write(`>>> removed: '${removedByActualSnapshotsResult.deletedCount}'`);

                output.write('>> remove all old diff snapshots');
                // Diff snapshots are filtered through the same reference set, so a
                // diff still used by a retained check is never removed.
                const removedByDiffSnapshotsResult = await removeSnapshots(
                    onlyUnreferenced(oldChecksDiffSnapshotIds)
                );
                output.write(`>>> removed: '${removedByDiffSnapshotsResult.deletedCount}'`);

                // Commit the transaction after all DB operations (if using transactions)
                if (activeSession) {
                    await activeSession.commitTransaction();
                    transactionOpen = false;
                    output.write('>>> Database transaction committed successfully');
                }

                output.write('> remove files');
                output.write('>>> using previously collected old snapshots filenames');
                output.write(`>> found: ${oldSnapshotsFilenames.length}`);

                output.write('> get all current snapshots filenames');
                const currentSnapshotsSet = await collectSnapshotFilenames();

                output.write('>> calculate intersection between all current snapshot filenames and old snapshots filenames');
                const filesIntersection = oldSnapshotsFilenames.filter((filename) => currentSnapshotsSet.has(filename));
                output.write(`>> found: ${filesIntersection.length}`);

                output.write('>> calculate filenames to remove');
                let filesToDelete = oldSnapshotsFilenames.filter((filename) => !currentSnapshotsSet.has(filename));
                output.write(`>> found: ${filesToDelete.length}`);

                // Re-check current snapshots right before deletion so a snapshot
                // created in the meantime keeps its file.
                output.write('>> re-validating files to delete to prevent race condition');
                const currentSnapshotsBeforeDeletion = await collectSnapshotFilenames();
                filesToDelete = filesToDelete.filter((filename) => !currentSnapshotsBeforeDeletion.has(filename));
                output.write(`>> validated: ${filesToDelete.length} files safe to delete`);

                output.write(`>> remove these files: ${filesToDelete.length}`);
                const { success, failures } = await deleteFilesWithLimit(filesToDelete, 25);

                if (failures.length > 0) {
                    output.write(`>> warning: ${failures.length} files failed to delete:`);
                    failures.forEach((failure) => {
                        output.write(` - ${failure.file}: ${failure.reason}`);
                    });
                }
                output.write(`>> done: ${success} files deleted successfully, ${failures.length} failed`);

                output.write('STAGE #3 Calculate common stats after Removing');

                output.write('> count all checks');
                const allChecksCountAfter = await Check.countDocuments().exec();
                output.write('> count snapshots');
                const allSnapshotsCountAfter = await Snapshot.countDocuments().exec();
                output.write('> get files data');
                const allFilesAfter = await countPngFiles(config.defaultImagesPath);

                const outTableAfter = createTable([
                    { item: 'all checks', count: allChecksCountAfter },
                    { item: 'all snapshots', count: allSnapshotsCountAfter },
                    { item: 'all files', count: allFilesAfter },
                ]);
                output.write(outTableAfter);
            } catch (operationError) {
                output.write('>>> Error during operation...');
                // Roll back only while the transaction is still open: after a
                // successful commit there is nothing to abort, and an abort
                // failure must not replace the original error.
                if (activeSession && transactionOpen) {
                    output.write('>>> Rolling back transaction...');
                    try {
                        await activeSession.abortTransaction();
                    } catch (abortError) {
                        const abortMsg = abortError instanceof Error ? abortError.message : String(abortError);
                        output.write(`>>> Failed to roll back transaction: ${abortMsg}`);
                    }
                }
                throw operationError;
            } finally {
                if (activeSession) {
                    try {
                        await activeSession.endSession();
                    } catch (endError) {
                        const endMsg = endError instanceof Error ? endError.message : String(endError);
                        output.write(`>>> Failed to end database session: ${endMsg}`);
                    }
                }
            }
        }

        const elapsedSeconds = parseHrtimeToSeconds(process.hrtime(startTime));
        const elapsedMinutes = (parseFloat(elapsedSeconds) / 60).toFixed(2);

        output.write(`> done in ${elapsedSeconds} seconds (${elapsedMinutes} min)`);
    } catch (e: unknown) {
        const errMsg = e instanceof Error ? e.message : String(e);
        output.write(errMsg);
        throw e;
    } finally {
        output.end();
    }
}