@attestate/crawler
Version:
@attestate/crawler is a tool chain to retrieve on-chain data from Ethereum.
429 lines (383 loc) • 11.9 kB
JavaScript
//@format
import { createInterface } from "readline";
import { createReadStream, appendFileSync } from "fs";
import { rm, rename } from "fs/promises";
import EventEmitter, { once } from "events";
import Ajv from "ajv";
import addFormats from "ajv-formats";
import util from "util";
import { execute } from "@attestate/extraction-worker";
import * as database from "./database.mjs";
import workerMessage from "./schemata/messages/worker.mjs";
import { NotFoundError } from "./errors.mjs";
import { inDataDir, fileExists } from "./disc.mjs";
import logger from "./logger.mjs";
// Numeric codes used by `extract`: `FAILURE` is attached as `code` to the
// errors it rejects with, while the `SHUTDOWN_*` values are returned as `code`
// when it finishes right after `init` or after the last `update` call.
export const EXTRACTOR_CODES = {
FAILURE: 1,
SHUTDOWN_IN_INIT: 2,
SHUTDOWN_IN_UPDATE: 3,
};
// Logger for this module, created under the "lifecycle" name.
const log = logger("lifecycle");
// Single Ajv instance used for worker message validation; `addFormats`
// registers the ajv-formats plugin on it.
const ajv = new Ajv();
addFormats(ajv);
/**
 * Checks a message against the `workerMessage` JSON schema.
 *
 * A message that fails validation is reported through the module logger,
 * followed by the validator's error list.
 *
 * @param {object} message - Candidate message for the worker.
 * @returns {boolean} `true` when the message satisfies the schema,
 *   `false` otherwise.
 */
function validateWorkerMessage(message) {
  const validate = ajv.compile(workerMessage);
  if (validate(message)) {
    return true;
  }
  log("Found 1 or more validation error, ignoring worker message:", message);
  log(validate.errors);
  return false;
}
/**
 * Tags messages with their commissioner and drops the ones that fail
 * worker-message validation.
 *
 * Each output message is a shallow copy of the input with `commissioner`
 * added; a `commissioner` property already present on the input message takes
 * precedence because the message is spread last.
 *
 * @param {Array<object>|null|undefined} messages - Messages produced by a
 *   strategy. A missing list is treated as empty.
 * @param {string} commissioner - Name stored on each message as `commissioner`.
 * @returns {Array<object>} The tagged messages that passed validation.
 */
export function prepareMessages(messages, commissioner) {
  // A strategy result may omit `messages` altogether; that means "nothing to
  // send", not a reason to throw on `.map` of undefined.
  return (messages ?? [])
    .map((message) => ({ commissioner, ...message }))
    .filter(validateWorkerMessage);
}
/**
 * Streams the transformer's input file line by line through the strategy
 * module and appends whatever the module returns to the output file.
 *
 * `onLine` is called for every line with a copy of `state` extended by
 * `line`; `onClose` is called once after the reader has closed with the
 * unmodified `state`. Truthy return values are appended to the output file,
 * each followed by a newline.
 *
 * @param {string} name - Strategy name, only used in log output.
 * @param {object} strategy - Transformer config with `input.name`,
 *   `output.name`, `args` and `module` (`onLine`, `onClose`).
 * @param {object} state - State passed through to the module.
 * @returns {Promise<Array|undefined>} An empty array once finished, or
 *   `undefined` when the input file doesn't exist.
 */
export async function transform(name, strategy, state) {
  const inputPath = inDataDir(strategy.input.name);
  const inputPresent = await fileExists(inputPath);
  if (!inputPresent) {
    log(
      `Skipping "${name}" transformation as input path doesn't exist "${inputPath}"`
    );
    return;
  }

  const reader = createInterface({
    input: createReadStream(inputPath),
    crlfDelay: Infinity,
  });
  const outputPath = inDataDir(strategy.output.name);

  // Appends one record to the output file; falsy module results are skipped.
  const emit = (content) => {
    if (content) {
      appendFileSync(outputPath, `${content}\n`);
    }
  };

  reader.on("line", (line) => {
    const lineState = { ...state, line };
    emit(
      strategy.module.onLine({
        args: strategy.args,
        state: lineState,
        execute,
      })
    );
  });
  reader.on("error", (error) => {
    log(`Error from rl in transform: ${error.stack}`);
  });

  await once(reader, "close");
  emit(strategy.module.onClose({ args: strategy.args, state, execute }));
  return [];
}
/**
 * Runs the extraction phase of a strategy.
 *
 * `strategy.module.init` is called first. Its `write` value (if truthy) is
 * appended to the output file and its `messages` are posted to the worker.
 * Each answer arriving on `messageRouter` under `${name}-extraction` is fed
 * to `strategy.module.update`, whose result is handled the same way. The
 * extraction ends once no posted message is unanswered and no `update` call
 * is still running.
 *
 * A result without `messages` is treated as "no messages". The progress
 * interval and the router listener are released on every exit path, and
 * every failure is logged before it is propagated.
 *
 * @param {string} name - Strategy name; also the commissioner of all messages.
 * @param {object} strategy - Extractor config with `args`, `output.name` and
 *   `module` (`init`, `update`).
 * @param {object} worker - Object exposing `postMessage(message)`.
 * @param {EventEmitter} messageRouter - Emits worker answers per strategy.
 * @param {object} state - State passed to the strategy module.
 * @param {object} config - Crawler config; `config.environment` is forwarded.
 * @returns {Promise<{code: number}>} `SHUTDOWN_IN_INIT` when `init` produced
 *   no valid messages, `SHUTDOWN_IN_UPDATE` when the last answer was handled.
 * @throws Whatever `init`/`update` throw, or an `Error` with
 *   `code === EXTRACTOR_CODES.FAILURE` for an empty result or a failed write.
 */
export async function extract(
  name,
  strategy,
  worker,
  messageRouter,
  state,
  config
) {
  const type = "extraction";
  const topic = `${name}-${type}`;
  // Messages posted to the worker whose answer hasn't arrived yet.
  let numberOfMessages = 0;
  // Answers whose (async) handling has started but not finished.
  let activeUpdates = 0;
  let callback;

  const interval = setInterval(() => {
    log(
      `${name} extractor is running with ${numberOfMessages} messages pending`
    );
  }, 120_000);

  // Idempotent, so it can be called from every exit path.
  const cleanup = () => {
    clearInterval(interval);
    if (callback) {
      messageRouter.off(topic, callback);
    }
  };

  const invalidResultError = (result) => {
    const error = new Error(
      `Strategy "${name}-extraction" didn't return a valid result: "${JSON.stringify(
        result
      )}"`
    );
    error.code = EXTRACTOR_CODES.FAILURE;
    return error;
  };

  // Appends `content` to the output file. `pathLabel` only preserves the
  // exact wording of the two original error messages.
  const persist = (outputPath, content, pathLabel) => {
    if (!content) {
      return;
    }
    try {
      appendFileSync(outputPath, `${content}\n`);
    } catch (err) {
      const error = new Error(
        `Couldn't write to file after update. ${pathLabel}: "${outputPath}", Content: "${content}"`,
        { cause: err }
      );
      error.code = EXTRACTOR_CODES.FAILURE;
      throw error;
    }
  };

  // Posts all valid messages and returns how many were posted.
  const dispatch = (messages) => {
    const prepared = prepareMessages(messages ?? [], name);
    for (const message of prepared) {
      numberOfMessages++;
      worker.postMessage(message);
    }
    return prepared.length;
  };

  try {
    const initial = await strategy.module.init({
      args: strategy.args,
      state,
      execute,
      environment: config.environment,
    });
    if (!initial) {
      throw invalidResultError(initial);
    }
    const outputPath = inDataDir(strategy.output.name);
    persist(outputPath, initial.write, "Output path");

    return await new Promise((resolve, reject) => {
      callback = async (message) => {
        numberOfMessages--;
        activeUpdates++;
        log(`Leftover Lifecycle Messages: ${numberOfMessages}`);
        try {
          if (message.error) {
            log(
              `Received error message from worker for strategy "${name}": "${message.error}"`
            );
          } else {
            const update = await strategy.module.update({
              args: strategy.args,
              state,
              message,
              execute,
              environment: config.environment,
            });
            if (!update) {
              throw invalidResultError(update);
            }
            dispatch(update.messages);
            persist(outputPath, update.write, "Output Path");
          }
        } catch (err) {
          cleanup();
          reject(err);
          return;
        } finally {
          activeUpdates--;
        }
        // Only shut down when no other handler is still running: it may
        // still write output or post follow-up messages.
        if (numberOfMessages === 0 && activeUpdates === 0) {
          log("Shutting down extraction in update callback function");
          cleanup();
          resolve({ code: EXTRACTOR_CODES.SHUTDOWN_IN_UPDATE });
        }
      };
      // Listen before posting so that no answer can be missed.
      messageRouter.on(topic, callback);
      if (dispatch(initial.messages) === 0) {
        log("Shutting down extraction in init follow-up function");
        cleanup();
        resolve({ code: EXTRACTOR_CODES.SHUTDOWN_IN_INIT });
      }
    });
  } catch (err) {
    log(`Err in extract promise: ${String(err)} ${err?.stack}`);
    throw err;
  } finally {
    cleanup();
  }
}
/**
 * Reads the loader's input file line by line and stores the entries the
 * strategy module derives from each line.
 *
 * For every non-empty line, the entries returned by `module.order` are
 * written with `database.toOrder`, then the entries returned by
 * `module.direct` with `database.toDirect`, one after another.
 *
 * @param {string} name - Strategy name, forwarded to the database helpers.
 * @param {object} strategy - Loader config with `input.name`, `args` and
 *   `module` (`order`, `direct`).
 * @param {object} db - Database handle forwarded to the database helpers.
 * @param {object} state - State passed to the module; each call receives a
 *   copy extended by the current `line`.
 * @returns {Promise<void>}
 */
export async function load(name, strategy, db, state) {
  const inputPath = inDataDir(strategy.input.name);
  const inputPresent = await fileExists(inputPath);
  if (!inputPresent) {
    log(
      `Skipping "${name}" loading as input path doesn't exist "${inputPath}"`
    );
    return;
  }

  const reader = createInterface({
    input: createReadStream(inputPath),
    crlfDelay: Infinity,
  });

  for await (const line of reader) {
    if (line !== "") {
      const lineState = { ...state, line };
      const props = { args: strategy.args, state: lineState, execute };
      for (const entry of strategy.module.order(props)) {
        const { key, value } = entry;
        await database.toOrder(db, name, key, value);
      }
      for (const entry of strategy.module.direct(props)) {
        const { key, value } = entry;
        await database.toDirect(db, name, key, value);
      }
    }
  }
}
/**
 * Forwards every worker message to the extractor that commissioned it.
 *
 * The message is re-emitted on `messageRouter` under
 * `${message.commissioner}-extraction`. A message without a commissioner
 * cannot be routed: it is logged and an error is thrown from the listener.
 *
 * @param {EventEmitter} messageRouter - Router the messages are re-emitted on.
 * @param {object} worker - Emitter of `"message"` events.
 * @returns {Function} The attached listener, so the caller can detach it.
 */
function subscribe(messageRouter, worker) {
  const forward = (message) => {
    const { commissioner } = message;
    if (commissioner) {
      messageRouter.emit(`${commissioner}-extraction`, message);
      return;
    }
    // NOTE: This is fatal and we can't continue
    const errorMessage = `Can't redirect; message.commissioner is ${commissioner}`;
    log(errorMessage);
    throw new Error(errorMessage);
  };
  worker.on("message", forward);
  return forward;
}
/**
 * Collects the coordinator module's local and remote values.
 *
 * Opens the sub-database identified by `database.order(name)`, passes it to
 * `module.local`, and afterwards calls `module.remote` with the configured
 * environment and the `execute` function.
 *
 * @param {object} db - Database exposing `openDB`.
 * @param {string} name - Strategy name used to derive the sub-database.
 * @param {object} config - Crawler config; `config.environment` is forwarded.
 * @param {object} module - Coordinator module with `local` and `remote`.
 * @returns {Promise<{local: *, remote: *}>} Both resolved values.
 */
export async function latest(db, name, config, module) {
  const orderDb = db.openDB(database.order(name));
  const local = await module.local(orderDb);
  const { environment } = config;
  const remote = await module.remote({ environment, execute });
  return { local, remote };
}
/**
 * Computes the crawl state for a strategy, if it can be computed at all.
 *
 * State is only available when a database is open and the coordinator module
 * provides both `local` and `remote`; in every other case an empty state
 * object is returned.
 *
 * @param {object|undefined} db - Open database, if the strategy has one.
 * @param {string} name - Strategy name.
 * @param {object} config - Crawler config, forwarded to `latest`.
 * @param {object|undefined} module - Coordinator module, if configured.
 * @returns {Promise<object>} `{}` or the result of `latest`.
 */
async function compute(db, name, config, module) {
  // Any missing piece disables state computation. The checks must be joined
  // with `||`: with `&&`, an absent module was dereferenced (TypeError) and a
  // present but incomplete module was never checked.
  if (!db || !module || !module.local || !module.remote) {
    return {};
  }
  return await latest(db, name, config, module);
}
/**
 * Executes one crawl iteration of a strategy and schedules the next one.
 *
 * Steps, in order: open the loader database (when the loader names an
 * output), compute the state, run the configured extractor, transformer and
 * loader phases, tidy the extractor and transformer outputs, and call
 * `strategy.end` when it is a function. If `coordinator.interval` is truthy,
 * the function waits that many milliseconds and hands over to `reinvocation`.
 *
 * @param {object} strategy - Strategy definition from the crawl path.
 * @param {object} worker - Worker handle forwarded to `extract`.
 * @param {EventEmitter} messageRouter - Router forwarded to `extract`.
 * @param {object} config - Crawler config.
 * @param {Function} [reinvocation=run] - Called with the same arguments to
 *   start the next iteration.
 * @returns {Promise<*>} The result of `reinvocation`, or `undefined` when no
 *   interval is configured.
 */
export async function run(
  strategy,
  worker,
  messageRouter,
  config,
  reinvocation = run
) {
  const dbName = strategy?.loader?.output?.name;
  const db = dbName ? database.open(inDataDir(dbName)) : undefined;

  const label = strategy.name;
  const state = await compute(db, label, config, strategy.coordinator?.module);

  const extractor = strategy.extractor;
  if (extractor) {
    log(
      `Starting extractor strategy "${label}" with params "${JSON.stringify(
        extractor.args
      )}"`
    );
    await extract(label, extractor, worker, messageRouter, state, config);
    log(`Ending extractor strategy "${label}"`);
  }

  const transformer = strategy.transformer;
  if (transformer) {
    log(`Starting transformer strategy "${label}"`);
    await transform(label, transformer, state);
    log(`Ending transformer strategy "${label}"`);
  }

  const loader = strategy.loader;
  if (loader) {
    log(`Starting loader strategy "${label}"`);
    await load(label, loader, db, state);
    log(`Ending loader strategy "${label}"`);
  }

  // The intermediate files are removed (or archived) one after the other.
  const outputs = [
    strategy?.extractor?.output?.name,
    strategy?.transformer?.output?.name,
  ];
  for (const output of outputs) {
    await tidy(output, strategy?.coordinator?.archive);
  }

  if (strategy?.end && typeof strategy.end === "function") {
    log(`Ending crawl iteration by calling end function`);
    await strategy.end();
  }

  const pause = strategy.coordinator?.interval;
  if (!pause) {
    return;
  }
  log(`Waiting "${pause}ms" to repeat the task`);
  await new Promise((resolve) => setTimeout(resolve, pause));
  return reinvocation(strategy, worker, messageRouter, config, reinvocation);
}
/**
 * Starts every strategy of the crawl path and waits until all have settled.
 *
 * The strategies are started together via `run`; a failing strategy doesn't
 * stop the others. Because `Promise.allSettled` never rejects, each failure
 * is logged here so that it doesn't go unnoticed.
 *
 * @param {object} worker - Worker handle forwarded to `run`.
 * @param {object} config - Crawler config; `config.path` lists the strategies.
 * @param {EventEmitter} messageRouter - Router forwarded to `run`.
 * @returns {Promise<Array<object>>} The `Promise.allSettled` results, in the
 *   order of `config.path`.
 */
export async function walk(worker, config, messageRouter) {
  log(
    `Starting to execute strategies with the following crawl path`,
    util.inspect(config.path, {
      depth: null,
      colors: true,
      // `breakLength` is documented as a number; the string "Infinity" only
      // worked through implicit coercion.
      breakLength: Infinity,
      compact: true,
    })
  );
  const outcomes = await Promise.allSettled(
    config.path.map((strategy) => run(strategy, worker, messageRouter, config))
  );
  outcomes.forEach((outcome, index) => {
    if (outcome.status === "rejected") {
      const label = config.path[index]?.name ?? index;
      const reason = outcome.reason?.stack ?? outcome.reason;
      log(`Strategy "${label}" failed: ${reason}`);
    }
  });
  return outcomes;
}
/**
 * Removes or archives a strategy output file in the data directory.
 *
 * Nothing happens when `name` is falsy or the file doesn't exist. Otherwise
 * the file is renamed to `${Date.now()}_${name}` (inside the data directory)
 * when `archive` is truthy, and deleted when it isn't.
 *
 * @param {string|undefined} name - File name relative to the data directory.
 * @param {boolean|undefined} archive - Keep the file under a timestamped name
 *   instead of deleting it.
 * @returns {Promise<void>}
 */
export async function tidy(name, archive) {
  if (!name) {
    log(`tidy: Archive name "${name}" is undefined. Skipping."`);
    return;
  }

  const target = inDataDir(name);
  const present = await fileExists(target);
  if (!present) {
    log(`Skipping "${target}" removal as path doesn't exist`);
    return;
  }

  if (!archive) {
    log(`Deleting "${name}" outputs to then repeat task`);
    await rm(target);
    return;
  }

  log(`Renaming "${name}" outputs to then repeat task`);
  const archived = inDataDir(`${Date.now()}_${name}`);
  await rename(target, archived);
}
/**
 * Entry point of the lifecycle: wires the worker to a fresh message router,
 * walks the crawl path and finally tells the worker to exit.
 *
 * The worker's message listener is detached in all cases, including when
 * `walk` throws; the exit message is only posted after `walk` completed.
 *
 * @param {object} worker - Worker handle (`on`/`off`/`postMessage`).
 * @param {object} config - Crawler config, forwarded to `walk`.
 * @returns {Promise<void>}
 */
export async function init(worker, config) {
  const router = new EventEmitter();
  // Keep a reference to the listener so that it can be detached afterwards.
  const listener = subscribe(router, worker);
  try {
    await walk(worker, config, router);
    log("All strategies executed");
    const exitMessage = { type: "exit", version: "0.0.1" };
    worker.postMessage(exitMessage);
  } finally {
    log("Removing worker message listener");
    worker.off("message", listener);
  }
}