@lenne.tech/cli
Version:
lenne.Tech CLI: lt
219 lines (218 loc) • 9.24 kB
JavaScript
;
// TypeScript-emitted helper: re-exports property `k` of module `m` on object `o`
// under the name `k2` (defaulting to `k`). An already-installed
// `this.__createBinding` takes precedence over this implementation.
var __createBinding = (this && this.__createBinding) || (function () {
    // Engines without Object.create get a plain value copy (no live binding).
    if (!Object.create) {
        return function (o, m, k, k2) {
            o[k2 === undefined ? k : k2] = m[k];
        };
    }
    return function (o, m, k, k2) {
        var targetKey = k2 === undefined ? k : k2;
        var desc = Object.getOwnPropertyDescriptor(m, k);
        // Decide whether the source descriptor can be reused verbatim or has to be
        // replaced by a forwarding getter that always reads the current m[k].
        var needsGetter;
        if (!desc) {
            needsGetter = true;
        }
        else if ("get" in desc) {
            needsGetter = !m.__esModule;
        }
        else {
            needsGetter = desc.writable || desc.configurable;
        }
        if (needsGetter) {
            desc = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, targetKey, desc);
    };
})();
// TypeScript-emitted helper: stores `v` as the `default` member of `o`.
// An already-installed `this.__setModuleDefault` takes precedence.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        // Defined via a descriptor: enumerable, but neither writable nor configurable.
        return function (o, v) {
            Object.defineProperty(o, "default", { enumerable: true, value: v });
        };
    }
    // Fallback for engines without Object.create: ordinary assignment.
    return function (o, v) {
        o["default"] = v;
    };
})();
// TypeScript-emitted helper for `import * as ns` / dynamic `import()` interop.
// Given a module object it returns a namespace-style object; an already-installed
// `this.__importStar` takes precedence over this implementation.
var __importStar = (this && this.__importStar) || (function () {
// Returns the own property names of `o`. On its first call it rebinds itself to
// Object.getOwnPropertyNames (or, if that is missing, a for-in + hasOwnProperty
// fallback) and then delegates to the chosen implementation.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged with __esModule are returned unchanged.
if (mod && mod.__esModule) return mod;
var result = {};
// Re-export every own key except "default" through __createBinding.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// The module object itself becomes the `default` member of the result.
__setModuleDefault(result, mod);
return result;
};
})();
// TypeScript-emitted helper that runs a generator function as an async function.
//   thisArg / _arguments - `this` value and argument list the generator is applied with
//   P                    - promise constructor to use; falls back to the global Promise
//   generator            - generator function whose `yield`s stand in for `await`
// Returns a promise (of type P) that resolves with the generator's return value and
// rejects if the generator throws. An already-installed `this.__awaiter` takes precedence.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wraps a yielded value in a P promise unless it already is one.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; anything it throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Either finish with the return value or wait on the next yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and start driving it.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Flag this CommonJS module as a transpiled ES module and publish its public API.
// The assignments may precede the definitions because all four are hoisted
// function declarations.
Object.defineProperty(exports, "__esModule", { value: true });
exports.getMarkerStatus = getMarkerStatus;
exports.installMarker = installMarker;
exports.resolveDevice = resolveDevice;
exports.runMarker = runMarker;
/**
* marker-pdf integration for the lt CLI.
*
* Marker (https://github.com/datalab-to/marker) is a PyTorch-based
* PDF → Markdown converter with first-class layout, table and equation
* support. On Apple Silicon it leverages Metal Performance Shaders
* (MPS) for GPU-accelerated inference.
*
* The CLI keeps marker in an isolated Python virtualenv under
* `~/.lt/marker/.venv/` so that:
* - we do not pollute the user's global Python environment
* - the ~3 GB of model weights are downloaded only once
* - subsequent runs start instantly (cached models)
*/
const child_process_1 = require("child_process");
const fs_1 = require("fs");
const promises_1 = require("fs/promises");
const os_1 = require("os");
const path_1 = require("path");
const util_1 = require("util");
// Promise-returning wrapper around child_process.exec.
const execAsync = (0, util_1.promisify)(child_process_1.exec);
// Root directory for everything marker-related: <home>/.lt/marker
const MARKER_HOME = (0, path_1.join)((0, os_1.homedir)(), '.lt', 'marker');
// Isolated virtualenv that holds marker and its Python dependencies.
const VENV_DIR = (0, path_1.join)(MARKER_HOME, '.venv');
// NOTE(review): `bin/` + `python3` is the POSIX venv layout; Windows venvs presumably
// use a different directory and executable names — confirm whether Windows is a target.
const VENV_BIN = (0, path_1.join)(VENV_DIR, 'bin');
const VENV_PYTHON = (0, path_1.join)(VENV_BIN, 'python3');
// Entry point used for a single input file.
const VENV_MARKER_SINGLE = (0, path_1.join)(VENV_BIN, 'marker_single');
// Entry point used when the input is a directory.
const VENV_MARKER_BATCH = (0, path_1.join)(VENV_BIN, 'marker');
/**
 * Report which prerequisites for marker are present on this machine.
 *
 * `python3 --version` and `uv --version` are executed one after the other; a
 * command that fails simply leaves its flag at `false`. marker counts as
 * installed only when both the `marker_single` and the `marker` entry points
 * exist inside the virtualenv.
 *
 * @returns {Promise<{installed: boolean, pythonAvailable: boolean, uvAvailable: boolean, venvPath: string}>}
 */
function getMarkerStatus() {
    return __awaiter(this, void 0, void 0, function* () {
        const status = {
            installed: false,
            pythonAvailable: false,
            uvAvailable: false,
            venvPath: VENV_DIR,
        };
        // [flag to set, command that must succeed] — probed sequentially, in this order.
        const probes = [
            ['pythonAvailable', 'python3 --version'],
            ['uvAvailable', 'uv --version'],
        ];
        for (const [flag, command] of probes) {
            try {
                yield execAsync(command);
                status[flag] = true;
            }
            catch (_err) {
                // Tool missing or not runnable: the flag stays false.
                // (Without uv the installer falls back to python -m venv + pip.)
            }
        }
        const hasSingle = (0, fs_1.existsSync)(VENV_MARKER_SINGLE);
        status.installed = hasSingle && (0, fs_1.existsSync)(VENV_MARKER_BATCH);
        return status;
    });
}
/**
 * Install marker-pdf into ~/.lt/marker/.venv.
 *
 * Preferred path: `uv venv --python 3.12` + `uv pip install marker-pdf psutil`.
 * Fallback: `python3 -m venv` + the virtualenv's own `pip install`.
 *
 * Does nothing (apart from a progress message) when marker is already installed.
 *
 * @param {{onProgress?: (message: string) => void}} [opts]
 *   Optional callback that receives human-readable progress messages.
 * @returns {Promise<void>}
 * @throws {Error} If python3 is not on PATH, if one of the install commands
 *   fails, or if the marker entry points are missing after installation.
 */
function installMarker() {
    return __awaiter(this, arguments, void 0, function* (opts = {}) {
        var _a;
        const log = (_a = opts.onProgress) !== null && _a !== void 0 ? _a : (() => { });
        const status = yield getMarkerStatus();
        if (status.installed) {
            log('marker already installed');
            return;
        }
        if (!status.pythonAvailable) {
            throw new Error('python3 is required but not found in PATH. Install Python 3.10+ (e.g. via Homebrew: `brew install python@3.12`)');
        }
        // execFile hands every argument to the program verbatim (no shell), so paths
        // containing spaces, quotes, `$` or backticks cannot break or alter the command.
        const execFileAsync = (0, util_1.promisify)(child_process_1.execFile);
        yield (0, promises_1.mkdir)(MARKER_HOME, { recursive: true });
        const useUv = status.uvAvailable;
        // 1. Create virtualenv
        if (useUv) {
            log('Creating venv with uv (Python 3.12)…');
            yield execFileAsync('uv', ['venv', '--python', '3.12', VENV_DIR], { cwd: MARKER_HOME });
        }
        else {
            log('Creating venv with python3 (uv not found, falling back)…');
            yield execFileAsync('python3', ['-m', 'venv', VENV_DIR], { cwd: MARKER_HOME });
        }
        // 2. Install marker-pdf + psutil
        //    psutil is needed by the marker batch CLI; it is a soft dep on some
        //    marker-pdf releases, so we install it explicitly.
        const packages = ['marker-pdf', 'psutil'];
        const installer = useUv ? 'uv' : (0, path_1.join)(VENV_BIN, 'pip');
        const installArgs = useUv
            ? ['pip', 'install', '--python', VENV_PYTHON, ...packages]
            : ['install', ...packages];
        log('Installing marker-pdf + dependencies (~3 GB models will download on first run)…');
        // Increase maxBuffer because pip output is large
        yield execFileAsync(installer, installArgs, { cwd: MARKER_HOME, maxBuffer: 100 * 1024 * 1024 });
        // Verify the same entry points getMarkerStatus() requires, so that success is
        // only reported when a subsequent run will actually see marker as installed.
        for (const entryPoint of [VENV_MARKER_SINGLE, VENV_MARKER_BATCH]) {
            if (!(0, fs_1.existsSync)(entryPoint)) {
                throw new Error(`marker installation finished but ${entryPoint} not found`);
            }
        }
        log('marker installed successfully');
    });
}
/**
 * Pick the TORCH_DEVICE value for this machine.
 *
 * Any value other than 'auto' is returned unchanged. 'auto' (the default)
 * resolves to 'mps' on Apple Silicon (darwin + arm64) and to 'cpu' everywhere
 * else; nvidia-smi is not probed.
 *
 * NOTE(review): runMarker in this file always exports the result as
 * TORCH_DEVICE, so the 'cpu' fallback presumably pins inference to the CPU on
 * non-Apple-Silicon hosts instead of leaving CUDA detection to PyTorch —
 * confirm that this is intended.
 *
 * @param {string} [requested='auto'] Explicit device name, or 'auto'.
 * @returns {string} The device name to use.
 */
function resolveDevice(requested = 'auto') {
    if (requested === 'auto') {
        const isAppleSilicon = process.platform === 'darwin' && process.arch === 'arm64';
        return isAppleSilicon ? 'mps' : 'cpu';
    }
    return requested;
}
/**
 * Run marker on a single PDF or a directory of PDFs.
 *
 * A directory input is handed to the batch entry point (`marker`); anything
 * else goes to `marker_single`. The child's stdout and stderr are decoded as
 * UTF-8 and forwarded line by line, including a final line that is not
 * newline-terminated.
 *
 * @param {string} inputPath PDF file or directory to convert.
 * @param {object} opts
 * @param {string} opts.outputDir Passed as `--output_dir`.
 * @param {string} [opts.outputFormat='markdown'] Passed as `--output_format`.
 * @param {boolean} [opts.disableImages] Adds `--disable_image_extraction`.
 * @param {boolean} [opts.skipExisting] Directory input only: adds `--skip_existing`.
 * @param {number} [opts.workers] Directory input only: adds `--workers <n>` when > 0.
 * @param {string} [opts.device='auto'] Resolved via resolveDevice() and exported as TORCH_DEVICE.
 * @param {(line: string) => void} [opts.onLine] Receives each non-empty output
 *   line; defaults to writing the line to process.stdout.
 * @returns {Promise<{exitCode: number}>} The child's exit code; 1 when the child
 *   was terminated by a signal and therefore has no exit code of its own.
 * @throws {Error} If marker is not installed. The promise also rejects when the
 *   child process emits an `error` event (e.g. it could not be spawned).
 */
function runMarker(inputPath, opts) {
    return __awaiter(this, void 0, void 0, function* () {
        const status = yield getMarkerStatus();
        if (!status.installed) {
            throw new Error('marker is not installed. Run `lt tools ocr --install` first.');
        }
        const isDir = (0, fs_1.existsSync)(inputPath) && (0, fs_1.statSync)(inputPath).isDirectory();
        const bin = isDir ? VENV_MARKER_BATCH : VENV_MARKER_SINGLE;
        const outputFormat = opts.outputFormat != null ? opts.outputFormat : 'markdown';
        const args = [inputPath, '--output_dir', opts.outputDir, '--output_format', outputFormat];
        if (opts.disableImages) {
            args.push('--disable_image_extraction');
        }
        if (isDir) {
            // These flags are only passed to the batch entry point.
            if (opts.skipExisting) {
                args.push('--skip_existing');
            }
            if (opts.workers && opts.workers > 0) {
                args.push('--workers', String(opts.workers));
            }
        }
        const device = resolveDevice(opts.device);
        return new Promise((resolve, reject) => {
            const proc = (0, child_process_1.spawn)(bin, args, {
                env: Object.assign(Object.assign({}, process.env), { TORCH_DEVICE: device }),
                stdio: ['ignore', 'pipe', 'pipe'],
            });
            const onLine = opts.onLine != null ? opts.onLine : ((l) => process.stdout.write(`${l}\n`));
            const emit = (line) => {
                if (line) {
                    onLine(line);
                }
            };
            const handleStream = (stream) => {
                let buf = '';
                // Let the stream decode UTF-8 itself so that a multi-byte character
                // split across two chunks is not corrupted.
                stream.setEncoding('utf8');
                stream.on('data', (chunk) => {
                    buf += chunk;
                    const lines = buf.split(/\r?\n/);
                    // split() always yields at least one element; the last one is the
                    // (possibly empty) unfinished line, kept for the next chunk.
                    buf = lines.pop();
                    lines.forEach(emit);
                });
                stream.on('end', () => {
                    // Flush output that did not end with a newline.
                    emit(buf.replace(/\r$/, ''));
                    buf = '';
                });
            };
            handleStream(proc.stdout);
            handleStream(proc.stderr);
            proc.on('close', (code) => {
                // `code` is null when the child was killed by a signal; report that as
                // a failure rather than as a successful exit code 0.
                resolve({ exitCode: code !== null && code !== void 0 ? code : 1 });
            });
            proc.on('error', reject);
        });
    });
}