UNPKG

@lenne.tech/cli

Version:

lenne.Tech CLI: lt

219 lines (218 loc) 9.24 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.getMarkerStatus = getMarkerStatus; exports.installMarker = installMarker; exports.resolveDevice = resolveDevice; exports.runMarker = runMarker; /** * marker-pdf integration for the lt CLI. * * Marker (https://github.com/datalab-to/marker) is a PyTorch-based * PDF → Markdown converter with first-class layout, table and equation * support. On Apple Silicon it leverages Metal Performance Shaders * (MPS) for GPU-accelerated inference. * * The CLI keeps marker in an isolated Python virtualenv under * `~/.lt/marker/.venv/` so that: * - we do not pollute the user's global Python environment * - the ~3 GB of model weights are downloaded only once * - subsequent runs start instantly (cached models) */ const child_process_1 = require("child_process"); const fs_1 = require("fs"); const promises_1 = require("fs/promises"); const os_1 = require("os"); const path_1 = require("path"); const util_1 = require("util"); const execAsync = (0, util_1.promisify)(child_process_1.exec); const MARKER_HOME = (0, path_1.join)((0, os_1.homedir)(), '.lt', 'marker'); const VENV_DIR = (0, path_1.join)(MARKER_HOME, '.venv'); const VENV_BIN = (0, path_1.join)(VENV_DIR, 'bin'); const VENV_PYTHON = (0, path_1.join)(VENV_BIN, 'python3'); const VENV_MARKER_SINGLE = (0, path_1.join)(VENV_BIN, 'marker_single'); const VENV_MARKER_BATCH = (0, path_1.join)(VENV_BIN, 'marker'); /** * Detect tool availability. */ function getMarkerStatus() { return __awaiter(this, void 0, void 0, function* () { const status = { installed: false, pythonAvailable: false, uvAvailable: false, venvPath: VENV_DIR, }; try { yield execAsync('python3 --version'); status.pythonAvailable = true; } catch (_a) { // python3 missing } try { yield execAsync('uv --version'); status.uvAvailable = true; } catch (_b) { // uv missing — we'll fall back to python -m venv + pip } status.installed = (0, fs_1.existsSync)(VENV_MARKER_SINGLE) && (0, fs_1.existsSync)(VENV_MARKER_BATCH); return status; }); } /** * Install marker-pdf into ~/.lt/marker/.venv. * * Preferred path: `uv venv --python 3.12` + `uv pip install marker-pdf psutil`. * Fallback: `python3 -m venv` + `pip install`. */ function installMarker() { return __awaiter(this, arguments, void 0, function* (opts = {}) { var _a; const log = (_a = opts.onProgress) !== null && _a !== void 0 ? _a : (() => { }); const status = yield getMarkerStatus(); if (status.installed) { log('marker already installed'); return; } if (!status.pythonAvailable) { throw new Error('python3 is required but not found in PATH. Install Python 3.10+ (e.g. via Homebrew: `brew install python@3.12`)'); } yield (0, promises_1.mkdir)(MARKER_HOME, { recursive: true }); const useUv = status.uvAvailable; // 1. Create virtualenv if (useUv) { log('Creating venv with uv (Python 3.12)…'); yield execAsync(`uv venv --python 3.12 "${VENV_DIR}"`, { cwd: MARKER_HOME }); } else { log('Creating venv with python3 (uv not found, falling back)…'); yield execAsync(`python3 -m venv "${VENV_DIR}"`, { cwd: MARKER_HOME }); } // 2. Install marker-pdf + psutil // psutil is needed by the marker batch CLI; it is a soft dep on some // marker-pdf releases, so we install it explicitly. // We use shell quoting to handle macOS "Library" / spaces in paths. const cmd = useUv ? `uv pip install --python "${VENV_PYTHON}" marker-pdf psutil` : `"${VENV_BIN}/pip" install marker-pdf psutil`; log('Installing marker-pdf + dependencies (~3 GB models will download on first run)…'); // Increase maxBuffer because pip output is large yield execAsync(cmd, { cwd: MARKER_HOME, maxBuffer: 100 * 1024 * 1024 }); if (!(0, fs_1.existsSync)(VENV_MARKER_SINGLE)) { throw new Error(`marker installation finished but ${VENV_MARKER_SINGLE} not found`); } log('marker installed successfully'); }); } /** * Decide the correct TORCH_DEVICE for this machine. */ function resolveDevice(requested = 'auto') { if (requested !== 'auto') return requested; if (process.platform === 'darwin' && process.arch === 'arm64') return 'mps'; // We don't probe nvidia-smi here — let PyTorch decide CUDA at runtime return 'cpu'; } /** * Run marker on a single PDF or a directory of PDFs. */ function runMarker(inputPath, opts) { return __awaiter(this, void 0, void 0, function* () { var _a; const status = yield getMarkerStatus(); if (!status.installed) { throw new Error('marker is not installed. Run `lt tools ocr --install` first.'); } const isDir = (0, fs_1.existsSync)(inputPath) && (yield Promise.resolve().then(() => __importStar(require('fs')))).statSync(inputPath).isDirectory(); const bin = isDir ? VENV_MARKER_BATCH : VENV_MARKER_SINGLE; const args = []; if (isDir) { args.push(inputPath); } else { args.push(inputPath); } args.push('--output_dir', opts.outputDir); args.push('--output_format', (_a = opts.outputFormat) !== null && _a !== void 0 ? _a : 'markdown'); if (opts.disableImages) args.push('--disable_image_extraction'); if (isDir) { if (opts.skipExisting) args.push('--skip_existing'); if (opts.workers && opts.workers > 0) args.push('--workers', String(opts.workers)); } const device = resolveDevice(opts.device); return new Promise((resolve, reject) => { var _a; const proc = (0, child_process_1.spawn)(bin, args, { env: Object.assign(Object.assign({}, process.env), { TORCH_DEVICE: device }), stdio: ['ignore', 'pipe', 'pipe'], }); const onLine = (_a = opts.onLine) !== null && _a !== void 0 ? _a : ((l) => process.stdout.write(`${l}\n`)); const handleStream = (stream) => { let buf = ''; stream.on('data', (chunk) => { var _a; buf += chunk.toString(); const lines = buf.split(/\r?\n/); buf = (_a = lines.pop()) !== null && _a !== void 0 ? _a : ''; for (const line of lines) if (line) onLine(line); }); }; handleStream(proc.stdout); handleStream(proc.stderr); proc.on('close', (code) => { resolve({ exitCode: code !== null && code !== void 0 ? code : 0 }); }); proc.on('error', reject); }); }); }