UNPKG

esearch-ocr

Version:

paddleocr models run on onnx

1,587 lines (1,381 loc) 59.4 kB
import { Cls } from "./cls";
import {
    newCanvas,
    setCanvas,
    toPaddleInput,
    type SessionType,
    type AsyncType,
    data2canvas,
    resizeImg,
    int,
    tLog,
    clip,
} from "./untils";
import { type Contour, findContours, minAreaRect, type Point } from "./cv";

// Public API. `ocr`, `det` and `rec` are legacy aliases that forward to the
// instance stored by the most recent `init` call.
export {
    setOCREnv,
    init,
    /** @deprecated use return obj from init */
    x as ocr,
    loadImg,
    /** @deprecated use return obj from init */
    Det as det,
    /** @deprecated use return obj from init */
    Rec as rec,
    afAfRec as analyzeLayout,
    initDet,
    initRec,
    initDocDirCls,
    rotateImg,
};
/** Resolved value of `init`: the object exposing `ocr`, `det` and `rec`. */
export type initType = AsyncType<ReturnType<typeof init>>;
export type { OrtOption, InitOcrBase, InitOcrGlobal, detResultType, resultType, loadImgType };

/** Caller-supplied layout hints: a region plus how that region should be treated by layout analysis. */
type ColumnsTip = { box: BoxType; type: "auto" | "ignore" | "table" | "raw" | "raw-blank" }[];

/** The ONNX Runtime module to use and the optional session options forwarded to `InferenceSession.create`. */
type OrtOption = {
    ort: typeof import("onnxruntime-common");
    ortOption?: import("onnxruntime-common").InferenceSession.SessionOptions;
};

/** Options for the text detector. */
type InitDetBase = {
    /** Detection model: a path/URL string or the model bytes. */
    input: string | ArrayBufferLike | Uint8Array;
    /** Scale applied to the source size before it is rounded to a multiple of 32 (defaults to 1). */
    ratio?: number;
    /** Called with the detected boxes after each detection run. */
    on?: (r: detResultType) => void;
};

/** Options for the text recogniser. */
type InitRecBase = {
    /** Recognition model: a path/URL string or the model bytes. */
    input: string | ArrayBufferLike | Uint8Array;
    /** Character dictionary, one entry per line. */
    decodeDic: string;
    /** Height that line images are resized to before recognition (defaults to 48). */
    imgh?: number;
    /** Called after each line is recognised with its index, result and the total number of lines. */
    on?: (index: number, result: { text: string; mean: number }, total: number) => void;
    optimize?: {
        /** Enables the space-recovery heuristic in decoding (defaults to true). */
        space?: boolean;
    };
};

/** Options for the optional document-direction classifier. */
type InitDocClsBase = {
    /** Classifier model: a path/URL string or the model bytes. */
    input: string | ArrayBufferLike | Uint8Array;
};

/** Full option object accepted by `init`. */
type InitOcrBase = {
    det: InitDetBase;
    rec: InitRecBase;
    docCls?: InitDocClsBase;
    analyzeLayout?: {
        docDirs?: ReadingDir[];
        columnsTip?: ColumnsTip;
    };
    dev?: boolean;
    log?: boolean;
    // NOTE(review): the visible pipeline reads `det.ratio`, not this field — confirm whether it is still used.
    detRatio?: number;
} & OrtOption;

/** Legacy environment hooks that used to be passed to `init`. */
type InitOcrGlobal = {
    /** @deprecated use setOCREnv instead */
    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
    canvas?: (w: number, h: number) => any;
    /** @deprecated use setOCREnv instead */
    imageData?;
};

/** Anything `loadImg` can turn into `ImageData`. */
type loadImgType = string | HTMLImageElement | HTMLCanvasElement | ImageData;

/** Detector output: for each text region its box, the cropped image and the estimated colours. */
type detResultType = { box: BoxType; img: ImageData; style: { bg: color; text: color } }[];

/** Raw detector output map together with its dimensions. */
type detDataType = {
    data: AsyncType<ReturnType<typeof runDet>>["data"];
width: number;
    height: number;
};

/** A 2-D point stored as `[x, y]`. */
type pointType = [number, number];
/** A quadrilateral given by its four corner points. */
type BoxType = [pointType, pointType, pointType, pointType];
type pointsType = pointType[];
/** Recognised lines: text, mean confidence, box and estimated colours. */
type resultType = { text: string; mean: number; box: BoxType; style: { bg: color; text: color } }[];
type ReadingDirPart = "lr" | "rl" | "tb" | "bt";
/** Reading direction split into block progression and inline progression. */
type ReadingDir = {
    block: ReadingDirPart;
    inline: ReadingDirPart;
};

// Step markers (presumably timing loggers — see `tLog`).
const task = new tLog("t");
const task2 = new tLog("af_det");
// Keep the real `l` implementations so `setOCREnv` can switch them back on.
const originalTaskLog = task.l.bind(task);
const originalTask2Log = task2.l.bind(task2);

/** Dev mode: intermediate images are appended to the page and step markers are emitted. */
let dev = false;
/** Whether `log` / `logSrc` / `logColor` print anything. */
let canlog = false;
/** Instance created by the most recent `init`; used by the deprecated `ocr`/`det`/`rec` exports. */
let globalOCR: AsyncType<ReturnType<typeof initOCR>> | null = null;

/**
 * Debug helper: copies `img` onto a fresh `<canvas>` and appends it to the document body.
 * @param img canvas to display
 * @param id optional DOM id for the new canvas
 */
function putImgDom(img: OffscreenCanvas, id?: string) {
    const canvas = document.createElement("canvas");
    canvas.width = img.width;
    canvas.height = img.height;
    canvas.getContext("2d")!.drawImage(img, 0, 0);
    if (id) canvas.id = id;
    try {
        document?.body?.append(canvas);
    } catch (error) {
        // Best effort: this is debug output only.
    }
}

/** `ImageData` factory; replaceable through `setOCREnv({ imageData })` for runtimes without a global `ImageData`. */
let createImageData = (data: Uint8ClampedArray, w: number, h: number) => {
    return new ImageData(data, w, h);
};

/** `console.log` that is silent unless logging is enabled. */
function log(...args: any[]) {
    if (canlog) console.log(...args);
}

/** Like `log`, but prints structured clones so later mutation does not change what was logged. */
function logSrc(...args: any[]) {
    if (canlog) console.log(...args.map((i) => structuredClone(i)));
}

/** Logs each CSS colour string rendered in its own colour. */
function logColor(...args: string[]) {
    if (canlog) {
        console.log(args.map((x) => `%c${x}`).join(""), ...args.map((x) => `color: ${x}`));
    }
}

/**
 * Creates the OCR pipeline and remembers it for the deprecated global helpers.
 *
 * Accepts either the current option shape (`InitOcrBase`) or the legacy flat shape,
 * which is mapped onto the current one.
 * @returns the object produced by `initOCR` (`ocr`, `det`, `rec`)
 */
async function init(
    op:
        | InitOcrBase
        | ({
              /** @deprecated use det.input */
              detPath: string;
              /** @deprecated use rec.input */
              recPath: string;
              /** @deprecated */
              layoutPath?: string;
              /** @deprecated use docCls.input */
              docClsPath?: string;
              /** @deprecated use rec.decodeDic */
              dic: string;
              /** @deprecated */
              layoutDic?: string;
              /** @deprecated use analyzeLayout.docDirs */
              docDirs?: ReadingDir[];
              /** @deprecated use analyzeLayout.columnsTip */
              columnsTip?: ColumnsTip;
              dev?: boolean;
              log?: boolean;
              /** @deprecated use rec.imgh */
              imgh?: number;
              /** @deprecated use det.ratio */
              detRatio?: number;
              /** @deprecated use det.on and rec.on */
              onProgress?: (type: "det" | "rec", total: number, count: number) => void;
              /** @deprecated use det.on */
              onDet?: (r: detResultType) => void;
              /** @deprecated use rec.on */
              onRec?: (index: number, result: { text: string; mean: number }) => void;
          } & InitOcrGlobal &
              OrtOption),
) {
    // Backward compatibility with the legacy option object.
    setOCREnv(op);
    const xop: InitOcrBase = {
        det:
            "det" in op
                ? op.det
                : {
                      input: op.detPath,
                      ratio: op.detRatio,
                      on: async (r) => {
                          if (op.onDet) op.onDet(r);
                          if (op.onProgress) op.onProgress("det", 1, 1);
                      },
                  },
        rec:
            "rec" in op
                ? op.rec
                : {
                      input: op.recPath,
                      decodeDic: op.dic,
                      imgh: op.imgh,
                      on: async (index, result, t) => {
                          if (op.onRec) op.onRec(index, result);
                          if (op.onProgress) op.onProgress("rec", t, index + 1);
                      },
                  },
        docCls:
            "rec" in op
                ? op.docCls
                : op.docClsPath
                  ? {
                        input: op.docClsPath,
                    }
                  : undefined,
        analyzeLayout:
            "rec" in op
                ? op.analyzeLayout
                : {
                      columnsTip: op.columnsTip,
                      docDirs: op.docDirs,
                  },
        // Spread last so fields supplied in the current shape win over the mapped ones.
        ...op,
    };
    const ocr = await initOCR(xop);
    globalOCR = ocr;
    return ocr;
}

/**
 * Configures the module-wide environment.
 * @param op.dev enables dev mode (debug canvases, step markers); also implies logging
 * @param op.log enables `log` output
 * @param op.canvas canvas factory handed to `setCanvas`
 * @param op.imageData replacement `ImageData` factory
 */
function setOCREnv(op: {
    dev?: boolean;
    log?: boolean;
    // biome-ignore lint/suspicious/noExplicitAny: <explanation>
    canvas?: (w: number, h: number) => any;
    imageData?;
}) {
    dev = Boolean(op.dev);
    canlog = dev || Boolean(op.log);
    // Restore the real markers when dev mode is (re-)enabled instead of leaving
    // them permanently replaced by no-ops after the first non-dev call.
    task.l = dev ? originalTaskLog : () => {};
    task2.l = dev ? originalTask2Log : () => {};
    if (op.canvas) setCanvas(op.canvas);
    if (op.imageData) createImageData = op.imageData;
}

/**
 * Normalises an image source to `ImageData`.
 *
 * Without a global `window` the source must already be an `ImageData`-like object.
 * @throws Error("invalid image data") when no `window` exists and `src` lacks data/width/height
 * @throws Error when a URL source fails to load, or a 2D canvas context is unavailable
 */
async function loadImg(src: loadImgType) {
    if (typeof window === "undefined") {
        const x = src as ImageData;
        if (!x.data || !x.width || !x.height) throw new Error("invalid image data");
        return x;
    }

    let img: HTMLImageElement | HTMLCanvasElement | ImageData;
    if (typeof src === "string") {
        const el = new Image();
        await new Promise<void>((resolve, reject) => {
            // Attach both handlers before assigning `src`; reject on failure so a
            // broken URL surfaces as an error instead of a promise that never settles.
            el.onload = () => resolve();
            el.onerror = () => reject(new Error("failed to load image"));
            el.src = src;
        });
        img = el;
    } else {
        img = src;
    }

    if (img instanceof HTMLImageElement) {
        const canvas = newCanvas(img.naturalWidth, img.naturalHeight);
        const ctx = canvas.getContext("2d");
        if (!ctx) throw new Error("canvas context is null");
        ctx.drawImage(img, 0, 0);
        img = ctx.getImageData(0, 0, img.naturalWidth, img.naturalHeight);
    }
    if (img instanceof HTMLCanvasElement) {
        const ctx = img.getContext("2d");
        if (!ctx) throw new Error("canvas context is null");
        img = ctx.getImageData(0, 0, img.width, img.height);
    }
    return img;
}

/**
 * Verifies that a canvas and an `ImageData` can be created in this runtime.
 * Prints a hint and rethrows when they cannot.
 */
function checkNode() {
    try {
        newCanvas(1, 1);
        createImageData(new Uint8ClampedArray(4), 1, 1);
    } catch (error) {
        console.log("nodejs need set canvas, please use setOCREnv to set canvas and imageData");
        throw error;
    }
}

/** Legacy global entry point: full OCR with the instance from the last `init`. */
async function x(i: loadImgType) {
    if (!globalOCR) throw new Error("need init");
    return globalOCR.ocr(i);
}

/** Legacy global entry point: detection only. */
async function Det(s: ImageData) {
    if (!globalOCR) throw new Error("need init");
    return globalOCR.det(s);
}

/** Legacy global entry point: recognition only. */
async function Rec(box: detResultType) {
    if (!globalOCR) throw new Error("need init");
    return globalOCR.rec(box);
}

/**
 * Main routine: loads the models and returns the pipeline.
 *
 * `ocr` runs load → optional direction classification and rotation → detection →
 * recognition → layout analysis; `det` and `rec` expose the individual stages.
 */
async function initOCR(op: InitOcrBase) {
    checkNode();
    const ortO: OrtOption = {
        ort: op.ort,
        ortOption: op.ortOption,
    };
    const docCls = op.docCls ? await initDocDirCls({ ...op.docCls, ...ortO }) : undefined;
    const det = await initDet({ ...op.det, ...ortO });
    const rec = await initRec({ ...op.rec, ...ortO });
    return {
        ocr: async (srcimg: loadImgType) => {
            let img = await loadImg(srcimg);
            let dir = 0;
            if (docCls) {
                dir = await docCls.docCls(img);
                log("dir", dir);
                // Rotate by the complement of the classified direction.
                img = rotateImg(img, 360 - dir);
            }
            const box = await det.det(img);
            const mainLine = await rec.rec(box);
            const newMainLine = afAfRec(mainLine, op.analyzeLayout);
            log(mainLine, newMainLine);
            task.l("end");
            return { src: mainLine, ...newMainLine, docDir: dir };
        },
        det: det.det,
        rec: rec.rec,
    };
}

/**
 * Creates an ONNX Runtime inference session from a model path or model bytes.
 * @param ort the ONNX Runtime module
 * @param input model path/URL, or the model as a buffer
 * @param ortOptions session options forwarded unchanged
 */
function initOrtModel(
    ort: OrtOption["ort"],
    input: string | ArrayBufferLike | Uint8Array,
    ortOptions?: OrtOption["ortOption"],
) {
    // The two calls are textually identical; presumably the narrowing is kept so each
    // call resolves against a single `create` overload.
    if (typeof input === "string") {
        return ort.InferenceSession.create(input, ortOptions);
    }
    return ort.InferenceSession.create(input, ortOptions);
}

/**
 * Loads the document-direction classifier.
 * @returns `{ docCls }`, where `docCls(img)` classifies among 0/90/180/270
 */
async function initDocDirCls(op: InitDocClsBase & OrtOption) {
    const cls = await initOrtModel(op.ort, op.input, op.ortOption);
    const docCls = async (img: ImageData) => {
return Cls(img, op.ort, cls, [0, 90, 180, 270], 224, 224);
    };
    return { docCls };
}

/**
 * Loads the detection model.
 * @returns `{ det }`, where `det(image)` yields the detected text regions
 */
async function initDet(op: InitDetBase & OrtOption) {
    checkNode();
    const det = await initOrtModel(op.ort, op.input, op.ortOption);
    const detRatio = op.ratio ?? 1;

    async function Det(srcimg: ImageData) {
        const img = srcimg;
        if (dev) {
            const srcCanvas = data2canvas(img);
            putImgDom(srcCanvas);
        }
        task.l("pre_det");
        const { data: beforeDetData, width: resizeW, height: resizeH } = beforeDet(img, detRatio);
        const { transposedData, image } = beforeDetData;
        task.l("det");
        const detResults = await runDet(transposedData, image, det, op.ort);
        task.l("aft_det");
        // The last two output dims are used as the map's height and width.
        const box = afterDet(
            { data: detResults.data, width: detResults.dims[3], height: detResults.dims[2] },
            resizeW,
            resizeH,
            img,
        );
        op.on?.(box);
        return box;
    }

    return { det: Det };
}

/**
 * Loads the recognition model and prepares the character dictionary.
 * @returns `{ rec }`, where `rec(boxes)` recognises every region in order and
 *          drops results whose mean confidence is below 0.5
 */
async function initRec(op: InitRecBase & OrtOption) {
    checkNode();
    const rec = await initOrtModel(op.ort, op.input, op.ortOption);

    // One dictionary entry per line; the final slot is always a space.
    const dic = op.decodeDic.split(/\r\n|\r|\n/);
    if (dic.at(-1) === "") {
        // A trailing newline left an empty entry: reuse that slot.
        dic[dic.length - 1] = " ";
    } else {
        dic.push(" ");
    }
    const imgh = op.imgh || 48;
    const opmSpace = op.optimize?.space ?? true;

    async function Rec(box: detResultType) {
        const mainLine: resultType = [];
        task.l("bf_rec");
        const recL = beforeRec(box, imgh);
        // Lines are recognised one after another so `on` reports them in order.
        for (const [index, item] of recL.entries()) {
            const { b, imgH, imgW } = item;
            const recResults = await runRec(b, imgH, imgW, rec, op.ort);
            const result = afterRec(recResults, dic, { opm: { space: opmSpace } })[0];
            mainLine.push({
                text: result.text,
                mean: result.mean,
                box: box[index].box,
                style: box[index].style,
            });
            op.on?.(index, result, box.length);
        }
        task.l("rec_end");
        return mainLine.filter((x) => x.mean >= 0.5) as resultType;
    }

    return { rec: Rec };
}

/**
 * Runs the detection session on one prepared image.
 * @param transposedData nested number data; flattened into a `[1, 3, height, width]` float32 tensor
 * @param image the resized image whose dimensions define the tensor shape
 * @returns the session's first output
 */
async function runDet(transposedData: number[][][], image: ImageData, det: SessionType, ort: OrtOption["ort"]) {
    const detData = Float32Array.from(transposedData.flat(3));
    const detTensor = new ort.Tensor("float32", detData, [1, 3, image.height, image.width]);
    const detFeed = { [det.inputNames[0]]: detTensor };
    const detResults = await det.run(detFeed);
    return detResults[det.outputNames[0]];
}

/**
 * Runs the recognition session on one prepared line image.
 * @param b nested number data; flattened into a `[1, 3, imgH, imgW]` float32 tensor
 * @returns the session's first output
 */
async function runRec(b: number[][][], imgH: number, imgW: number, rec: SessionType, ort: OrtOption["ort"]) {
    const recData = Float32Array.from(b.flat(3));
    const recTensor = new ort.Tensor("float32", recData, [1, 3, imgH, imgW]);
    const recFeed = { [rec.inputNames[0]]: recTensor };
    const recResults = await rec.run(recFeed);
    return recResults[rec.outputNames[0]];
}

/**
 * Prepares an image for detection: scales the size by `detRatio`, rounds each side
 * to a multiple of 32 (at least 32), resizes in "fill" mode and converts to model input.
 */
function beforeDet(srcImg: ImageData, detRatio: number) {
    const resizeH = Math.max(Math.round((srcImg.height * detRatio) / 32) * 32, 32);
    const resizeW = Math.max(Math.round((srcImg.width * detRatio) / 32) * 32, 32);
    if (dev) {
        const srcCanvas = data2canvas(srcImg);
        putImgDom(srcCanvas);
    }
    const image = resizeImg(srcImg, resizeW, resizeH, "fill");
    const transposedData = toPaddleInput(image, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]);
    log(image);
    if (dev) {
        const srcCanvas = data2canvas(image);
        putImgDom(srcCanvas);
    }
    return { data: { transposedData, image }, width:
resizeW, height: resizeH };
}

/**
 * Turns the detector's output map into text regions.
 *
 * Thresholds the map at 0.3, finds contours, fits and expands a box per contour,
 * maps it back to source coordinates, crops it and estimates its colours.
 * @param dataSet output map with its width and height
 * @param _resizeW width the source was resized to
 * @param _resizeH height the source was resized to
 * @param srcData the original image
 * @returns one entry per accepted region
 */
function afterDet(dataSet: detDataType, _resizeW: number, _resizeH: number, srcData: ImageData) {
    task2.l("");
    // Account for the "fill" resize mode: a source smaller than the target is left as is.
    const w = Math.min(srcData.width, _resizeW);
    const h = Math.min(srcData.height, _resizeH);
    const { data, width, height } = dataSet;

    // Binarise once; the dev preview below reuses the same mask.
    const bitData = new Uint8Array(width * height);
    for (let i = 0; i < data.length; i++) {
        bitData[i] = (data[i] as number) > 0.3 ? 255 : 0;
    }
    if (dev) {
        const clipData = new Uint8ClampedArray(width * height * 4);
        for (let i = 0; i < data.length; i++) {
            const n = i * 4;
            clipData[n] = clipData[n + 1] = clipData[n + 2] = bitData[i];
            clipData[n + 3] = 255;
        }
        const myImageData = createImageData(clipData, width, height);
        const srcCanvas = data2canvas(myImageData);
        putImgDom(srcCanvas, "det_ru");
    }

    task2.l("edge");
    const edgeRect: detResultType = [];

    // findContours takes the mask as rows of numbers.
    const src2: number[][] = [];
    for (let y = 0; y < height; y++) {
        src2.push(Array.from(bitData.slice(y * width, y * width + width)));
    }
    const contours2: Point[][] = [];
    findContours(src2, contours2);

    if (dev) {
        const xctx = (document.querySelector("#det_ru") as HTMLCanvasElement).getContext("2d")!;
        for (const item of contours2) {
            xctx.moveTo(item[0].x, item[0].y);
            for (const p of item) {
                xctx.lineTo(p.x, p.y);
            }
            xctx.strokeStyle = "red";
            xctx.closePath();
            xctx.stroke();
        }
    }

    // Loop invariants: size threshold and map → source scale factors.
    const minSize = 3;
    const rx = srcData.width / w;
    const ry = srcData.height / h;

    for (let i = 0; i < contours2.length; i++) {
        task2.l("get_box");
        const l: Contour = contours2[i];
        const { points, sside } = getMiniBoxes(l);
        if (sside < minSize) continue;
        // TODO sort fast
        const resultObj = unclip2(points);
        const box = resultObj.points;
        if (resultObj.sside < minSize + 2) {
            continue;
        }

        for (const p of box) {
            p[0] *= rx;
            p[1] *= ry;
        }

        task2.l("order");
        // box1 holds the same point arrays as `box`, so the rounding and clamping
        // below is also visible through `box`.
        const box1 = orderPointsClockwise(box);
        for (const item of box1) {
            item[0] = clip(Math.round(item[0]), 0, srcData.width);
            item[1] = clip(Math.round(item[1]), 0, srcData.height);
        }
        const rect_width = int(linalgNorm(box1[0], box1[1]));
        const rect_height = int(linalgNorm(box1[0], box1[3]));
        if (rect_width <= 3 || rect_height <= 3) continue;

        drawBox(box, "", "red", "det_ru");

        task2.l("crop");
        const c = getRotateCropImage(srcData, box);

        task2.l("match best");
        const { bg, text } = getImgColor(c);
        const bb = matchBestBox(box, c, text);

        edgeRect.push({ box: bb, img: c, style: { bg, text } });
    }
    task2.l("e");

    log(edgeRect);

    return edgeRect;
}

/**
 * Signed area of a polygon (shoelace formula); the sign depends on vertex order.
 * @param polygon vertices in order
 */
function polygonPolygonArea(polygon: pointsType) {
    let i = -1;
    const n = polygon.length;
    let a: pointType;
    let b = polygon[n - 1];
    let area = 0;

    while (++i < n) {
        a = b;
        b = polygon[i];
        area += a[1] * b[0] - a[0] * b[1];
    }

    return area / 2;
}

/**
 * Perimeter of a closed polygon.
 * @param polygon vertices in order; the last vertex connects back to the first
 */
function polygonPolygonLength(polygon: pointsType) {
    let i = -1;
    const n = polygon.length;
    let b = polygon[n - 1];
    let xa: number;
    let ya: number;
    let xb = b[0];
    let yb = b[1];
    let perimeter = 0;

    while (++i < n) {
        xa = xb;
        ya = yb;
        b = polygon[i];
        xb = b[0];
        yb = b[1];
        xa -= xb;
        ya -= yb;
        perimeter += Math.hypot(xa, ya);
    }

    return perimeter;
}

/**
 * Expands a box outward: every corner is pushed away from both neighbouring corners
 * by `area * 1.5 / perimeter` along each adjacent edge direction.
 * @param box corner points in order
 * @returns the expanded corners plus a size measure of the expanded box
 */
function unclip2(box: pointsType) {
    const unclip_ratio = 1.5;
    const area = Math.abs(polygonPolygonArea(box));
    const length = polygonPolygonLength(box);
    const distance = (area * unclip_ratio) / length;

    const n = box.length;
    const expandedArr: pointType[] = [];
    for (const [i, p] of box.entries()) {
        // `(i - 1) % n` is -1 for i = 0, which `at` resolves to the last point.
        const lastPoint = box.at((i - 1) % n)!;
        const nextPoint = box.at((i + 1) % n)!;

        const x1 = p[0] - lastPoint[0];
        const y1 = p[1] - lastPoint[1];
        const d1 = Math.sqrt(x1 ** 2 + y1 ** 2);
        const dx1 = (x1 / d1) * distance;
        const dy1 = (y1 / d1) * distance;

        const x2 = p[0] - nextPoint[0];
        const y2 = p[1] - nextPoint[1];
        const d2 = Math.sqrt(x2 ** 2 + y2 ** 2);
        const dx2 = (x2 / d2) * distance;
        const dy2 = (y2 / d2) * distance;

        expandedArr.push([p[0] + dx1 + dx2, p[1] + dy1 + dy2]);
    }

    const v1 = [expandedArr[0][0] - expandedArr[1][0], expandedArr[0][1] - expandedArr[1][1]];
    const v2 = [expandedArr[2][0] - expandedArr[1][0],
expandedArr[2][1] - expandedArr[1][1]];
    // v1 and v2 are the two edges meeting at corner 1. Report the shorter edge length:
    // the caller compares `sside` against a side-length threshold, whereas the cross
    // product used previously is the box area.
    const sside = Math.min(Math.hypot(v1[0], v1[1]), Math.hypot(v2[0], v2[1]));
    return { points: expandedArr as BoxType, sside };
}

/**
 * Corner points of a rotated rectangle.
 * @param center rectangle centre
 * @param size full width and height
 * @param angle rotation in degrees
 * @returns corners in the order top-left, top-right, bottom-right, bottom-left of the unrotated rectangle
 */
function boxPoints(center: { x: number; y: number }, size: { width: number; height: number }, angle: number) {
    const width = size.width;
    const height = size.height;

    const theta = (angle * Math.PI) / 180.0;
    const cosTheta = Math.cos(theta);
    const sinTheta = Math.sin(theta);

    const cx = center.x;
    const cy = center.y;

    const dx = width * 0.5;
    const dy = height * 0.5;

    const rotatedPoints: [number, number][] = [];

    // Top-Left
    const x1 = cx - dx * cosTheta + dy * sinTheta;
    const y1 = cy - dx * sinTheta - dy * cosTheta;
    rotatedPoints.push([x1, y1]);

    // Top-Right
    const x2 = cx + dx * cosTheta + dy * sinTheta;
    const y2 = cy + dx * sinTheta - dy * cosTheta;
    rotatedPoints.push([x2, y2]);

    // Bottom-Right
    const x3 = cx + dx * cosTheta - dy * sinTheta;
    const y3 = cy + dx * sinTheta + dy * cosTheta;
    rotatedPoints.push([x3, y3]);

    // Bottom-Left
    const x4 = cx - dx * cosTheta - dy * sinTheta;
    const y4 = cy - dx * sinTheta + dy * cosTheta;
    rotatedPoints.push([x4, y4]);

    return rotatedPoints;
}

/**
 * Minimum-area bounding box of a contour.
 * @param contour contour points
 * @returns `points`: corners ordered left-upper, right-upper, right-lower, left-lower
 *          (smaller y counts as upper); `sside`: the shorter side of the rectangle
 */
function getMiniBoxes(contour: Point[]) {
    const boundingBox = minAreaRect(contour);
    // `boxPoints` returns a fresh array, so it can be sorted in place.
    // After sorting by x, the first two corners form the left pair and the last two the right pair.
    const points = boxPoints(boundingBox.center, boundingBox.size, boundingBox.angle).sort(
        (a, b) => a[0] - b[0],
    ) as pointsType;

    const [leftUpper, leftLower] = points[1][1] > points[0][1] ? [points[0], points[1]] : [points[1], points[0]];
    const [rightUpper, rightLower] = points[3][1] > points[2][1] ? [points[2], points[3]] : [points[3], points[2]];

    const box = [leftUpper, rightUpper, rightLower, leftLower] as BoxType;
    const side = Math.min(boundingBox.size.height, boundingBox.size.width);
    return { points: box, sside: side };
}

/** Flattens one level of nesting. */
function flatten(arr: number[] | number[][]) {
    return arr.flat();
}

/** Euclidean distance between two points. */
function linalgNorm(p0: pointType, p1: pointType) {
    return Math.sqrt((p0[0] - p1[0])
** 2 + (p0[1] - p1[1]) ** 2);
}

/**
 * Orders four corners as top-left, top-right, bottom-right, bottom-left.
 *
 * Top-left has the smallest `x + y` and bottom-right the largest. Of the remaining two
 * points, the one with the smaller `y - x` becomes top-right and the other bottom-left.
 * The returned box holds the same point arrays as `pts`.
 */
function orderPointsClockwise(pts: BoxType) {
    const rect: BoxType = [
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
    ];
    const s = pts.map((pt) => pt[0] + pt[1]);
    rect[0] = pts[s.indexOf(Math.min(...s))];
    rect[2] = pts[s.indexOf(Math.max(...s))];
    const tmp = pts.filter((pt) => pt !== rect[0] && pt !== rect[2]);
    // diff = [dx, dy] between the two remaining points. `dx <= dy` holds exactly when
    // tmp[0] has the smaller `y - x`, so the index of the minimum selects the top-right
    // point and the index of the maximum the bottom-left one.
    const diff = tmp[1].map((e, i) => e - tmp[0][i]);
    rect[1] = tmp[diff.indexOf(Math.min(...diff))];
    rect[3] = tmp[diff.indexOf(Math.max(...diff))];
    return rect;
}

/**
 * Crops a (possibly rotated) box out of an image and straightens it.
 *
 * Builds the affine transform that sends `points[0]` to (0, 0), `points[1]` to
 * (width, 0) and `points[3]` to (0, height), then draws the image through it.
 * @param img source image
 * @param points box corners; only corners 0, 1 and 3 are used
 * @throws Error when corners 0, 1 and 3 are collinear
 */
function getRotateCropImage(img: ImageData, points: BoxType) {
    // TODO: crop along a curve
    const [p0, p1, , p3] = points.map((p) => ({ x: p[0], y: p[1] }));

    // Size of the straightened output.
    const width = Math.sqrt((p1.x - p0.x) ** 2 + (p1.y - p0.y) ** 2);
    const height = Math.sqrt((p3.x - p0.x) ** 2 + (p3.y - p0.y) ** 2);

    // Transform matrix parameters.
    const dx1 = p1.x - p0.x;
    const dy1 = p1.y - p0.y;
    const dx3 = p3.x - p0.x;
    const dy3 = p3.y - p0.y;

    const determinant = dx1 * dy3 - dx3 * dy1;
    if (determinant === 0) throw new Error("点共线,无法形成矩形");

    const a = (width * dy3) / determinant;
    const c = (-dx3 * width) / determinant;
    const b = (-height * dy1) / determinant;
    const d = (dx1 * height) / determinant;
    const e = -a * p0.x - c * p0.y;
    const f = -b * p0.x - d * p0.y;

    const inputCanvas = data2canvas(img);

    // Output canvas.
    const outputCanvas = newCanvas(Math.ceil(width), Math.ceil(height));
    const ctx = outputCanvas.getContext("2d")!;

    // Apply the transform and draw.
    ctx.setTransform(a, b, c, d, e, f);
    ctx.drawImage(inputCanvas, 0, 0);

    // Reset the transform for any later use of the context.
    ctx.resetTransform();

    return ctx.getImageData(0, 0, outputCanvas.width, outputCanvas.height);
}

/** RGB colour. */
type color = [number, number, number];

/**
 * Estimates the background and text colour of a cropped line image from its most frequent colours.
 * @returns `bg`: most frequent colour; `textEdge`: second most frequent; `text`: the
 *          text colour estimate, which equals `textEdge` unless that is too close to `bg`
 */
function getImgColor(img: ImageData) {
    const histogram = new Map<string, number>();
    const data = img.data;

    for (let i = 0; i < data.length; i += 4) {
        // Only columns up to 4 × height are sampled.
        const x = (i / 4) % img.width;
        if (x > img.height * 4) continue;
        const r = data[i];
        const g = data[i + 1];
        const b = data[i + 2];
        const colorKey = [r, g, b].join(",");
        histogram.set(colorKey, (histogram.get(colorKey) || 0) + 1);
    }

    const colorList = getHighestFrequency(histogram, 20).map((c) => ({
        el: c.el.split(",").map(Number) as color,
        count: c.count,
    }));
    const bg = colorList.at(0)?.el || [255, 255, 255];
    const textEdge = colorList.at(1)?.el || [0, 0, 0];
    let text = textEdge;

    const colorD = 100;
    if (areColorsSimilar(textEdge, bg) < colorD) {
        // The runner-up is too close to the background: combine the frequent colours
        // that differ from it, and fall back to the inverted background.
        const colorSplit = colorList.slice(1).filter((c) => areColorsSimilar(c.el, bg) > 50);
        if (colorSplit.length > 0) {
            // Per-channel average (presumably weighted by count — see `average2`).
            text = [0, 1, 2].map((i) =>
                Math.round(average2(colorSplit.map((c) => [c.el[i], c.count] as [number, number]))),
            ) as color;
        }
        if (colorSplit.length === 0 || areColorsSimilar(text, bg) < colorD) text = bg.map((x) => 255 - x) as color;

        logColor(`rgb(${text.join(",")})`);
    }

    return {
        bg: bg,
        text: text,
        textEdge: textEdge,
    };
}

/**
 * Euclidean distance between two colours in RGB space.
 * Despite the name this returns a distance, so smaller means more similar.
 */
function areColorsSimilar(color1: color, color2: color) {
    const rgb1 = color1;
    const rgb2 = color2;

    const distance = Math.sqrt((rgb1[0] - rgb2[0]) ** 2 + (rgb1[1] - rgb2[1]) ** 2 + (rgb1[2] - rgb2[2]) ** 2);

    return distance;
}

/**
 * Returns the `c` entries with the highest counts, highest first.
 * Entries with equal counts keep the map's insertion order.
 * @param map value → count
 * @param c maximum number of entries to return (default 1); values ≤ 0 yield an empty list
 */
function getHighestFrequency<t>(map: Map<t, number>, c = 1): { el: t; count: number }[] {
    if (c <= 0) return [];
    // A single stable sort replaces re-sorting the running top list on every entry.
    return Array.from(map, ([el, count]) => ({ el, count }))
        .sort((a, b) => b.count - a.count)
        .slice(0, c);
}

/**
 * Tightens a box around the pixels that look like text.
 *
 * Scans the crop from each side for the first row/column containing a pixel within
 * distance 200 of `textEdgeColor`, then moves each edge inward by the blank margin
 * found, limited to 0–4 px per side.
 * @param box box in source coordinates
 * @param img the crop belonging to `box`
 * @param textEdgeColor colour regarded as text
 */
function matchBestBox(box: BoxType, img: ImageData, textEdgeColor: color) {
    let yFromTop = 0;
    let yFromBottom = img.height;
    let xFromLeft = 0;
    let xFromRight = img.width;

    function match(pix: color) {
        return areColorsSimilar(pix, textEdgeColor) < 200;
    }

    yt: for (let y = yFromTop; y < img.height; y++) {
        for (let x = 0; x < img.width; x++) {
            const pix = getImgPix(img, x, y);
            if (match(pix)) {
                yFromTop = y;
                break yt;
            }
        }
    }

    yb: for (let y = yFromBottom - 1; y >= 0; y--) {
        for (let x = 0; x < img.width; x++) {
            const pix = getImgPix(img, x, y);
            if (match(pix)) {
                yFromBottom = y;
                break yb;
            }
        }
}

    xl: for (let x = xFromLeft; x < img.width; x++) {
        for (let y = yFromTop; y <= yFromBottom; y++) {
            const pix = getImgPix(img, x, y);
            if (match(pix)) {
                xFromLeft = x;
                break xl;
            }
        }
    }

    xr: for (let x = xFromRight - 1; x >= 0; x--) {
        for (let y = yFromTop; y <= yFromBottom; y++) {
            const pix = getImgPix(img, x, y);
            if (match(pix)) {
                xFromRight = x;
                break xr;
            }
        }
    }

    const dyT = clip(yFromTop - 1, 0, 4);
    const dyB = clip(img.height - yFromBottom - 1, 0, 4);
    const dxL = clip(xFromLeft - 1, 0, 4);
    const dxR = clip(img.width - xFromRight - 1, 0, 4);

    const newBox = [
        [box[0][0] + dxL, box[0][1] + dyT],
        [box[1][0] - dxR, box[1][1] + dyT],
        [box[2][0] - dxR, box[2][1] - dyB],
        [box[3][0] + dxL, box[3][1] - dyB],
    ] as BoxType;

    return newBox;
}

/**
 * Reads the RGB value of one pixel.
 * @returns `[r, g, b]`; the alpha channel is not included
 */
function getImgPix(img: ImageData, x: number, y: number) {
    const index = (y * img.width + x) * 4;
    const d = img.data;
    // Direct indexing avoids allocating a typed-array slice for every pixel.
    return [d[index], d[index + 1], d[index + 2]] as color;
}

/**
 * Prepares detected regions for recognition: rotates upright crops so they are
 * landscape, resizes each to height `imgH` keeping its aspect ratio, and converts it to model input.
 * @param box detected regions (only `img` is read)
 * @param imgH target height
 * @returns per region the model input plus its height and width
 */
function beforeRec(box: { box: BoxType; img: ImageData }[], imgH: number) {
    const l: { b: number[][][]; imgH: number; imgW: number }[] = [];

    function resizeNormImg(img: ImageData) {
        const w = Math.floor(imgH * (img.width / img.height));
        const d = resizeImg(img, w, imgH, undefined, false);
        if (dev) putImgDom(data2canvas(d, w, imgH));
        return { data: d, w, h: imgH };
    }

    for (const r of box) {
        let img = r.img;
        // The model only accepts landscape images.
        if (img.width < img.height) {
            img = rotateImg(img, -90);
        }
        const reImg = resizeNormImg(img);
        l.push({ b: toPaddleInput(reImg.data, [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), imgH: reImg.h, imgW: reImg.w });
    }
    log(l);
    return l;
}

/**
 * Decodes recognition output into text.
 *
 * The output is read as `dims[0]` lines of `dims[1]` steps with `dims[2]` scores each.
 * Every step takes its best-scoring index; index 0 is skipped as "no character" and
 * consecutive repeats are collapsed.
 * @param data recognition output tensor
 * @param character dictionary; index `i` maps to `character[i - 1]`
 * @param op.opm.space when true, a step whose best index is 0 is replaced by a space
 *        if the runner-up is a space with a score above 0.001
 * @returns per line the text and the mean score of the kept characters
 */
function afterRec(
    data: AsyncType<ReturnType<typeof runRec>>,
    character: string[],
    op: {
        opm: {
            space: boolean;
        };
    },
) {
    const predLen = data.dims[2];
    const line: { text: string; mean: number }[] = [];
    // Results are written from the last slot backwards.
    let ml = data.dims[0] - 1;

    function getChar(i: number) {
        return character.at(i - 1) ?? "";
    }

    for (let l = 0; l < data.data.length; l += predLen * data.dims[1]) {
        const predsIdx: number[] = [];
        const predsProb: number[] = [];

        for (let i = l; i < l + predLen * data.dims[1]; i += predLen) {
            const tmpArr = data.data.slice(i, i + predLen) as Float32Array;

            let tmpMax = Number.NEGATIVE_INFINITY;
            let tmpIdx = -1;
            let tmpSecond = Number.NEGATIVE_INFINITY;
            let tmpSecondI = -1;

            for (let j = 0; j < tmpArr.length; j++) {
                const currentValue = tmpArr[j];
                if (currentValue > tmpMax) {
                    // The previous best becomes the runner-up; carry its index along
                    // so the score and index stay in sync.
                    tmpSecond = tmpMax;
                    tmpSecondI = tmpIdx;
                    tmpMax = currentValue;
                    tmpIdx = j;
                } else if (currentValue > tmpSecond && currentValue < tmpMax) {
                    tmpSecond = currentValue;
                    tmpSecondI = j;
                }
            }
            if (op.opm.space) {
                if (tmpIdx === 0 && getChar(tmpSecondI) === " " && tmpSecond > 0.001) {
                    tmpMax = tmpSecond;
                    tmpIdx = tmpSecondI;
                }
            }

            predsProb.push(tmpMax);
            predsIdx.push(tmpIdx);
        }
        line[ml] = decode(predsIdx, predsProb);
        ml--;
    }

    function decode(textIndex: number[], textProb: number[]) {
        const charList: string[] = [];
        const confList: number[] = [];
        const isRemoveDuplicate = true;
        for (let idx = 0; idx < textIndex.length; idx++) {
            if (textIndex[idx] === 0) continue;
            if (isRemoveDuplicate) {
                if (idx > 0 && textIndex[idx - 1] === textIndex[idx]) {
                    continue;
                }
            }
            charList.push(getChar(textIndex[idx]));
            confList.push(textProb[idx]);
        }
        let text = "";
        let mean = 0;
        if (charList.length) {
            text = charList.join("").trim();
            let sum = 0;
            for (const item of confList) {
                sum += item;
            }
            mean = sum / confList.length;
        }
        return { text, mean };
    }

    return line;
}

/** Layout analysis */
function afAfRec(
    l: resultType,
    op?: { docDirs?: ReadingDir[]; columnsTip?: ColumnsTip },
): {
    columns: {
        src: resultType;
        outerBox: BoxType;
        parragraphs: {
            src: resultType;
            parse: resultType[0];
        }[];
    }[];
    parragraphs: resultType;
    readingDir: ReadingDir;
    angle: { reading: { inline: number; block: number }; angle: number };
} {
    log(l);

    type columnType = "none" | ColumnsTip[0]["type"];

    // Assume one reading direction for the whole document.
    const dirs: ReadingDir[] = op?.docDirs ??
[ { block: "tb", inline: "lr" }, { block: "rl", inline: "tb" }, ]; const dir: ReadingDir = { block: "tb", inline: "lr" }; const dirVector = { inline: [1, 0] as VectorType, block: [0, 1] as VectorType, }; const baseVector = { inline: [1, 0] as VectorType, block: [0, 1] as VectorType, }; if (l.length === 0) { return { columns: [], parragraphs: [], readingDir: dir, angle: { reading: { inline: 0, block: 90 }, angle: 0 }, }; } const colTip: { box: BoxType; type: columnType }[] = [ { box: [ [Number.NEGATIVE_INFINITY, Number.NEGATIVE_INFINITY], [Number.POSITIVE_INFINITY, Number.NEGATIVE_INFINITY], [Number.POSITIVE_INFINITY, Number.POSITIVE_INFINITY], [Number.NEGATIVE_INFINITY, Number.POSITIVE_INFINITY], ], type: "none", }, ]; const defaultColId = 0; function findColId(b: BoxType) { const c = Box.center(b); for (let id = colTip.length - 1; id >= 0; id--) { const item = colTip[id]; const box = item.box; if (c[0] >= box[0][0] && c[0] <= box[1][0] && c[1] >= box[0][1] && c[1] <= box[3][1]) { return id; } } return defaultColId; } const Point = { center: (p1: pointType, p2: pointType): pointType => [(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2], disByV: (p1: pointType, p2: pointType, type: "block" | "inline") => { if (type === "block") { return Math.abs(Vector.dotMup(p1, baseVector.block) - Vector.dotMup(p2, baseVector.block)); } return Math.abs(Vector.dotMup(p1, baseVector.inline) - Vector.dotMup(p2, baseVector.inline)); }, compare: (a: pointType, b: pointType, type: "block" | "inline") => { if (type === "block") { return Vector.dotMup(a, baseVector.block) - Vector.dotMup(b, baseVector.block); } return Vector.dotMup(a, baseVector.inline) - Vector.dotMup(b, baseVector.inline); }, toInline: (p: pointType) => { return Vector.dotMup(p, baseVector.inline); }, toBlock: (p: pointType) => { return Vector.dotMup(p, baseVector.block); }, }; const Box = { inlineStart: (b: BoxType) => Point.center(b[0], b[3]), inlineEnd: (b: BoxType) => Point.center(b[1], b[2]), blockStart: (b: BoxType) => 
Point.center(b[0], b[1]), blockEnd: (b: BoxType) => Point.center(b[2], b[3]), inlineSize: (b: BoxType) => b[1][0] - b[0][0], blockSize: (b: BoxType) => b[3][1] - b[0][1], inlineStartDis: (a: BoxType, b: BoxType) => Point.disByV(a[0], b[0], "inline"), inlineEndDis: (a: BoxType, b: BoxType) => Point.disByV(a[1], b[1], "inline"), blockGap: (newB: BoxType, oldB: BoxType) => Point.disByV(newB[0], oldB[3], "block"), inlineCenter: (b: BoxType) => (b[2][0] + b[0][0]) / 2, blockCenter: (b: BoxType) => (b[2][1] + b[0][1]) / 2, inlineStartCenter: (b: BoxType) => Box.inlineStart(b), center: (b: BoxType) => Point.center(b[0], b[2]), }; type VectorType = [number, number]; const Vector = { fromPonts: (p1: pointType, p2: pointType): pointType => [p1[0] - p2[0], p1[1] - p2[1]], dotMup: (a: VectorType, b: VectorType) => a[0] * b[0] + a[1] * b[1], numMup: (a: VectorType, b: number) => [a[0] * b, a[1] * b] as VectorType, add: (a: VectorType, b: VectorType) => [a[0] + b[0], a[1] + b[1]] as VectorType, }; function averLineAngles(a: number[]) { let iav = 0; let n = 0; const l: number[] = []; for (const [index, i] of a.entries()) { const a1 = i > 180 ? i - 180 : i; const a2 = a1 - 180; const a = index === 0 ? a1 : Math.abs(a2 - iav) < Math.abs(a1 - iav) ? a2 : a1; l.push(a); iav = (iav * n + a) / (n + 1); n++; } return { av: iav, l }; } function lineAngleNear(a1: number, a2: number) { if (Math.abs(a1 - a2) < 45) return true; if (Math.abs(a1 - (a2 - 180)) < 45) return true; if (Math.abs(a1 - 180 - a2) < 45) return true; return false; } function median(l: number[]) { l.sort((a, b) => a - b); const mid = Math.floor(l.length / 2); return l.length % 2 === 0 ? 
(l[mid - 1] + l[mid]) / 2 : l[mid]; } function dir2xy(d: ReadingDirPart) { if (d === "lr" || d === "rl") return "x"; return "y"; } function smallest<I>(l: I[], f: (a: I) => number) { let min = Number.POSITIVE_INFINITY; let minIndex = -1; for (let i = 0; i < l.length; i++) { const v = f(l[i]); if (v < min) { min = v; minIndex = i; } } return l[minIndex]; } const tipV: Record<ReadingDirPart, VectorType> = { lr: [1, 0], rl: [-1, 0], tb: [0, 1], bt: [0, -1], }; /** 坐标系变换 */ function transXY(old: ReadingDir, target: ReadingDir) { const oX = tipV[old.inline]; const oY = tipV[old.block]; const tX = tipV[target.inline]; const tY = tipV[target.block]; const tInOX = [Vector.dotMup(tX, oX), Vector.dotMup(tX, oY)] as VectorType; const tInOY = [Vector.dotMup(tY, oX), Vector.dotMup(tY, oY)] as VectorType; return (p: pointType) => { return [Vector.dotMup(p, tInOX), Vector.dotMup(p, tInOY)] as pointType; }; } function transBox(old: ReadingDir, target: ReadingDir) { const t = transXY(old, target); return { b: (b: BoxType) => { for (const p of b) { const [a, b] = t(p); p[0] = a; p[1] = b; } }, p: t, }; } function reOrderBox(map: number[]) { return (b: BoxType) => { const newB: BoxType = [ [0, 0], [0, 0], [0, 0], [0, 0], ]; for (let i = 0; i < map.length; i++) { newB[i] = b[map[i]]; } return newB; }; } function r(point: pointType, point2: pointType) { return Math.sqrt((point[0] - point2[0]) ** 2 + (point[1] - point2[1]) ** 2); } function outerRect(boxes: BoxType[]) { const points = boxes.flatMap((i) => i.map((i) => i)); const x1 = Math.min(...points.map((p) => Vector.dotMup(p, baseVector.inline))); const x2 = Math.max(...points.map((p) => Vector.dotMup(p, baseVector.inline))); const y1 = Math.min(...points.map((p) => Vector.dotMup(p, baseVector.block))); const y2 = Math.max(...points.map((p) => Vector.dotMup(p, baseVector.block))); const o = Vector.add(Vector.numMup(baseVector.inline, x1), Vector.numMup(baseVector.block, y1)); const w = Vector.numMup(baseVector.inline, x2 - x1); 
const h = Vector.numMup(baseVector.block, y2 - y1);
    // Corners in order: origin, origin + w, origin + w + h, origin + h.
    return [o, Vector.add(o, w), Vector.add(Vector.add(o, w), h), Vector.add(o, h)] as BoxType;
}

/**
 * Appends `b` to the fine-grained column whose last entry is nearest to it, or opens a
 * new column when no column qualifies.
 *
 * "Nearest" is measured with `r` between corner 0 of `b` and corner 0 of each column's
 * last entry. `b` joins that column only if
 *  - at least one of: inline-start distance < 3 em, inline-end distance < 3 em,
 *    or inline distance between the two box centers < 0.4 × the narrower inline size; and
 *  - the block gap to the column's last entry is < 1.1 em,
 * where em is the block size of `b`.
 */
function pushColumn(b: resultType[0]) {
    let nearestCol: { src: resultType } | undefined;
    let last: resultType[0] | undefined;
    let nearestDis = Number.POSITIVE_INFINITY;
    for (const col of columns) {
        const tail = col.src.at(-1);
        if (!tail) continue; // an empty column can never be the nearest one
        const dis = r(b.box[0], tail.box[0]);
        if (dis < nearestDis) {
            nearestCol = col;
            last = tail;
            nearestDis = dis;
        }
    }
    if (!nearestCol || !last) {
        columns.push({ src: [b] });
        return;
    }

    const thisW = Box.inlineSize(b.box);
    const lastW = Box.inlineSize(last.box);
    const minW = Math.min(thisW, lastW);
    const em = Box.blockSize(b.box);

    // At least one of the two inline edges is close, or the centers are close ...
    const inlineAligned =
        Box.inlineStartDis(b.box, last.box) < 3 * em ||
        Box.inlineEndDis(b.box, last.box) < 3 * em ||
        Point.disByV(Box.center(b.box), Box.center(last.box), "inline") < minW * 0.4;
    // ... and the lines must not be too far apart either.
    const sameColumn = inlineAligned && Box.blockGap(b.box, last.box) < em * 1.1;

    if (!sameColumn) {
        columns.push({ src: [b] });
        return;
    }
    nearestCol.src.push(b);
}

/**
 * Merges several results into one.
 *
 * The merged box is `outerRect` of all boxes, `mean` is `average2` over `[mean, text length]`
 * pairs (`average2` is defined elsewhere; presumably a weighted average), and the style is
 * taken from the first entry, so `p` must not be empty.
 *
 * Texts are concatenated in order. A single space is inserted before a piece unless the
 * accumulated text is still empty, or both the last accumulated character and the first
 * character of the piece match the ideograph pattern or the CJK punctuation pattern.
 */
function joinResult(p: resultType) {
    const cjkv = /\p{Ideographic}/u;
    const cjkf = /[。,!?;:“”‘’《》、【】()…—]/;
    const res: resultType[0] = {
        box: outerRect(p.map((i) => i.box)),
        text: "",
        mean: average2(p.map((i) => [i.mean, i.text.length])),
        style: p[0].style,
    };
    for (const i of p) {
        const lastChar = res.text.at(-1);
        if (
            lastChar &&
            ((!lastChar.match(cjkv) && !lastChar.match(cjkf)) ||
                (!i.text.at(0)?.match(cjkv) && !i.text.at(0)?.match(cjkf)))
        )
            res.text += " ";
        res.text += i.text;
    }
    return res satisfies resultType[0];
}

function sortCol(cs: { src: resultType; outerBox: BoxType }[]) {
    // Re-sort in place:
    // order by block position first; entries whose block positions are close are ordered by inline position.
    cs.sort((a, b) => {
        const em = a.src.at(0) ?
Box.blockSize(a.src.at(0)!.box) : 2;
        // `em` is the block size of `a`'s first entry, or 2 when `a` has no entries.
        if (Point.disByV(Box.blockStart(a.outerBox), Box.blockStart(b.outerBox), "block") < em) {
            return Point.compare(Box.inlineStart(a.outerBox), Box.inlineStart(b.outerBox), "inline");
        }
        return Point.compare(Box.blockStart(a.outerBox), Box.blockStart(b.outerBox), "block");
    });
}

// Work on deep copies of the caller's column hints.
if (op?.columnsTip) {
    for (const i of op.columnsTip) colTip.push(structuredClone(i));
}

// Determine the text angle: vertical or horizontal layout.

/** Measured against the positive x axis, in graphics coordinates. */
const rAngle = {
    inline: 0,
    block: 90,
};

// One angle per box: the direction of the box's longer axis. When the box is taller than wide
// the vector runs from the midpoint of corners 0/1 to the midpoint of corners 2/3, otherwise
// from the midpoint of corners 0/3 to the midpoint of corners 1/2.
const inlineAngles = l.map((i) => {
    const b = i.box;
    const w = b[1][0] - b[0][0];
    const h = b[3][1] - b[0][1];
    const p =
        w < h
            ? Vector.fromPonts(Point.center(b[2], b[3]), Point.center(b[0], b[1]))
            : Vector.fromPonts(Point.center(b[1], b[2]), Point.center(b[0], b[3]));
    return normalAngle(Math.atan2(p[1], p[0]) * (180 / Math.PI));
});

const firstAngleAnalysis = averLineAngles(inlineAngles);
// Drop the angles that are orthogonal to the first estimate.
const filterAngles = inlineAngles.filter((i) => lineAngleNear(i, firstAngleAnalysis.av));

// Outlier rejection with the median absolute deviation (MAD), scaled by 1.4826.
// Note: `median` sorts its argument, so `filterAngles` is in ascending order from here on.
const md = median(filterAngles);
const MAD = median(filterAngles.map((i) => Math.abs(i - md)));
const madScale = MAD * 1.4826;
// When at least half of the angles equal the median, MAD is 0 and the score of those angles
// would be 0 / 0 = NaN, which fails `< 2` and used to discard every angle (the estimate then
// collapsed to 0). Angles equal to the median are never outliers, so keep them explicitly.
const filterAngles1 = filterAngles.filter((i) => i === md || Math.abs((i - md) / madScale) < 2);

const inlineangle = normalAngle(averLineAngles(filterAngles1).av);
log("dir0", inlineAngles, firstAngleAnalysis, filterAngles, filterAngles1, inlineangle);
const blockangle = normalAngle(inlineangle + 90);

// Classify both angles by axis, then pick the first candidate reading direction whose
// inline and block axes match; the fallback operand follows on the next line.
const inlineDir = lineAngleNear(inlineangle, 0) ? "x" : "y";
const blockDir = lineAngleNear(blockangle, 90) ? "y" : "x";
const fdir = dirs.find((d) => inlineDir === dir2xy(d.inline) && blockDir === dir2xy(d.block)) ??
dirs.at(0);
// Adopt the matched reading direction, if there is one.
if (fdir) {
    dir.block = fdir.block;
    dir.inline = fdir.inline;
}

// Nominal angle, in degrees, of each reading-direction part.
const tipAngle: Record<ReadingDirPart, number> = {
    lr: 0,
    rl: 180,
    tb: 90,
    bt: 270,
};
// Among the measured angle and its -360 / -180 / +180 variants, keep the one numerically
// closest to the nominal angle of the chosen direction.
rAngle.inline = smallest([inlineangle, inlineangle - 360, inlineangle - 180, inlineangle + 180], (a) =>
    Math.abs(a - tipAngle[dir.inline]),
);
rAngle.block = smallest([blockangle, blockangle - 360, blockangle - 180, blockangle + 180], (a) =>
    Math.abs(a - tipAngle[dir.block]),
);
// Unit vectors for both angles (degrees converted to radians).
dirVector.inline = [Math.cos(rAngle.inline * (Math.PI / 180)), Math.sin(rAngle.inline * (Math.PI / 180))];
dirVector.block = [Math.cos(rAngle.block * (Math.PI / 180)), Math.sin(rAngle.block * (Math.PI / 180))];

log("dir", dir, rAngle, dirVector, inlineangle, blockangle);

// Reorder the corner points inside each box according to the reading direction.
// Each pair is [one letter of dir.inline, one letter of dir.block]: letter 0 is the start
// side and letter 1 the end side of that direction (e.g. "l" and "r" for "lr").
const reOrderMapX = [
    [dir.inline[0], dir.block[0]],
    [dir.inline[1], dir.block[0]],
    [dir.inline[1], dir.block[1]],
    [dir.inline[0], dir.block[1]],
];
// Turn each pair into a corner name with the horizontal letter first ("lt", "rt", "rb", "lb")
// and look up that corner's index.
const reOrderMap = reOrderMapX.map(
    ([i, b]) =>
        ({
            lt: 0,
            rt: 1,
            rb: 2,
            lb: 3,
        })[i === "l" || i === "r" ? i + b : b + i],
) as number[];

// Transform from the lr/tb coordinate system into the detected one.
const xyT = transBox({ inline: "lr", block: "tb" }, dir);
const reOrderBoxT = reOrderBox(reOrderMap);
// Copy of every item with a reordered, transformed box. `xyT.b` rewrites the corner arrays
// in place; NOTE(review): if `reOrderBoxT` hands back the same corner arrays it was given,
// the corners of the source boxes change too — confirm that is intended.
const logicL = l.map((i) => {
    const newBox = reOrderBoxT(i.box);
    xyT.b(newBox);
    return {
        ...i,
        box: newBox,
    };
});

// Apply the same reordering and transform to the column hints.
for (const i of colTip) {
    i.box = reOrderBoxT(i.box);
    xyT.b(i.box);
}

// Ignore the overall rotation; only the skew is considered.
baseVector.inline = xyT.p(dirVector.inline);
baseVector.block = xyT.p(dirVector.block);

log("相对坐标系", baseVector);

// Find out which boxes sit on the same line.
// `sort` works in place, so `newL_` and `logicL` are the same array, ordered by block start.
const newL_ = logicL.sort((a, b) => Point.compare(Box.blockStart(a.box), Box.blockStart(b.box), "block"));
const newLZ: { line: { src: resultType[0]; colId: number }[] }[] = [];
for (const j of newL_) {
    const colId = findColId(j.box);
    // Most recently added entry of the most recently opened line, if any.
    const last = newLZ.at(-1)?.line.at(-1);
    if (!last) {
        newLZ.push({ line: [{ src: j, colId }] });
        continue;
    }
    const thisC = Box.center(j.box);
    const lastC = Box.center(last.src.box);
    // Same line when the centers are less than half of this box's block size apart along the block axis.
    if (Point.disByV(thisC, lastC, "block") < 0.5 * Box.blockSize(j.box)) {
        const lLast = newLZ.at(-1);
        if (!lLast) {
            newLZ.push({
line: [{ src: j, colId }] });
        } else {
            // Append to the current line.
            lLast.line.push({ src: j, colId });
        }
    } else {
        // Too far along the block axis: start a new line.
        newLZ.push({ line: [{ src: j, colId }] });
    }
}

// Merge or keep split, depending on distance.
// Some boxes are close: they belong to the same line. Some are far apart but on the same level:
// they belong to other columns.
const newL: { src: resultType[0]; colId: number }[] = [];
for (const l of newLZ) {
    if (l.line.length === 1) {
        newL.push({ src: l.line[0].src, colId: l.line[0].colId });
        continue;
    }

    // Mean block size of the entries on this line.
    const em = average(l.line.map((i) => Box.blockSize(i.src.box)));
    // Order the entries of the line along the inline axis.
    l.line.sort((a, b) => Point.compare(Box.inlineStart(a.src.box), Box.inlineStart(b.src.box), "inline"));

    let last = l.line.at(0)!;
    for (const this_ of l.line.slice(1)) {
        const lastBoxInlineEnd = Box.inlineEnd(last.src.box);
        const thisInlineStart = Box.inlineStart(this_.src.box);
        // Keep the entries separate when this one lies in a "table" hint, when the two belong to
        // different column hints, or when the inline gap between them exceeds one em.
        if (
            colTip[this_.colId].type === "table" ||
            this_.colId !== last.colId ||
            Point.toInline(thisInlineStart) - Point.toInline(lastBoxInlineEnd) > em
        ) {
            newL.push({ ...last });
            last = this_;
        } else {
            // Otherwise fold this entry into `last`: concatenate the text (no separator),
            // take the plain mean of the two `mean` values, and grow the box to cover both.
            last.src.text += this_.src.text;
            last.src.mean = (last.src.mean + this_.src.mean) / 2;
            last.src.box = outerRect([last.src.box, this_.src.box]);
        }
    }
    newL.push({ ...last });
}

// todo: use separator lines as boundaries

// Split into columns.

// First split at a very fine granularity.
const columns: { src: resultType }[] = [];

// Entries of the default column hint go through `pushColumn`; entries of every other hint
// are grouped by `colId`.
const defaultNewL: typeof newL = [];
const noDefaultColumns: { src: resultType; type: columnType; colId: number }[] = [];
for (const l of newL) {
    if (l.colId === defaultColId) {
        defaultNewL.push(l);
    } else {
        const col = noDefaultColumns.find((i) => i.colId === l.colId);
        if (col) {
            col.src.push(l.src);
        } else {
            noDefaultColumns.push({ src: [l.src], type: colTip[l.colId].type, colId: l.colId });
        }
    }
}

// Feed the default entries to `pushColumn` in block order.
defaultNewL.sort((a, b) => Point.compare(Box.blockStart(a.src.box), Box.blockStart(b.src.box), "block"));

for (const b of defaultNewL) {
    pushColumn(b.src);
}

// Merge columns: combine the fine-grained columns built above.
const columnsInYaxis: {
    smallCol: { src: resultType; outerBox: BoxType; x: number; w: number }[];
}[] = [];
for (const [i, col] of columns.entries()) {
    const c = col.src;
    const outer = outerRect(c.map((b) => b.box));
    // NOTE(review): `x` is filled from `Box.blockCenter`, not `Box.inlineCenter` — confirm this is intended.
    const x = Box.blockCenter(outer);
    const w =
Box.inlineSize(outer);

    // The first fine-grained column always opens the first stack.
    if (i === 0) {
        columnsInYaxis.push({ smallCol: [{ src: c, outerBox: outer, x, w }] });
        continue;
    }

    // Look for the first stack whose most recently added column lines up with this one.
    const l = columnsInYaxis.find((oc) => {
        const r = oc.smallCol.at(-1)!;
        const em = Box.blockSize(c.at(0)!.box);
        // This check is still strict, which is why the heading merge, tail merge and
        // interleaved merge further below are needed.
        // Both inline edges must be within 3 em and the block gap below 2.1 em.
        if (
            Box.inlineStartDis(r.outerBox, outer) < 3 * em &&
            Box.inlineEndDis(r.outerBox, outer) < 3 * em &&
            Box.blockGap(outer, r.outerBox) < em * 2.1
        )
            return true;
        return false;
    });

    if (l) {
        l.smallCol.push({ src: c, outerBox: outer, x, w });
    } else {
        columnsInYaxis.push({ smallCol: [{ src: c, outerBox: outer, x, w }] });
    }
}

// Inside every stack, order the small columns by block start.
for (const y of columnsInYaxis) {
    y.smallCol.sort((a, b) => Point.compare(Box.blockStart(a.outerBox), Box.blockStart(b.outerBox), "block"));
}

// Order the entries of each non-default column by block start.
for (const c of noDefaultColumns) {
    c.src.sort((a, b) => Point.compare(Box.blockStart(a.box), Box.blockStart(b.box), "block"));
}

// A new representation of columnsInYaxis; the structure is unchanged:
// one entry per stack, with the entries flattened and one bounding box around all of them.
const newColumns: { src: resultType; outerBox: BoxType; type: columnType }[] = [];

for (const c of columnsInYaxis) {
    const o = outerRect(c.smallCol.map((i) => i.outerBox));
    const s = c.smallCol.flatMap((i) => i.src);
    newColumns.push({ src: s, outerBox: o, type: "none" });
}

sortCol(newColumns);

// Lines of similar width have all been merged by now, but two situations are left unmerged.
// Taking lines of 20 characters as an example: (1) 20,20,2,20,20 (2) 20,20,10,10,10,10,20
// They are, respectively, the end of a paragraph and a split into columns.
// Merge case: the short lines in the middle are many.
const mergedColumns: typeof newColumns = [];
for (const c of newColumns) {
    const last = mergedColumns.at(-1);
    if (!last) {
        mergedColumns.push(c);
        continue;
    }
    // A previous column whose type is not "none" is never merged into.
    if (last.type !== "none") {
        mergedColumns.push(c);
        continue;
    }
    const lastOuter = last.outerBox;
    const em = Box.blockSize(c.src[0].box);
    if (
        (last.src.length === 1 && Box.inlineStartDis(lastOuter, c.out