UNPKG

wn-ts-node

Version:
635 lines (634 loc) 22.3 kB
import v from "fs"; import b from "fs/promises"; import { join as G } from "path"; import I from "sax"; import "./logger-ClUC0kzz.js"; class w extends Error { constructor(o, s, n) { super(o), this.code = s, this.context = n, this.name = "LMFParseError"; } } class R { constructor(o) { this.config = o; } statistics = { wordsDeduplicated: 0, synsetsDeduplicated: 0, sensesDeduplicated: 0, totalDuplicates: 0 }; /** * Handle duplicates according to the configured strategy */ handleDuplicates(o, s) { if (this.config.strategy === "skip") return o; const n = this.config.uniqueKeys?.[s]; if (!n || n.length === 0) return o; const t = /* @__PURE__ */ new Map(), f = []; for (const i of o) { const a = this.generateUniqueKey(i, n, s); if (t.has(a)) { f.push(i), this.statistics.totalDuplicates++, this.config.logDuplicates && console.debug(`Duplicate ${s} found:`, { key: a, itemId: i.id }); const h = t.get(a); switch (this.config.strategy) { case "keep-first": break; case "keep-last": t.set(a, i); break; case "merge": const y = this.mergeItems(h, i, s); t.set(a, y); break; case "error": throw new w( `Duplicate ${s} found with key: ${a}`, "DUPLICATE_FOUND", { key: a, itemId: i.id, type: s } ); } } else t.set(a, i); } switch (s) { case "words": this.statistics.wordsDeduplicated = f.length; break; case "synsets": this.statistics.synsetsDeduplicated = f.length; break; case "senses": this.statistics.sensesDeduplicated = f.length; break; } return Array.from(t.values()); } /** * Generate a unique key for an item based on the specified unique key fields */ generateUniqueKey(o, s, n) { const t = []; for (const f of s) switch (f) { case "id": t.push(o.id || ""); break; case "lemma": n === "words" && t.push(o.lemma || ""); break; case "index": n === "words" && t.push(o.index || ""); break; case "pos": n === "words" && t.push(o.pos || ""); break; case "ili": n === "synsets" && t.push(o.ili || ""); break; case "wordId-synsetId": if (n === "senses") { const i = o; t.push(i.wordId || ""), t.push(i.synsetId || ""); } break; } return t.filter(Boolean).join("::"); } /** * Merge two items according to the merge strategy */ mergeItems(o, s, n) { if (this.config.strategy !== "merge") return o; const t = { ...o }, f = this.config.mergeFields; if (n === "words" && f?.forms) { const i = o, a = s; "forms" in i && "forms" in a && (t.forms = [...i.forms, ...a.forms]); } if (n === "synsets") { const i = o, a = s; f?.definitions && "definitions" in i && "definitions" in a && (t.definitions = [...i.definitions, ...a.definitions]), f?.examples && "examples" in i && "examples" in a && (t.examples = [...i.examples, ...a.examples]), f?.relations && "relations" in i && "relations" in a && (t.relations = [...i.relations, ...a.relations]); } if (n === "senses") { const i = o, a = s; f?.examples && "examples" in i && "examples" in a && (t.examples = [...i.examples, ...a.examples]), f?.tags && "tags" in i && "tags" in a && (t.tags = [...i.tags, ...a.tags]), f?.counts && "counts" in i && "counts" in a && (t.counts = [...i.counts, ...a.counts]); } return t; } /** * Get duplicate handling statistics */ getStatistics() { return { ...this.statistics }; } /** * Reset statistics */ resetStatistics() { this.statistics = { wordsDeduplicated: 0, synsetsDeduplicated: 0, sensesDeduplicated: 0, totalDuplicates: 0 }; } } function P(r, o = !1) { if (typeof r != "string") throw new w( "XML content is not a valid string", "INVALID_CONTENT_TYPE", { contentType: typeof r } ); if (r.trim().length === 0) throw new w( "XML content is empty", "EMPTY_CONTENT", { contentLength: r.length } ); const s = r.trim(); if (s.toLowerCase().includes("<!doctype html>") || s.toLowerCase().includes("<html") || s.toLowerCase().includes("error") && s.toLowerCase().includes("not found") && (s.toLowerCase().includes("http error") || s.toLowerCase().includes("error 404") || s.toLowerCase().includes("error 500") || s.toLowerCase().includes("error 403"))) throw new w( "Content appears to be HTML error page, not XML", "HTML_ERROR_PAGE", { hasDoctype: s.toLowerCase().includes("<!doctype html>"), hasHtml: s.toLowerCase().includes("<html"), hasError: s.toLowerCase().includes("error") } ); if (s.toLowerCase().includes("http") && !s.toLowerCase().includes("<!doctype") && (s.toLowerCase().includes("404") || s.toLowerCase().includes("500") || s.toLowerCase().includes("403"))) throw new w( "Server returned HTTP error page", "HTTP_ERROR_RESPONSE", { has404: s.toLowerCase().includes("404"), has500: s.toLowerCase().includes("500"), has403: s.toLowerCase().includes("403") } ); if (!s.startsWith("<?xml") && !s.startsWith("<")) throw new w( "Content does not appear to be XML", "NOT_XML", { startsWithXml: s.startsWith("<?xml"), startsWithTag: s.startsWith("<"), firstChars: s.substring(0, 50) } ); if (!s.includes("<LexicalResource")) throw new w( "missing LexicalResource element", "MISSING_LEXICAL_RESOURCE", { hasLexicalResource: s.includes("<LexicalResource"), firstChars: s.substring(0, 200) } ); const n = (s.match(/</g) || []).length, t = (s.match(/>/g) || []).length; if (n !== t) throw new w( "Malformed XML - mismatched tags", "MALFORMED_XML", { openTags: n, closeTags: t, difference: Math.abs(n - t) } ); o && (console.log("[DEBUG] Enhanced XML content validation passed"), console.log(`[DEBUG] Content length: ${r.length}`), console.log("[DEBUG] First 200 characters:", s.substring(0, 200))); } const N = { strategy: "keep-first", mergeFields: { definitions: !0, examples: !0, relations: !0, forms: !0, pronunciations: !0, tags: !0, counts: !0 }, uniqueKeys: { words: ["id"], synsets: ["id"], senses: ["id"] }, logDuplicates: !1, trackStatistics: !0 }; function A(r, o = N) { const s = new R(o); return { ...r, words: s.handleDuplicates(r.words, "words"), synsets: s.handleDuplicates(r.synsets, "synsets"), senses: s.handleDuplicates(r.senses, "senses") }; } function M(r) { if (r.startsWith("http://") || r.startsWith("https://") || r.startsWith("ftp://")) return !0; if (r.includes("://")) try { return new URL(r), !0; } catch { return !1; } return !1; } const { readFile: C, stat: _ } = b, { createReadStream: S } = v, x = /* @__PURE__ */ new Set(["1.0", "1.1", "1.2", "1.3", "1.4"]), W = /<!DOCTYPE LexicalResource SYSTEM "([^"]+)">/, O = { "1.0": "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd", "1.1": "http://globalwordnet.github.io/schemas/WN-LMF-1.1.dtd", "1.2": "http://globalwordnet.github.io/schemas/WN-LMF-1.2.dtd", "1.3": "http://globalwordnet.github.io/schemas/WN-LMF-1.3.dtd", "1.4": "http://globalwordnet.github.io/schemas/WN-LMF-1.4.dtd" }; async function j(r) { return new Promise((o) => { const s = S(r, { encoding: "utf-8", start: 0, end: 1024 }); let n = ""; s.on("data", (t) => { n += t; }), s.on("end", () => { o( n.includes('<?xml version="1.0"') && n.includes("<!DOCTYPE LexicalResource") && n.includes("<LexicalResource") ); }), s.on("error", () => { o(!1); }); }); } async function H(r, o = !1) { o && console.log("[DEBUG] Quick scanning file for version and element count..."); const s = await C(r, "utf-8"); let n = "1.0"; const t = s.match(W); if (t?.[1]) { const i = t[1]; o && console.log(`[DEBUG] Found DOCTYPE with schema: ${i}`); let a = !1; for (const [h, y] of Object.entries(O)) if (y === i) { n = h, a = !0, o && console.log(`[DEBUG] Matched schema URL to supported version: ${n}`); break; } if (!a) { const h = i.match(/WN-LMF-([0-9]+\.[0-9]+)\.dtd$/); h && h[1] && (n = h[1], o && console.log(`[DEBUG] Extracted unsupported version from schema URL: ${n}`)); } } else o && console.log(`[DEBUG] No DOCTYPE pattern found, using default version: ${n}`); const f = (s.match(/<\/[^>]+>/g) || []).length + (s.match(/\/>/g) || []).length; return o && console.log(`[DEBUG] Quick scan: version=${n}, estimated elements=${f}`), { version: n, elementCount: f }; } async function X(r, o = {}) { const { debug: s = !1 } = o; s && console.log(`[DEBUG] Loading LMF from URL: ${r}`); try { const n = await fetch(r); if (!n.ok) throw new Error(`HTTP ${n.status}: ${n.statusText}`); const t = await n.text(); return s && console.log(`[DEBUG] Loaded ${t.length} characters from URL`), t; } catch (n) { throw s && console.log("[DEBUG] URL loading failed:", n), new Error(`Failed to load LMF from URL: ${n instanceof Error ? n.message : "Unknown error"}`); } } async function J(r, o = {}) { const { debug: s = !1 } = o; s && console.log(`[DEBUG] loadLMF() starting for: ${r}`); try { let n, t; const f = M(r); if (s && console.log(`[DEBUG] isURL(${r}) = ${f}`), f) { s && console.log("[DEBUG] Input is URL, loading from network"), n = await X(r, o); const p = await b.mkdtemp(G(require("os").tmpdir(), "wn-ts-lmf-")); t = G(p, "temp.lmf"), await b.writeFile(t, n, "utf-8"), s && console.log(`[DEBUG] Created temporary file: ${t}`); } else if (t = r, s) { const p = await _(t), U = (p.size / (1024 * 1024)).toFixed(2); console.log(`[DEBUG] File size: ${U} MB (${p.size.toLocaleString()} bytes)`); } const { version: i, elementCount: a } = await H(t, s); if (s && console.log(`[DEBUG] Quick scan returned version: ${i}`), s && console.log(`[DEBUG] Supported versions: ${Array.from(x).join(", ")}`), s && console.log(`[DEBUG] Version ${i} supported: ${x.has(i)}`), !x.has(i)) throw s && console.log(`[DEBUG] Throwing error for unsupported version: ${i}`), new w(`Unsupported LMF version: ${i}`, "UNSUPPORTED_VERSION", { version: i }); try { n || (n = await C(t, "utf-8")), P(n, s); } catch (p) { throw p instanceof w ? new Error(`Failed to load LMF file: ${p.message}`) : new Error(`Failed to load LMF file: ${p instanceof Error ? p.message : "Unknown error"}`); } s && console.log(`[DEBUG] Using streaming parser for version ${i}...`); const h = Date.now(), y = await q(t, i, a, o), $ = Date.now() - h; if (s && console.log(`[DEBUG] loadLMF() completed in ${$}ms total`), M(r) && t !== r) try { await b.unlink(t), await b.rmdir(require("path").dirname(t)), s && console.log(`[DEBUG] Cleaned up temporary file: ${t}`); } catch (p) { s && console.log(`[DEBUG] Failed to clean up temporary file: ${p}`); } return y; } catch (n) { throw s && console.log("[DEBUG] loadLMF() error:", n), new Error(`Failed to load LMF file: ${n}`); } } async function q(r, o, s, n = {}) { const { debug: t = !1, progress: f } = n; return new Promise((i, a) => { const h = [], y = [], $ = [], p = [], U = /* @__PURE__ */ new Map(); let g = null, l = null, c = null, u = null, L = null, D = 0, F = 0; const k = I.createStream(!0, { trim: !0, normalize: !0, lowercase: !0, position: !1, xmlns: !1 }), T = () => { if (f) { const d = s < 100 ? 2 : 1e3; if (D - F >= d) { const e = Math.min(D / s, 0.95); t && console.log(`[DEBUG] Progress update: ${D}/${s} = ${e}`), f(e), F = D; } } }; k.on("opentag", (d) => { D++, T(); const { attributes: e } = d, m = d.name.toLowerCase(); switch (t && (D % 1e4 === 0 || ["lexicalresource", "lexicon", "lexicalentry"].includes(m)) && console.log(`[DEBUG] Processing tag: ${m} (element #${D})`), t && ["synset", "lexicalentry"].includes(m) && D % 5e3 === 0 && console.log(`[DEBUG] Processing ${m} #${D} - current progress: ${Math.round(D / s * 100)}%`), m) { case "lexicalresource": t && console.log(`[DEBUG] Starting to parse LexicalResource (version: ${o})`); break; case "lexicon": g = { id: e.id || "", label: e.label || "", language: e.language || "en", version: e.version || "1.0", email: e.email || "", license: e.license || "", url: e.url, citation: e.citation, logo: e.logo, entries: [], synsets: [], frames: [] }, t && console.log(`[DEBUG] Processing lexicon: ${g.id}`); break; case "lexiconextension": g = { id: e.id || "", label: e.label || "", language: e.language || "en", version: e.version || "1.0", email: e.email || "", license: e.license || "", url: e.url, citation: e.citation, logo: e.logo, entries: [], synsets: [], frames: [] }, t && console.log(`[DEBUG] Processing lexicon extension: ${g.id}`); break; case "lexicalentry": l = { id: e.id || "unknown-word", lemma: "unknown", partOfSpeech: "n", language: g?.language || "en", lexicon: g?.id || "unknown", forms: [], tags: [], pronunciations: [], counts: [], senses: [], frames: [] }, t && console.log(`[DEBUG] Created word: ${l.id}`); break; case "lemma": l && (l.lemma = e.writtenform || e.writtenForm || "unknown", l.pos = e.partofspeech || e.partOfSpeech || "n", t && console.log(`[DEBUG] Set lemma for word ${l.id}: ${l.lemma} (${l.pos})`)); break; case "form": l && l.forms.push({ id: e.id || "", writtenForm: e.writtenform || e.writtenForm || "", script: e.script || "", tag: e.tag || "" }); break; case "sense": u = { id: e.id || "unknown-sense", wordId: l?.id || "unknown", synsetId: e.synset || "unknown-synset", counts: [], examples: [], tags: [] }; break; case "synset": c = { id: e.id || "unknown-synset", pos: e.pos || e.partofspeech || e.partOfSpeech || "n", definitions: [], examples: [], relations: [], language: g?.language || "en", lexicon: g?.id || "unknown", memberIds: [], senseIds: [] }, e.ili && (c.ili = e.ili); break; case "definition": if (c) { const E = e.id || `${c.id}-def-${c.definitions.length + 1}`; c.definitions.push({ id: E, language: e.language || g?.language || "en", text: "", source: e.source || "" }); } break; case "synsetrelation": c && c.relations.push({ id: e.id || "", type: e.reltype || e.relType || e.type || "unknown", target: e.target || "", source: e.source || "" }); break; case "tag": l ? l.tags.push({ id: e.id || "", category: e.category || "", value: "" }) : u && u.tags.push({ id: e.id || "", category: e.category || "", value: "" }); break; case "count": u && u.counts.push({ id: e.id || "", value: 0, writtenForm: "", pos: "n" }); break; case "pronunciation": if (l?.forms.length) { const E = l.forms[l.forms.length - 1]; E && (E.pronunciations || (E.pronunciations = []), E.pronunciations.push({ id: e.id || "", variety: e.variety || "", text: "", source: e.source || "" })); } break; case "syntacticbehaviour": l && l.frames.push({ id: e.id || "", subcategorizationFrame: e.subcategorizationframe || e.subcategorizationFrame || "", source: e.source || "", senses: e.senses || "" }); break; case "senserelation": u && (u.relations || (u.relations = []), u.relations.push({ id: e.id || "", type: e.reltype || e.relType || e.type || "unknown", target: e.target || "", dcType: e.dctype || e.dc_type || "" })); break; case "ilidefinition": c && (c.iliDefinitions || (c.iliDefinitions = []), c.iliDefinitions.push({ id: e.id || "", text: "" })); break; case "example": L = { id: e.id || "", language: e.language || g?.language || "en", text: "", source: e.source || "" }; break; } }), k.on("text", (d) => { if (c?.definitions.length) { const e = c.definitions[c.definitions.length - 1]; e && e.text === "" && (e.text = d.trim()); } if (L && L.text === "" && (L.text = d.trim()), c && c.iliDefinitions?.length) { const e = c.iliDefinitions[c.iliDefinitions.length - 1]; e && e.text === "" && (e.text = d.trim()); } if (l?.tags.length) { const e = l.tags[l.tags.length - 1]; e && e.value === "" && (e.value = d.trim()); } if (u?.tags.length) { const e = u.tags[u.tags.length - 1]; e && e.value === "" && (e.value = d.trim()); } if (u?.counts.length) { const e = u.counts[u.counts.length - 1]; if (e && e.writtenForm === "") { e.writtenForm = d.trim(); const m = parseInt(d.trim()); isNaN(m) || (e.value = m); } } if (l?.forms.length) { const e = l.forms[l.forms.length - 1]; if (e?.pronunciations?.length) { const m = e.pronunciations[e.pronunciations.length - 1]; m && m.text === "" && (m.text = d.trim()); } } }), k.on("closetag", (d) => { switch (d.toLowerCase()) { case "lexicalentry": l && g && (g.entries.push(l), $.push(l), t && console.log(`[DEBUG] Added entry to lexicon: ${l.id}`), l = null); break; case "sense": u && l && (l.senses.push(u), p.push(u), U.has(u.synsetId) || U.set(u.synsetId, []), U.get(u.synsetId).push(u), t && console.log(`[DEBUG] Added sense to entry: ${u.id}`), u = null); break; case "synset": if (c && g) { const m = U.get(c.id) || []; c.senseIds = m.map((E) => E.id), c.memberIds = m.map((E) => E.wordId), g.synsets.push(c), y.push(c), t && console.log(`[DEBUG] Added synset to lexicon: ${c.id}`), c = null; } break; case "lexicon": g && (h.push(g), t && console.log(`[DEBUG] Added lexicon: ${g.id}`), g = null); break; case "lexiconextension": g && (h.push(g), t && console.log(`[DEBUG] Added lexicon extension: ${g.id}`), g = null); break; case "lexicalresource": break; case "example": L && (c ? c.examples.push(L) : u && u.examples.push(L), L = null); break; } }), k.on("end", () => { t && console.log("[DEBUG] Stream ended, completing parsing"), f && f(1); let d = { lmfVersion: o, lexicons: h.map((e) => ({ id: e.id, label: e.label, language: e.language, email: e.email, license: e.license, version: e.version, url: e.url, citation: e.citation, logo: e.logo, requires: e.requires, metadata: e.metadata })), synsets: y, words: $, senses: p }; if (n.duplicateHandling && n.duplicateHandling.strategy !== "skip") { t && (console.log(`[DEBUG] Applying duplicate handling with strategy: ${n.duplicateHandling.strategy}`), console.log(`[DEBUG] Before deduplication - words: ${$.length}, synsets: ${y.length}, senses: ${p.length}`)); try { d = A(d, n.duplicateHandling), t && console.log(`[DEBUG] After deduplication - words: ${d.words.length}, synsets: ${d.synsets.length}, senses: ${d.senses.length}`); } catch (e) { if (e instanceof w) { a(e); return; } a(new w( `Duplicate handling failed: ${e instanceof Error ? e.message : "Unknown error"}`, "DUPLICATE_HANDLING_FAILED", { originalError: e } )); return; } } i(d); }), k.on("error", (d) => { t && console.log("[DEBUG] Parser error:", d), a(new Error(`XML parsing error: ${d.message}`)); }); const B = S(r, { encoding: "utf8" }); t && console.log(`[DEBUG] Created read stream for ${r}`), B.pipe(k), B.on("error", (d) => { t && console.log("[DEBUG] Stream error:", d), a(new Error(`File stream error: ${d.message}`)); }); }); } export { j as isLMF, J as loadLMF };