/*
 * wn-ts-node: Wordnet interface library - TypeScript port.
 */
import v from "fs";
import b from "fs/promises";
import { join as G, dirname as V } from "path";
import { tmpdir as z } from "os";
import I from "sax";
import "./logger-ClUC0kzz.js";
class w extends Error {
constructor(o, s, n) {
super(o), this.code = s, this.context = n, this.name = "LMFParseError";
}
}
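/**
 * Duplicate handler (`R`): deduplicates words, synsets, and senses under a
 * configured strategy ("skip", "keep-first", "keep-last", "merge", or
 * "error"), keyed by the fields listed in `config.uniqueKeys`, and tracks
 * per-type statistics. A configuration sketch using only fields this class
 * reads:
 *
 *   const config = {
 *     strategy: "merge",
 *     uniqueKeys: { words: ["id"], synsets: ["id"], senses: ["id"] },
 *     mergeFields: { definitions: true, examples: true, relations: true },
 *     logDuplicates: false,
 *   };
 *   const handler = new R(config);
 *   const uniqueWords = handler.handleDuplicates(words, "words");
 */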
class R {
constructor(o) {
this.config = o;
}
statistics = {
wordsDeduplicated: 0,
synsetsDeduplicated: 0,
sensesDeduplicated: 0,
totalDuplicates: 0
};
/**
* Handle duplicates according to the configured strategy
*/
handleDuplicates(o, s) {
if (this.config.strategy === "skip")
return o;
const n = this.config.uniqueKeys?.[s];
if (!n || n.length === 0)
return o;
const t = /* @__PURE__ */ new Map(), f = [];
for (const i of o) {
const a = this.generateUniqueKey(i, n, s);
if (t.has(a)) {
f.push(i), this.statistics.totalDuplicates++, this.config.logDuplicates && console.debug(`Duplicate ${s} found:`, { key: a, itemId: i.id });
const h = t.get(a);
switch (this.config.strategy) {
case "keep-first":
break;
case "keep-last":
t.set(a, i);
break;
case "merge":
const y = this.mergeItems(h, i, s);
t.set(a, y);
break;
case "error":
throw new w(
`Duplicate ${s} found with key: ${a}`,
"DUPLICATE_FOUND",
{ key: a, itemId: i.id, type: s }
);
}
} else
t.set(a, i);
}
switch (s) {
case "words":
this.statistics.wordsDeduplicated = f.length;
break;
case "synsets":
this.statistics.synsetsDeduplicated = f.length;
break;
case "senses":
this.statistics.sensesDeduplicated = f.length;
break;
}
return Array.from(t.values());
}
/**
* Generate a unique key for an item based on the specified unique key fields
*/
generateUniqueKey(o, s, n) {
const t = [];
for (const f of s)
switch (f) {
case "id":
t.push(o.id || "");
break;
case "lemma":
n === "words" && t.push(o.lemma || "");
break;
case "index":
n === "words" && t.push(o.index || "");
break;
case "pos":
n === "words" && t.push(o.pos || "");
break;
case "ili":
n === "synsets" && t.push(o.ili || "");
break;
case "wordId-synsetId":
if (n === "senses") {
const i = o;
t.push(i.wordId || ""), t.push(i.synsetId || "");
}
break;
}
return t.filter(Boolean).join("::");
}
/**
* Merge two items according to the merge strategy
*/
mergeItems(o, s, n) {
if (this.config.strategy !== "merge")
return o;
const t = { ...o }, f = this.config.mergeFields;
if (n === "words" && f?.forms) {
const i = o, a = s;
"forms" in i && "forms" in a && (t.forms = [...i.forms, ...a.forms]);
}
if (n === "synsets") {
const i = o, a = s;
f?.definitions && "definitions" in i && "definitions" in a && (t.definitions = [...i.definitions, ...a.definitions]), f?.examples && "examples" in i && "examples" in a && (t.examples = [...i.examples, ...a.examples]), f?.relations && "relations" in i && "relations" in a && (t.relations = [...i.relations, ...a.relations]);
}
if (n === "senses") {
const i = o, a = s;
f?.examples && "examples" in i && "examples" in a && (t.examples = [...i.examples, ...a.examples]), f?.tags && "tags" in i && "tags" in a && (t.tags = [...i.tags, ...a.tags]), f?.counts && "counts" in i && "counts" in a && (t.counts = [...i.counts, ...a.counts]);
}
return t;
}
/**
* Get duplicate handling statistics
*/
getStatistics() {
return { ...this.statistics };
}
/**
* Reset statistics
*/
resetStatistics() {
this.statistics = {
wordsDeduplicated: 0,
synsetsDeduplicated: 0,
sensesDeduplicated: 0,
totalDuplicates: 0
};
}
}
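/**
 * Content validation (`P`): rejects non-string, empty, HTML-error-page,
 * non-XML, and LexicalResource-less input before parsing, then applies a
 * cheap well-formedness heuristic (equal counts of `<` and `>`). Failures
 * throw `LMFParseError` with a machine-readable `code`. A handling sketch:
 *
 *   try {
 *     P(xmlContent);
 *   } catch (err) {
 *     if (err.name === "LMFParseError") console.error(err.code, err.context);
 *     throw err;
 *   }
 */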
function P(r, o = !1) {
if (typeof r != "string")
throw new w(
"XML content is not a valid string",
"INVALID_CONTENT_TYPE",
{ contentType: typeof r }
);
if (r.trim().length === 0)
throw new w(
"XML content is empty",
"EMPTY_CONTENT",
{ contentLength: r.length }
);
const s = r.trim();
if (s.toLowerCase().includes("<!doctype html>") || s.toLowerCase().includes("<html") || s.toLowerCase().includes("error") && s.toLowerCase().includes("not found") && (s.toLowerCase().includes("http error") || s.toLowerCase().includes("error 404") || s.toLowerCase().includes("error 500") || s.toLowerCase().includes("error 403")))
throw new w(
"Content appears to be HTML error page, not XML",
"HTML_ERROR_PAGE",
{
hasDoctype: s.toLowerCase().includes("<!doctype html>"),
hasHtml: s.toLowerCase().includes("<html"),
hasError: s.toLowerCase().includes("error")
}
);
if (s.toLowerCase().includes("http") && !s.toLowerCase().includes("<!doctype") && (s.toLowerCase().includes("404") || s.toLowerCase().includes("500") || s.toLowerCase().includes("403")))
throw new w(
"Server returned HTTP error page",
"HTTP_ERROR_RESPONSE",
{
has404: s.toLowerCase().includes("404"),
has500: s.toLowerCase().includes("500"),
has403: s.toLowerCase().includes("403")
}
);
if (!s.startsWith("<?xml") && !s.startsWith("<"))
throw new w(
"Content does not appear to be XML",
"NOT_XML",
{
startsWithXml: s.startsWith("<?xml"),
startsWithTag: s.startsWith("<"),
firstChars: s.substring(0, 50)
}
);
if (!s.includes("<LexicalResource"))
throw new w(
"missing LexicalResource element",
"MISSING_LEXICAL_RESOURCE",
{
hasLexicalResource: s.includes("<LexicalResource"),
firstChars: s.substring(0, 200)
}
);
const n = (s.match(/</g) || []).length, t = (s.match(/>/g) || []).length;
if (n !== t)
throw new w(
"Malformed XML - mismatched tags",
"MALFORMED_XML",
{
openTags: n,
closeTags: t,
difference: Math.abs(n - t)
}
);
o && (console.log("[DEBUG] Enhanced XML content validation passed"), console.log(`[DEBUG] Content length: ${r.length}`), console.log("[DEBUG] First 200 characters:", s.substring(0, 200)));
}
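/**
 * Default duplicate-handling configuration (`N`): keep the first occurrence,
 * key every entity type by `id`, and merge every list field when the merge
 * strategy is selected. A caller can override a single field with a shallow
 * spread (a sketch; `N` is internal to this chunk):
 *
 *   const errorOnDupes = { ...N, strategy: "error" };
 */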
const N = {
strategy: "keep-first",
mergeFields: {
definitions: !0,
examples: !0,
relations: !0,
forms: !0,
pronunciations: !0,
tags: !0,
counts: !0
},
uniqueKeys: {
words: ["id"],
synsets: ["id"],
senses: ["id"]
},
logDuplicates: !1,
trackStatistics: !0
};
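/**
 * `A` applies duplicate handling to a parsed resource: `words`, `synsets`,
 * and `senses` all pass through one handler instance, so statistics
 * accumulate across the three entity types.
 *
 *   const cleaned = A(resource); // uses the defaults in `N`
 */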
function A(r, o = N) {
const s = new R(o);
return {
...r,
words: s.handleDuplicates(r.words, "words"),
synsets: s.handleDuplicates(r.synsets, "synsets"),
senses: s.handleDuplicates(r.senses, "senses")
};
}
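/**
 * `M` is a URL test: fast-path http/https/ftp, then fall back to the WHATWG
 * `URL` constructor for any other string containing "://".
 *
 *   M("https://example.org/wn.xml") // true
 *   M("./data/wn.xml")              // false
 */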
function M(r) {
if (r.startsWith("http://") || r.startsWith("https://") || r.startsWith("ftp://"))
return !0;
if (r.includes("://"))
try {
return new URL(r), !0;
} catch {
return !1;
}
return !1;
}
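/**
 * Supported WN-LMF versions (`x`), the DOCTYPE pattern (`W`), and the
 * version-to-DTD-URL map (`O`) used by the quick scan to recover a version
 * string from a file's declared schema:
 *
 *   '<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.1.dtd">'
 *     .match(W)?.[1] // DTD URL, mapped back to "1.1" via `O`
 */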
const { readFile: C, stat: _ } = b, { createReadStream: S } = v, x = /* @__PURE__ */ new Set(["1.0", "1.1", "1.2", "1.3", "1.4"]), W = /<!DOCTYPE LexicalResource SYSTEM "([^"]+)">/, O = {
"1.0": "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd",
"1.1": "http://globalwordnet.github.io/schemas/WN-LMF-1.1.dtd",
"1.2": "http://globalwordnet.github.io/schemas/WN-LMF-1.2.dtd",
"1.3": "http://globalwordnet.github.io/schemas/WN-LMF-1.3.dtd",
"1.4": "http://globalwordnet.github.io/schemas/WN-LMF-1.4.dtd"
};
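/**
 * `isLMF` (exported as `j`): sniffs the first ~1 KiB of a file for the XML
 * declaration, the LexicalResource DOCTYPE, and the root element. Resolves
 * false (rather than rejecting) on read errors.
 *
 *   const ok = await isLMF("/path/to/wordnet.xml");
 */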
async function j(r) {
return new Promise((o) => {
const s = S(r, { encoding: "utf-8", start: 0, end: 1024 });
let n = "";
s.on("data", (t) => {
n += t;
}), s.on("end", () => {
o(
n.includes('<?xml version="1.0"') && n.includes("<!DOCTYPE LexicalResource") && n.includes("<LexicalResource")
);
}), s.on("error", () => {
o(!1);
});
});
}
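/**
 * Quick scan (`H`): reads the file once to extract the LMF version from the
 * DOCTYPE (defaulting to "1.0") and to estimate the element count from
 * closing and self-closing tags. The estimate only drives progress
 * reporting, so it does not need to be exact.
 */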
async function H(r, o = !1) {
o && console.log("[DEBUG] Quick scanning file for version and element count...");
const s = await C(r, "utf-8");
let n = "1.0";
const t = s.match(W);
if (t?.[1]) {
const i = t[1];
o && console.log(`[DEBUG] Found DOCTYPE with schema: ${i}`);
let a = !1;
for (const [h, y] of Object.entries(O))
if (y === i) {
n = h, a = !0, o && console.log(`[DEBUG] Matched schema URL to supported version: ${n}`);
break;
}
if (!a) {
const h = i.match(/WN-LMF-([0-9]+\.[0-9]+)\.dtd$/);
h && h[1] && (n = h[1], o && console.log(`[DEBUG] Extracted unsupported version from schema URL: ${n}`));
}
} else
o && console.log(`[DEBUG] No DOCTYPE pattern found, using default version: ${n}`);
const f = (s.match(/<\/[^>]+>/g) || []).length + (s.match(/\/>/g) || []).length;
return o && console.log(`[DEBUG] Quick scan: version=${n}, estimated elements=${f}`), { version: n, elementCount: f };
}
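/**
 * `X` fetches LMF content over the network with the global `fetch` (built
 * into Node 18+) and turns any non-2xx status into an error.
 */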
async function X(r, o = {}) {
const { debug: s = !1 } = o;
s && console.log(`[DEBUG] Loading LMF from URL: ${r}`);
try {
const n = await fetch(r);
if (!n.ok)
throw new Error(`HTTP ${n.status}: ${n.statusText}`);
const t = await n.text();
return s && console.log(`[DEBUG] Loaded ${t.length} characters from URL`), t;
} catch (n) {
throw s && console.log("[DEBUG] URL loading failed:", n), new Error(`Failed to load LMF from URL: ${n instanceof Error ? n.message : "Unknown error"}`);
}
}
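/**
 * `loadLMF` (exported as `J`): the main entry point. URLs are fetched and
 * spooled to a temporary file so one streaming code path handles both local
 * and remote input; version and content are validated before parsing, and
 * the temp file is removed afterwards. A sketch with the options this
 * function reads (`debug`, `progress`, `duplicateHandling`):
 *
 *   const resource = await loadLMF("https://example.org/oewn.xml", {
 *     debug: false,
 *     progress: (fraction) => process.stdout.write(`\r${fraction}`),
 *     duplicateHandling: { strategy: "keep-first", uniqueKeys: { words: ["id"], synsets: ["id"], senses: ["id"] } },
 *   });
 */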
async function J(r, o = {}) {
const { debug: s = !1 } = o;
s && console.log(`[DEBUG] loadLMF() starting for: ${r}`);
try {
let n, t;
const f = M(r);
if (s && console.log(`[DEBUG] isURL(${r}) = ${f}`), f) {
s && console.log("[DEBUG] Input is URL, loading from network"), n = await X(r, o);
const p = await b.mkdtemp(G(z(), "wn-ts-lmf-"));
t = G(p, "temp.lmf"), await b.writeFile(t, n, "utf-8"), s && console.log(`[DEBUG] Created temporary file: ${t}`);
} else if (t = r, s) {
const p = await _(t), U = (p.size / (1024 * 1024)).toFixed(2);
console.log(`[DEBUG] File size: ${U} MB (${p.size.toLocaleString()} bytes)`);
}
const { version: i, elementCount: a } = await H(t, s);
if (s && console.log(`[DEBUG] Quick scan returned version: ${i}`), s && console.log(`[DEBUG] Supported versions: ${Array.from(x).join(", ")}`), s && console.log(`[DEBUG] Version ${i} supported: ${x.has(i)}`), !x.has(i))
throw s && console.log(`[DEBUG] Throwing error for unsupported version: ${i}`), new w(`Unsupported LMF version: ${i}`, "UNSUPPORTED_VERSION", { version: i });
try {
n || (n = await C(t, "utf-8")), P(n, s);
} catch (p) {
throw p instanceof w ? new Error(`Failed to load LMF file: ${p.message}`) : new Error(`Failed to load LMF file: ${p instanceof Error ? p.message : "Unknown error"}`);
}
s && console.log(`[DEBUG] Using streaming parser for version ${i}...`);
const h = Date.now(), y = await q(t, i, a, o), $ = Date.now() - h;
if (s && console.log(`[DEBUG] loadLMF() completed in ${$}ms total`), M(r) && t !== r)
try {
await b.unlink(t), await b.rmdir(V(t)), s && console.log(`[DEBUG] Cleaned up temporary file: ${t}`);
} catch (p) {
s && console.log(`[DEBUG] Failed to clean up temporary file: ${p}`);
}
return y;
} catch (n) {
s && console.log("[DEBUG] loadLMF() error:", n);
// Avoid double-prefixing errors already wrapped by the inner handler
const d = n instanceof Error ? n.message : String(n);
throw d.startsWith("Failed to load LMF file") ? n : new Error(`Failed to load LMF file: ${d}`);
}
}
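/**
 * Streaming parser (`q`): pipes the file through a strict `sax` stream,
 * building lexicons, words, synsets, and senses incrementally; senses are
 * indexed by synset id (`U`) so each synset's `memberIds`/`senseIds` can be
 * filled in on its closing tag. Text nodes are assigned to the most recently
 * opened element whose text is still empty (definition, example, tag, count,
 * ILI definition, or pronunciation), which assumes the usual LMF shape of
 * one text node per element. Progress is throttled and capped at 95% until
 * the stream ends.
 */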
async function q(r, o, s, n = {}) {
const { debug: t = !1, progress: f } = n;
return new Promise((i, a) => {
const h = [], y = [], $ = [], p = [], U = /* @__PURE__ */ new Map();
let g = null, l = null, c = null, u = null, L = null, D = 0, F = 0;
const k = I.createStream(!0, {
trim: !0,
normalize: !0,
lowercase: !0,
position: !1,
xmlns: !1
}), T = () => {
if (f) {
const d = s < 100 ? 2 : 1e3;
if (D - F >= d) {
const e = Math.min(D / s, 0.95);
t && console.log(`[DEBUG] Progress update: ${D}/${s} = ${e}`), f(e), F = D;
}
}
};
k.on("opentag", (d) => {
D++, T();
const { attributes: e } = d, m = d.name.toLowerCase();
switch (t && (D % 1e4 === 0 || ["lexicalresource", "lexicon", "lexicalentry"].includes(m)) && console.log(`[DEBUG] Processing tag: ${m} (element #${D})`), t && ["synset", "lexicalentry"].includes(m) && D % 5e3 === 0 && console.log(`[DEBUG] Processing ${m} #${D} - current progress: ${Math.round(D / s * 100)}%`), m) {
case "lexicalresource":
t && console.log(`[DEBUG] Starting to parse LexicalResource (version: ${o})`);
break;
case "lexicon":
g = {
id: e.id || "",
label: e.label || "",
language: e.language || "en",
version: e.version || "1.0",
email: e.email || "",
license: e.license || "",
url: e.url,
citation: e.citation,
logo: e.logo,
entries: [],
synsets: [],
frames: []
}, t && console.log(`[DEBUG] Processing lexicon: ${g.id}`);
break;
case "lexiconextension":
g = {
id: e.id || "",
label: e.label || "",
language: e.language || "en",
version: e.version || "1.0",
email: e.email || "",
license: e.license || "",
url: e.url,
citation: e.citation,
logo: e.logo,
entries: [],
synsets: [],
frames: []
}, t && console.log(`[DEBUG] Processing lexicon extension: ${g.id}`);
break;
case "lexicalentry":
l = {
id: e.id || "unknown-word",
lemma: "unknown",
partOfSpeech: "n",
language: g?.language || "en",
lexicon: g?.id || "unknown",
forms: [],
tags: [],
pronunciations: [],
counts: [],
senses: [],
frames: []
}, t && console.log(`[DEBUG] Created word: ${l.id}`);
break;
case "lemma":
l && (l.lemma = e.writtenform || e.writtenForm || "unknown", l.pos = e.partofspeech || e.partOfSpeech || "n", t && console.log(`[DEBUG] Set lemma for word ${l.id}: ${l.lemma} (${l.pos})`));
break;
case "form":
l && l.forms.push({
id: e.id || "",
writtenForm: e.writtenform || e.writtenForm || "",
script: e.script || "",
tag: e.tag || ""
});
break;
case "sense":
u = {
id: e.id || "unknown-sense",
wordId: l?.id || "unknown",
synsetId: e.synset || "unknown-synset",
counts: [],
examples: [],
tags: []
};
break;
case "synset":
c = {
id: e.id || "unknown-synset",
pos: e.pos || e.partofspeech || e.partOfSpeech || "n",
definitions: [],
examples: [],
relations: [],
language: g?.language || "en",
lexicon: g?.id || "unknown",
memberIds: [],
senseIds: []
}, e.ili && (c.ili = e.ili);
break;
case "definition":
if (c) {
const E = e.id || `${c.id}-def-${c.definitions.length + 1}`;
c.definitions.push({
id: E,
language: e.language || g?.language || "en",
text: "",
source: e.source || ""
});
}
break;
case "synsetrelation":
c && c.relations.push({
id: e.id || "",
type: e.reltype || e.relType || e.type || "unknown",
target: e.target || "",
source: e.source || ""
});
break;
case "tag":
l ? l.tags.push({
id: e.id || "",
category: e.category || "",
value: ""
}) : u && u.tags.push({
id: e.id || "",
category: e.category || "",
value: ""
});
break;
case "count":
u && u.counts.push({
id: e.id || "",
value: 0,
writtenForm: "",
pos: "n"
});
break;
case "pronunciation":
if (l?.forms.length) {
const E = l.forms[l.forms.length - 1];
E && (E.pronunciations || (E.pronunciations = []), E.pronunciations.push({
id: e.id || "",
variety: e.variety || "",
text: "",
source: e.source || ""
}));
}
break;
case "syntacticbehaviour":
l && l.frames.push({
id: e.id || "",
subcategorizationFrame: e.subcategorizationframe || e.subcategorizationFrame || "",
source: e.source || "",
senses: e.senses || ""
});
break;
case "senserelation":
u && (u.relations || (u.relations = []), u.relations.push({
id: e.id || "",
type: e.reltype || e.relType || e.type || "unknown",
target: e.target || "",
dcType: e.dctype || e.dc_type || ""
}));
break;
case "ilidefinition":
c && (c.iliDefinitions || (c.iliDefinitions = []), c.iliDefinitions.push({
id: e.id || "",
text: ""
}));
break;
case "example":
L = {
id: e.id || "",
language: e.language || g?.language || "en",
text: "",
source: e.source || ""
};
break;
}
}), k.on("text", (d) => {
if (c?.definitions.length) {
const e = c.definitions[c.definitions.length - 1];
e && e.text === "" && (e.text = d.trim());
}
if (L && L.text === "" && (L.text = d.trim()), c && c.iliDefinitions?.length) {
const e = c.iliDefinitions[c.iliDefinitions.length - 1];
e && e.text === "" && (e.text = d.trim());
}
if (l?.tags.length) {
const e = l.tags[l.tags.length - 1];
e && e.value === "" && (e.value = d.trim());
}
if (u?.tags.length) {
const e = u.tags[u.tags.length - 1];
e && e.value === "" && (e.value = d.trim());
}
if (u?.counts.length) {
const e = u.counts[u.counts.length - 1];
if (e && e.writtenForm === "") {
e.writtenForm = d.trim();
const m = parseInt(d.trim(), 10);
isNaN(m) || (e.value = m);
}
}
if (l?.forms.length) {
const e = l.forms[l.forms.length - 1];
if (e?.pronunciations?.length) {
const m = e.pronunciations[e.pronunciations.length - 1];
m && m.text === "" && (m.text = d.trim());
}
}
}), k.on("closetag", (d) => {
switch (d.toLowerCase()) {
case "lexicalentry":
l && g && (g.entries.push(l), $.push(l), t && console.log(`[DEBUG] Added entry to lexicon: ${l.id}`), l = null);
break;
case "sense":
u && l && (l.senses.push(u), p.push(u), U.has(u.synsetId) || U.set(u.synsetId, []), U.get(u.synsetId).push(u), t && console.log(`[DEBUG] Added sense to entry: ${u.id}`), u = null);
break;
case "synset":
if (c && g) {
const m = U.get(c.id) || [];
c.senseIds = m.map((E) => E.id), c.memberIds = m.map((E) => E.wordId), g.synsets.push(c), y.push(c), t && console.log(`[DEBUG] Added synset to lexicon: ${c.id}`), c = null;
}
break;
case "lexicon":
g && (h.push(g), t && console.log(`[DEBUG] Added lexicon: ${g.id}`), g = null);
break;
case "lexiconextension":
g && (h.push(g), t && console.log(`[DEBUG] Added lexicon extension: ${g.id}`), g = null);
break;
case "lexicalresource":
break;
case "example":
L && (c ? c.examples.push(L) : u && u.examples.push(L), L = null);
break;
}
}), k.on("end", () => {
t && console.log("[DEBUG] Stream ended, completing parsing"), f && f(1);
let d = {
lmfVersion: o,
lexicons: h.map((e) => ({
id: e.id,
label: e.label,
language: e.language,
email: e.email,
license: e.license,
version: e.version,
url: e.url,
citation: e.citation,
logo: e.logo,
requires: e.requires,
metadata: e.metadata
})),
synsets: y,
words: $,
senses: p
};
if (n.duplicateHandling && n.duplicateHandling.strategy !== "skip") {
t && (console.log(`[DEBUG] Applying duplicate handling with strategy: ${n.duplicateHandling.strategy}`), console.log(`[DEBUG] Before deduplication - words: ${$.length}, synsets: ${y.length}, senses: ${p.length}`));
try {
d = A(d, n.duplicateHandling), t && console.log(`[DEBUG] After deduplication - words: ${d.words.length}, synsets: ${d.synsets.length}, senses: ${d.senses.length}`);
} catch (e) {
if (e instanceof w) {
a(e);
return;
}
a(new w(
`Duplicate handling failed: ${e instanceof Error ? e.message : "Unknown error"}`,
"DUPLICATE_HANDLING_FAILED",
{ originalError: e }
));
return;
}
}
i(d);
}), k.on("error", (d) => {
t && console.log("[DEBUG] Parser error:", d), a(new Error(`XML parsing error: ${d.message}`));
});
const B = S(r, { encoding: "utf8" });
t && console.log(`[DEBUG] Created read stream for ${r}`), B.pipe(k), B.on("error", (d) => {
t && console.log("[DEBUG] Stream error:", d), a(new Error(`File stream error: ${d.message}`));
});
});
}
export {
j as isLMF,
J as loadLMF
};