romanize-string
Version:
A fully typed, general-purpose utility for unidirectional string transliteration (non-Latin script => Latin script).
27 lines (26 loc) • 1.34 kB
JavaScript
import { normalize as koreanNormalize, tokenize as koreanTokenize, } from "oktjs";
import { romanize } from "../vendor/romanize/korean/src/index.js";
/**
 * Romanize a Korean string into Latin script.
 *
 * The input is normalized and tokenized with `oktjs`, every token is
 * romanized on its own, and a token tagged as a josa (Korean case marker)
 * is attached to the token before it with a hyphen.
 *
 * @param {string} string - Text to romanize.
 * @returns {string} The romanized tokens separated by single spaces, with no
 *   whitespace left in front of punctuation marks.
 */
export const romanizeKorean = (string) => {
// Normalize and tokenize, dropping tokens that consist only of white space.
const tokens = koreanTokenize(koreanNormalize(string)).filter((token) => token.text.trim());
// Romanize token by token. A josa is hyphenated onto the previous entry;
// a josa with nothing before it is kept as a standalone word.
const words = [];
for (const { text, pos } of tokens) {
const romanized = romanize(text);
if (pos === "Josa" && words.length > 0) {
words[words.length - 1] += `-${romanized}`;
}
else {
words.push(romanized);
}
}
// Collapse any white space runs that crept in during romanization, then pull
// punctuation back against the preceding word. The class covers ASCII
// punctuation, fullwidth "!" and "?" (written as \uFF01 and \uFF1F so they
// cannot be confused with, or normalized into, their ASCII look-alikes) and
// the CJK full stop and comma.
return words
.join(" ")
.replace(/\s+/g, " ")
.replace(/\s+([.,!?\uFF01\uFF1F。、])/g, "$1");
};