UNPKG

hanzi-tools

Version:

Converts from Chinese characters to pinyin, between simplified and traditional, and does word segmentation.

27 lines (24 loc) 732 B
const nodejieba = require("nodejieba");
const { pinyinDict } = require("./pinyinDict");

const hasOwn = Object.prototype.hasOwnProperty;

/**
 * Tells whether `word` is a key defined directly on `pinyinDict`.
 *
 * An own-property check is used instead of the `in` operator so that names
 * inherited from Object.prototype ("constructor", "toString", ...) are not
 * mistaken for dictionary entries.
 *
 * @param {string} word - Candidate dictionary key.
 * @returns {boolean} True when `word` is an own key of `pinyinDict`.
 */
function isDictEntry(word) {
  return hasOwn.call(pinyinDict, word);
}

/**
 * Returns the longest prefix of `text` that is a key of `pinyinDict`,
 * trying the whole string first and then ever shorter prefixes.
 *
 * @param {string} text - Non-empty string to inspect.
 * @returns {string} The matching prefix, or "" when no prefix is a key.
 */
function longestDictPrefix(text) {
  for (let end = text.length; end > 0; end--) {
    const candidate = text.slice(0, end);
    if (isDictEntry(candidate)) {
      return candidate;
    }
  }
  return "";
}

/**
 * Returns the first full Unicode code point of `text` as a string.
 *
 * Using `text[0]` would cut a supplementary-plane character in half and emit
 * a lone surrogate; `codePointAt` keeps the surrogate pair together.
 *
 * @param {string} text - Non-empty string.
 * @returns {string} One character (one or two UTF-16 code units long).
 */
function firstCodePoint(text) {
  return String.fromCodePoint(text.codePointAt(0));
}

/**
 * Splits `text` into segments.
 *
 * The text is first passed to `nodejieba.cut`; each piece it yields is then
 * refined with `segmentPart`, and all resulting parts are returned in order
 * as one flat array.
 *
 * @param {string} text - Text to segment (handed to `nodejieba.cut` as-is).
 * @returns {string[]} The segments, in their original order.
 */
function segment(text) {
  const segments = [];
  nodejieba.cut(text).forEach((token) => {
    // Append in place; re-concatenating the accumulator for every token
    // copies it each time and makes the loop quadratic.
    segments.push(...segmentPart(token));
  });
  return segments;
}

/**
 * Greedily breaks `text` into pieces that are keys of `pinyinDict`.
 *
 * At each position the longest prefix found in the dictionary is taken
 * (the whole remaining text is tried first). When no prefix matches, a
 * single character is emitted on its own and matching resumes after it.
 *
 * @param {string} text - One piece of text to refine.
 * @returns {string[]} Pieces whose concatenation equals `text`; empty for "".
 */
function segmentPart(text) {
  const parts = [];
  let rest = text;
  while (rest.length > 0) {
    const head = longestDictPrefix(rest) || firstCodePoint(rest);
    parts.push(head);
    rest = rest.slice(head.length);
  }
  return parts;
}

module.exports = segment;