ts-textrank
Version:
Typescript TextRank implementation
80 lines (79 loc) • 2.45 kB
JavaScript
;
// CommonJS/ES-module interop helper: reuse an already-installed helper when
// `this` provides one; otherwise fall back to the local implementation, which
// returns ES-module namespaces untouched and wraps anything else as
// `{ default: value }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const stopword_1 = __importDefault(require("stopword"));
const Sentence_1 = __importDefault(require("./Sentence"));
const Word_1 = __importDefault(require("./Word"));
const Text_1 = __importDefault(require("./Text"));
/**
 * Default text parser: splits raw text into sentences, tokenizes every
 * sentence into words and removes the stopwords of the requested language.
 */
class DefaultTextParser {
    /**
     * Parses a raw text into a Text model made of unique sentences.
     *
     * Sentences are de-duplicated by their normalized form: when two
     * sentences normalize to the same value, the later one replaces the
     * earlier one while keeping the earlier one's place in the result.
     * Positions are 1-based and count every non-empty sentence, including
     * the ones that end up being replaced.
     *
     * @param text the raw text to parse
     * @param language key used to look up the stopword list
     * @returns the Text model built from the raw text and its sentences
     */
    parse(text, language) {
        //Strategy: split text by sentence boundaries:
        //- a dot followed by anything but a word character
        //- \n,¡!¿? in any position
        const rxSep = /\.[^\w]|[\n¡!\?¿]/g;
        // A Map (rather than a plain object) keeps insertion order for every
        // key and cannot clash with Object.prototype members such as
        // "__proto__" or "constructor".
        const sentences = new Map();
        let position = 1;
        for (const part of text.split(rxSep)) {
            const rawSentence = part.trim();
            if (rawSentence === "") {
                continue;
            }
            const sent = this.buildSentence(rawSentence, language, position);
            // String() mirrors the key coercion a plain object would apply.
            sentences.set(String(sent.getNormalized()), sent);
            position++;
        }
        return new Text_1.default(text, [...sentences.values()]);
    }
    /**
     * Builds the model of a single sentence: tokenizes the raw string, wraps
     * each token in a Word and drops the stopwords.
     *
     * @param raw the trimmed raw sentence
     * @param language key used to look up the stopword list
     * @param position 1-based position of the sentence within the text
     * @returns the sentence model
     */
    buildSentence(raw, language, position) {
        const words = this.tokenize(raw).map(token => new Word_1.default(token));
        return new Sentence_1.default(raw, this.filter(words, language), position);
    }
    /**
     * Removes stopwords. A word is dropped when either its `normalized` or
     * its `word` value appears in the stopword list of the language.
     *
     * When no stopword list exists for `language`, nothing is filtered and a
     * copy of the input is returned instead of throwing.
     *
     * @param words the words to filter
     * @param language key used to look up the stopword list
     * @returns the filtered list (always a new array)
     */
    filter(words, language) {
        const list = stopword_1.default[language];
        if (!Array.isArray(list) || list.length === 0) {
            return [...words];
        }
        // Set lookups avoid two linear scans of the stopword list per word.
        const stop = new Set(list);
        return words.filter(w => !stop.has(w.normalized) && !stop.has(w.word));
    }
    /**
     * Splits a sentence into tokens. Every character that is neither a
     * Unicode letter nor whitespace (digits and punctuation included) is
     * removed before splitting on single spaces.
     *
     * @param sentence the raw sentence
     * @returns the list of tokens of a sentence; empty when it has no letters
     */
    tokenize(sentence) {
        const cleaned = sentence
            //remove everything that is not a letter or whitespace
            .replace(/[^\p{L}\s]/gu, "")
            //collapse multiple spaces
            .replace(/\s+/g, " ")
            //remove leading and trailing spaces
            .trim();
        // "".split(" ") yields [""], which would become a bogus empty word.
        return cleaned === "" ? [] : cleaned.split(" ");
    }
}
// Expose the parser class as this module's default export.
exports.default = DefaultTextParser;