@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, Porter stemming, vector encodings, similarities
61 lines (60 loc) • 1.69 kB
JavaScript
import { mapcat } from "@thi.ng/transducers/mapcat";
/**
 * Default English contraction expansions.
 *
 * Each key is a lowercase contraction and each value is the list of words
 * which replaces it. Ambiguous suffixes are resolved to a single reading:
 * `'d` is always expanded to "would" (never "had") and `'s` always to "is"
 * (never "has" or a possessive).
 */
const DEFAULT_REPLACEMENTS_EN = {
  "can't": ["can", "not"],
  "couldn't": ["could", "not"],
  "didn't": ["did", "not"],
  "doesn't": ["does", "not"],
  "don't": ["do", "not"],
  "hadn't": ["had", "not"],
  "hasn't": ["has", "not"],
  // "haven't" negates "have" (not "has", which is covered by "hasn't")
  "haven't": ["have", "not"],
  "he'd": ["he", "would"],
  "he'll": ["he", "will"],
  "he's": ["he", "is"],
  "how'd": ["how", "would"],
  "how're": ["how", "are"],
  "how's": ["how", "is"],
  "i'd": ["i", "would"],
  "i'll": ["i", "will"],
  "i'm": ["i", "am"],
  "i've": ["i", "have"],
  "isn't": ["is", "not"],
  "it'd": ["it", "would"],
  "it'll": ["it", "will"],
  "it's": ["it", "is"],
  "she'd": ["she", "would"],
  // `'ll` expands to "will", consistent with all other `'ll` entries
  "she'll": ["she", "will"],
  "she's": ["she", "is"],
  // key must not carry a trailing apostrophe, otherwise it never matches
  "should've": ["should", "have"],
  "shouldn't": ["should", "not"],
  "they'd": ["they", "would"],
  "they'll": ["they", "will"],
  "they're": ["they", "are"],
  "they've": ["they", "have"],
  "wasn't": ["was", "not"],
  "we'd": ["we", "would"],
  "we'll": ["we", "will"],
  "we're": ["we", "are"],
  "we've": ["we", "have"],
  "weren't": ["were", "not"],
  "what'd": ["what", "would"],
  "what're": ["what", "are"],
  "what's": ["what", "is"],
  "who'd": ["who", "would"],
  "who're": ["who", "are"],
  "who's": ["who", "is"],
  "why'd": ["why", "would"],
  "why're": ["why", "are"],
  "why's": ["why", "is"],
  "won't": ["will", "not"],
  "wouldn't": ["would", "not"],
  "you'd": ["you", "would"],
  "you'll": ["you", "will"],
  "you're": ["you", "are"],
  "you've": ["you", "have"]
};
/**
 * Builds a `mapcat` step which expands tokens using a replacement dictionary.
 *
 * Every input token that is an own key of `dict` is replaced by the words
 * listed for it; any other token is passed through unchanged (wrapped in a
 * single-element array so `mapcat` emits it as is).
 *
 * @param dict - object mapping a token to an array of replacement words.
 * Defaults to `DEFAULT_REPLACEMENTS_EN`.
 * @returns the value returned by `mapcat` for the replacement function
 */
const replaceWith = (dict = DEFAULT_REPLACEMENTS_EN) => {
  const hasOwn = Object.prototype.hasOwnProperty;
  // Only own keys count as replacements: a plain `dict[x]` lookup would also
  // find inherited members, so tokens like "constructor" or "toString" would
  // resolve to functions from Object.prototype instead of passing through.
  return mapcat((x) => (hasOwn.call(dict, x) ? dict[x] ?? [x] : [x]));
};
export {
DEFAULT_REPLACEMENTS_EN,
replaceWith
};