// compromise: modest natural language processing
export as namespace nlp
declare interface nlp<D extends object, W extends object> {
/** normal usage */
(text: string): nlp.ExtendedDocument<D, W>
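// Illustrative sketch (based on compromise's documented usage; exact output may vary by version):
//   const doc = nlp('She sells seashells by the seashore.')
//   doc.verbs().toPastTense()
//   doc.text()   // → 'She sold seashells by the seashore.'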
/** tokenize string */
tokenize(text: string): nlp.ExtendedDocument<D, W>
/** mix in a compromise-plugin */
extend<P>(
plugin: P
): nlp<
P extends nlp.Plugin<infer PD, infer PW> ? { [k in keyof (PD & D)]: (PD & D)[k] } : { [k in keyof D]: D[k] },
P extends nlp.Plugin<infer PD, infer PW> ? { [k in keyof (PW & W)]: (PW & W)[k] } : { [k in keyof W]: W[k] }
>
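// Illustrative sketch: mixing in a plugin. Assumes the separate 'compromise-numbers'
// package is installed; its plugin adds a .numbers() method to returned documents.
//   import numbers from 'compromise-numbers'
//   const nlpEx = nlp.extend(numbers)
//   const doc = nlpEx('ninety five thousand and fifty two')
//   doc.numbers().toNumber()
//   doc.text()   // → '95052'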
/** re-generate a Doc object from .json() results */
load(json: any): nlp.ExtendedDocument<D, W>
/** log our decision-making for debugging */
verbose(bool: boolean): nlp.ExtendedDocument<D, W>
/** current semver version of the library */
version: string
}
declare function nlp(text: string): nlp.DefaultDocument
declare function nlp<D extends object, W extends object>(text: string): nlp.ExtendedDocument<D, W>
// possible values to .json()
declare interface JsonOptions {
/** a perfect copy of the input text */
text?: boolean
/** normalized whitespace, case, unicode, punctuation */
normal?: boolean
/** lowercase, trimmed, contractions expanded. */
reduced?: boolean
/** cleanup whitespace */
trim?: boolean
/** character-position where this begins */
offset?: boolean
/** frequency of this match in the document */
count?: boolean
/** remove duplicate results */
unique?: boolean
/** starting term # in document */
index?: boolean
/** options for each term */
terms?: {
text?: boolean
normal?: boolean
clean?: boolean
implicit?: boolean
tags?: boolean
whitespace?: boolean
id?: boolean
offset?: boolean
bestTag?: boolean
}
}
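// Illustrative sketch: selecting .json() fields with the options above (field names
// follow the JsonOptions interface; output shape may vary by version).
//   nlp('Tony Hawk skated').json({ text: true, offset: true, terms: { tags: true } })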
// Constructor
declare module nlp {
export function tokenize(text: string): DefaultDocument
/** mix in a compromise-plugin */
export function extend<P>(
plugin: P
): nlp<P extends Plugin<infer D, infer W> ? D : {}, P extends Plugin<infer D, infer W> ? W : {}>
/** re-generate a Doc object from .json() results */
export function load(json: any): DefaultDocument
/** log our decision-making for debugging */
export function verbose(bool: boolean): DefaultDocument
/** current semver version of the library */
export const version: string
type Plugin<D extends object, W extends object> = (
Doc: Document<World & W> & D & { prototype: D },
world: World & W
) => void
type ExtendedWorld<W extends object> = nlp.World & W
type ExtendedDocument<D extends object, W extends object> = {
[k in keyof (nlp.Document<ExtendedWorld<W>> & D)]: (nlp.Document<ExtendedWorld<W>> & D)[k]
}
type DefaultDocument = {
[k in keyof nlp.Document]: nlp.Document[k]
}
class Document<W extends World = World> {
// Utils
/** return the whole original document ('zoom out') */
all(): Document<W>
/** is this document empty? */
found: boolean
/** return the previous result */
parent(): Document<W>
/** return all of the previous results */
parents(): Document<W>[]
/** (re)run the part-of-speech tagger on this document */
tagger(): Document<W>
/** count the # of terms in each match */
wordCount(): number
/** count the # of characters of each match */
length(): number
/** deep-copy the document, so that no references remain */
clone(shallow?: boolean): Document<W>
/** freeze the current state of the document, for speed-purposes */
cache(options?: object): Document<W>
/** un-freezes the current state of the document, so it may be transformed */
uncache(options?: object): Document<W>
/** the current world */
world: W
// Accessors
/** use only the first result(s) */
first(n?: number): Document<W>
/** use only the last result(s) */
last(n?: number): Document<W>
/** grab a subset of the results */
slice(start: number, end?: number): Document<W>
/** use only the nth result */
eq(n: number): Document<W>
/** get the first word in each match */
firstTerm(): Document<W>
/** get the end word in each match */
lastTerm(): Document<W>
/** return a flat list of all Term objects in match */
termList(): any
// Match
/** return a new Doc, with this one as a parent */
match(match: string | Document<W>): Document<W>
/** return all results except for this */
not(match: string | Document<W>): Document<W>
/** return only the first match */
matchOne(match: string | Document<W>): Document<W>
/** return each current phrase, only if it contains this match */
if(match: string | Document<W>): Document<W>
/** Filter-out any current phrases that have this match */
ifNo(match: string | Document<W>): Document<W>
/** Return a boolean if this match exists */
has(match: string | Document<W>): boolean
/** search through earlier terms, in the sentence */
lookBehind(match: string | Document<W>): Document<W>
/** search through following terms, in the sentence */
lookAhead(match: string | Document<W>): Document<W>
/** return the terms before each match */
before(match: string | Document<W>): Document<W>
/** return the terms after each match */
after(match: string | Document<W>): Document<W>
/** quick find for an array of string matches */
lookup(matches: string[]): Document<W>
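// Illustrative sketch: match-syntax queries over a document (results shown are approximate).
//   const doc = nlp('the dog barked at the mailman')
//   doc.has('barked')                    // → true
//   doc.match('the #Noun').out('array')  // → e.g. ['the dog', 'the mailman']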
// Case
/** turn every letter of every term to lower-case */
toLowerCase(): Document<W>
/** turn every letter of every term to upper case */
toUpperCase(): Document<W>
/** upper-case the first letter of each term */
toTitleCase(): Document<W>
/** remove whitespace and title-case each term */
toCamelCase(): Document<W>
// Whitespace
/** add this punctuation or whitespace before each match */
pre(str: string, concat: boolean): Document<W>
/** add this punctuation or whitespace after each match */
post(str: string, concat: boolean): Document<W>
/** remove start and end whitespace */
trim(): Document<W>
/** connect words with hyphen, and remove whitespace */
hyphenate(): Document<W>
/** remove hyphens between words, and set whitespace */
dehyphenate(): Document<W>
// Tag
/** Give all terms the given tag */
tag(tag: string, reason?: string): Document<W>
/** Only apply tag to terms if it is consistent with current tags */
tagSafe(tag: string, reason?: string): Document<W>
/** Remove this tag from the matched terms */
unTag(tag: string, reason?: string): Document<W>
/** return only the terms that can be this tag */
canBe(tag: string): Document<W>
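// Illustrative sketch: tagging terms manually, then selecting by tag (output is approximate).
//   const doc = nlp('Michael Jordan sued his landlord')
//   doc.match('michael jordan').tag('Person')
//   doc.people().text()   // → 'Michael Jordan'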
// Loops
/** run each phrase through a function, and create a new document */
map(fn: Function): Document<W> | []
/** run a function on each phrase, as an individual document */
forEach(fn: Function): Document<W>
/** return only the phrases that return true */
filter(fn: Function): Document<W>
/** return a document with only the first phrase that matches */
find(fn: Function): Document<W> | undefined
/** return true or false if there is one matching phrase */
some(fn: Function): boolean
/** sample a subset of the results */
random(n?: number): Document<W>
// Insert
/** substitute-in new content */
replaceWith(text: string | Function, keepTags?: boolean | object, keepCase?: boolean): Document<W>
/** search and replace match with new content */
replace(match: string, text?: string | Function, keepTags?: boolean | object, keepCase?: boolean): Document<W>
/** fully remove these terms from the document */
delete(match: string): Document<W>
/** add these new terms to the end (insertAfter) */
append(text: string): Document<W>
/** add these new terms to the front (insertBefore) */
prepend(text: string): Document<W>
/** add these new things to the end */
concat(text: string): Document<W>
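// Illustrative sketch: search-and-replace, mutating the document in place (output is approximate).
//   const doc = nlp('it was the worst of times')
//   doc.replace('worst', 'blurst')
//   doc.text()   // → 'it was the blurst of times'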
// transform
/**re-arrange the order of the matches (in place) */
sort(method?: string | Function): Document<W>
/**reverse the order of the matches, but not the words */
reverse(): Document<W>
/** clean-up the document, in various ways */
normalize(options?: string | object): string
/** remove any duplicate matches */
unique(): Document<W>
/** return a Document with three parts for every match ('splitOn') */
split(match?: string): Document<W>
/** separate everything after the match as a new phrase */
splitBefore(match?: string): Document<W>
/** separate everything before the word, as a new phrase */
splitAfter(match?: string): Document<W>
/** split a document into labeled sections */
segment(regs: object, options?: object): Document<W>
/** make all phrases into one phrase */
join(str?: string): Document<W>
// Output
/** return the document as text */
text(options?: string | object): string
/** pull out desired metadata from the document */
json(options?: JsonOptions | string): any
/** some named output formats */
out(format?: 'text' | 'normal' | 'offset' | 'terms'): string
out(format: 'array'): string[]
out(format: 'tags' | 'terms'): Array<{ normal: string; text: string; tags: string[] }>
out(format: 'json'): Array<{ normal: string; text: string; tags: () => void }>[]
out(format: 'debug'): Text
out(format: 'topk'): Array<{ normal: string; count: number; percent: number }>
/** pretty-print the current document and its tags */
debug(): Document<W>
/** store a parsed document for later use */
export(): any
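// Illustrative sketch: common output formats (shapes follow the declarations above).
//   const doc = nlp('Toronto is fun. Monster trucks are fun.')
//   doc.text()         // the document as a plain string
//   doc.json()         // per-phrase metadata objects
//   doc.out('array')   // one string per phrase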
// Selections
/** split-up results by each individual term */
terms(n?: number): Document<W>
/** split-up results into multi-term phrases */
clauses(n?: number): Document<W>
/** return all terms connected with a hyphen or dash like `'wash-out'`*/
hyphenated(n?: number): Document<W>
/** add quotation marks around each match */
toQuoations(start?: string, end?: string): Document<W>
/** add brackets around each match */
toParentheses(start?: string, end?: string): Document<W>
/** return things like `'(939) 555-0113'` */
phoneNumbers(n?: number): Document<W>
/** return things like `'#nlp'` */
hashTags(n?: number): Document<W>
/** return things like `'hi@compromise.cool'` */
emails(n?: number): Document<W>
/** return things like `:)` */
emoticons(n?: number): Document<W>
/** return things like `💋` */
emoji(n?: number): Document<W>
/** return things like `'@nlp_compromise'`*/
atMentions(n?: number): Document<W>
/** return things like `'compromise.cool'` */
urls(n?: number): Document<W>
/** return things like `'quickly'` */
adverbs(n?: number): Document<W>
/** return things like `'he'` */
pronouns(n?: number): Document<W>
/** return things like `'but'`*/
conjunctions(n?: number): Document<W>
/** return things like `'of'`*/
prepositions(n?: number): Document<W>
/** return person names like `'John A. Smith'`*/
people(n?: number): Document<W>
/** return location names like `'Paris, France'`*/
places(n?: number): Document<W>
/** return companies and org names like `'Google Inc.'`*/
organizations(n?: number): Document<W>
/** return people, places, and organizations */
topics(n?: number): Document<W>
// Subsets
/** alias for .all(), until plugin overloading */
sentences(): Document<W>
/** return things like `'Mrs.'`*/
abbreviations(n?: number): Abbreviations<W>
/** return any multi-word terms, like "didn't" */
contractions(n?: number): Contractions<W>
/** contract words that can combine, like "did not" */
contract(): Document<W>
/** return anything inside (parentheses) */
parentheses(n?: number): Parentheses<W>
/** return things like "Spencer's" */
possessives(n?: number): Possessives<W>
/** return any terms inside 'quotation marks' */
quotations(n?: number): Quotations<W>
/** return things like `'FBI'` */
acronyms(n?: number): Acronyms<W>
/** return things like `'eats, shoots, and leaves'` */
lists(n?: number): Lists<W>
/** return any consecutive terms tagged as a Noun */
nouns(n?: number): Nouns<W>
/** return any consecutive terms tagged as a Verb */
verbs(n?: number): Verbs<W>
}
// Nouns class
interface Nouns<W extends World = World> extends ExtendedDocument<{}, W> {
/** get any adjectives describing this noun*/
adjectives(): Document<W>
/** return only plural nouns */
isPlural(): Document<W>
/** return only nouns that _can be_ inflected as plural */
hasPlural(): Document<W>
/** 'football captain' → 'football captains' */
toPlural(setArticle?: boolean): Document<W>
/** 'turnovers' → 'turnover' */
toSingular(setArticle?: boolean): Document<W>
/** add a `'s` to the end, in a safe manner. */
toPossessive(): Document<W>
}
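// Illustrative sketch: noun inflection via .nouns(), mutating in place (output is approximate).
//   const doc = nlp('dinosaur')
//   doc.nouns().toPlural()
//   doc.text()   // → 'dinosaurs'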
// Verbs class
interface Verbs<W extends World = World> extends Document<W> {
/** return the adverbs describing this verb */
adverbs(): Document<W>
/** return only plural verbs */
isPlural(): Document<W>
/** return only singular verbs */
isSingular(): Document<W>
/** return all forms of these verbs */
conjugate(): Document<W>
/** 'will go' → 'went' */
toPastTense(): Document<W>
/** 'walked' → 'walks' */
toPresentTense(): Document<W>
/** 'walked' → 'will walk' */
toFutureTense(): Document<W>
/** 'walks' → 'walk' */
toInfinitive(): Document<W>
/** 'walks' → 'walking' */
toGerund(): Document<W>
/** return verbs with 'not' */
isNegative(): Document<W>
/** only verbs without 'not'*/
isPositive(): Document<W>
/** 'went' → 'did not go'*/
toNegative(): Document<W>
/** "didn't study" → 'studied' */
toPositive(): Document<W>
}
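// Illustrative sketch: verb conjugation via .verbs(), mutating in place (output is approximate).
//   const doc = nlp('she walked to work')
//   doc.verbs().toNegative()
//   doc.text()   // → 'she did not walk to work'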
interface Abbreviations<W extends World = World> extends Document<W> {
/** remove the period from each abbreviation: 'Mrs.' → 'Mrs' */
stripPeriods(): Document<W>
/** add a period to each abbreviation: 'Mrs' → 'Mrs.' */
addPeriods(): Document<W>
}
interface Acronyms<W extends World = World> extends Document<W> {
/** remove the periods from each acronym: 'F.B.I.' → 'FBI' */
stripPeriods(): Document<W>
/** add periods to each acronym: 'FBI' → 'F.B.I.' */
addPeriods(): Document<W>
}
interface Contractions<W extends World = World> extends Document<W> {
/** expand each contraction into its full form: "didn't" → 'did not' */
expand(): Document<W>
}
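// Illustrative sketch: expanding contractions in place (output is approximate).
//   const doc = nlp("we didn't start the fire")
//   doc.contractions().expand()
//   doc.text()   // → 'we did not start the fire'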
interface Parentheses<W extends World = World> extends Document<W> {
/** remove the surrounding parentheses from each match */
unwrap(): Document<W>
}
interface Possessives<W extends World = World> extends Document<W> {
/** remove the possessive ending: "Spencer's" → 'Spencer' */
strip(): Document<W>
}
interface Quotations<W extends World = World> extends Document<W> {
/** remove the surrounding quotation marks from each match */
unwrap(): Document<W>
}
interface Lists<W extends World = World> extends Document<W> {
/** return the conjunctions ('and'/'or') joining each list */
conjunctions(): Document<W>
/** return each list's items along with their separators */
parts(): Document<W>
/** return the items of each list */
items(): Document<W>
/** add an item to each list */
add(): Document<W>
/** remove items from each list */
remove(): Document<W>
/** return only the lists that use an Oxford comma */
hasOxfordComma(): Document<W>
}
class World {}
}
export default nlp