/*
 * @readium/shared
 * Version:
 * Shared models to be used across other Readium projects and implementations in TypeScript.
 * 40 lines (39 loc) • 1.41 kB
 * TypeScript
 */
import { Language } from "../Language";
import { Tokenizer } from "./Tokenizer";
/**
* A pair of numbers describing one token produced by a [TextTokenizer].
* NOTE(review): presumably `[start, end]` offsets into the tokenized string;
* confirm against the implementation, including whether the second value is exclusive.
*/
export type Range = [number, number];
/**
* A tokenizer splitting a string into range tokens (e.g. words, sentences, etc.).
*
* This is the generic [Tokenizer] specialised with `string` as its first type
* argument and [Range] as its second, i.e. text goes in and ranges come out.
*/
export type TextTokenizer = Tokenizer<string, Range>;
/**
* A text token unit which can be used with a [TextTokenizer].
*
* String enum: each member's runtime value is the lowercase name of the unit.
* It is passed to the tokenizer constructors and to the default tokenizer factory
* to select the granularity of the produced tokens.
*/
export declare enum TextUnit {
/** Word-sized tokens. */
Word = "word",
/** Sentence-sized tokens. */
Sentence = "sentence",
/** Paragraph-sized tokens. */
Paragraph = "paragraph"
}
/**
* Factory function returning a [TextTokenizer] for the given language and unit.
*
* NOTE(review): the implementation is not visible here. Given the fallback remark
* on NaiveTextTokenizer, this presumably picks IntlTextTokenizer when Intl.Segmenter
* is available and NaiveTextTokenizer otherwise; confirm before relying on it.
*
* @param language Language of the text to tokenize; `null` is accepted
* (how a missing language is handled is not specified by this declaration).
* @param unit Granularity of the tokens to produce.
* @returns A tokenizer that maps a string to a list of [Range] tokens.
*/
export declare const DefaultTextContentTokenizer: (language: Language | null, unit: TextUnit) => TextTokenizer;
/**
* A [TextTokenizer] using the Intl.Segmenter API.
* Very aware of language-specific rules since it uses ICU behind the scenes.
*/
export declare class IntlTextTokenizer implements TextTokenizer {
// Private state; types are erased in this declaration.
// NOTE(review): presumably the TextUnit given to the constructor; confirm.
private unit;
// NOTE(review): presumably the Intl.Segmenter instance used by tokenize(); confirm.
private segmenter;
/**
* @param language Language of the text to tokenize; `null` is accepted
* (the behavior for `null` is not specified by this declaration).
* @param unit Granularity of the tokens to produce.
*/
constructor(language: Language | null, unit: TextUnit);
/**
* Splits the given text into tokens.
*
* @param data Text to tokenize.
* @returns One [Range] per token found in `data`.
*/
tokenize(data: string): Range[];
}
/**
* A [TextTokenizer] using a naive approach to splitting text into tokens.
* This is a fallback for browsers that don't support Intl.Segmenter.
* It works mainly on English and similar languages. Don't use unless necessary.
*
* NOTE(review): unlike IntlTextTokenizer, this class does not declare
* `implements TextTokenizer`, although it exposes the same `tokenize` signature.
* Consider adding the clause in the source so the compiler enforces the contract.
*/
export declare class NaiveTextTokenizer {
// Private state; types are erased in this declaration.
// NOTE(review): presumably the TextUnit given to the constructor; confirm.
private unit;
// NOTE(review): presumably the unit-specific splitting routine used by tokenize(); confirm.
private tokenizer;
// NOTE(review): presumably derived from the constructor's `language`; confirm.
private isEnglish;
/**
* @param language Language of the text to tokenize; `null` is accepted
* (the behavior for `null` is not specified by this declaration).
* @param unit Granularity of the tokens to produce.
*/
constructor(language: Language | null, unit: TextUnit);
/**
* Splits the given text into tokens.
*
* @param data Text to tokenize.
* @returns One [Range] per token found in `data`.
*/
tokenize(data: string): Range[];
}
/**
* Maps a token string to a string, or to `null`.
*
* NOTE(review): the implementation is not visible here. Judging by the name, this
* presumably returns the form of `token` suitable for speech output, and `null`
* when the token contains nothing to speak; confirm against the implementation.
*
* @param token Text of a single token.
* @returns A string, or `null`; callers must handle the `null` case.
*/
export declare const speakableToken: (token: string) => string | null;