UNPKG

@readium/navigator-html-injectables

Version:

An embeddable solution for connecting frames of HTML publications with a Readium Navigator

164 lines (143 loc) 4.95 kB
import approxSearch from '../../approx-string-match';
import type { Match as StringMatch } from '../../approx-string-match';

/** A scored candidate location for a quote within the document text. */
type Match = {
  /** Start offset of match in text */
  start: number;

  /** End offset of match in text */
  end: number;

  /**
   * Score for the match, where 1.0 indicates a perfect match for the quote
   * and context. Scores are normalized against the sum of all weights, so
   * they are at most 1.0.
   */
  score: number;
};

/**
 * Find the best approximate matches for `str` in `text` allowing up to
 * `maxErrors` errors.
 *
 * Exact occurrences (including overlapping ones) are returned on their own
 * when any exist; the approximate search only runs when there are none.
 *
 * @param text - Text to search in
 * @param str - Pattern to look for
 * @param maxErrors - Error budget forwarded to the approximate search
 * @returns Exact matches (with `errors: 0`) if any, otherwise whatever
 *   `approxSearch` returns. An empty pattern yields no matches.
 */
function search(text: string, str: string, maxErrors: number): StringMatch[] {
  // An empty pattern cannot produce a meaningful match. It must be rejected
  // up front: `indexOf('', pos)` clamps to `text.length` rather than returning
  // -1, so the exact-match scan below would otherwise never terminate.
  if (str.length === 0) {
    return [];
  }

  // Do a fast search for exact matches. The `approx-string-match` library
  // doesn't currently incorporate this optimization itself.
  const exactMatches: StringMatch[] = [];
  for (
    let pos = text.indexOf(str);
    pos !== -1;
    // Advance by one (not by `str.length`) so overlapping matches are found.
    pos = text.indexOf(str, pos + 1)
  ) {
    exactMatches.push({ start: pos, end: pos + str.length, errors: 0 });
  }
  if (exactMatches.length > 0) {
    return exactMatches;
  }

  // If there are no exact matches, do a more expensive search for matches
  // with errors.
  return approxSearch(text, str, maxErrors);
}

/**
 * Compute a score between 0 and 1.0 for the similarity between `text` and `str`.
 *
 * @param text - Text that is expected to resemble `str`
 * @param str - Reference string
 * @returns `1 - errors / str.length` for the first match reported by `search`,
 *   or 0 when either input is empty or no match is reported.
 */
function textMatchScore(text: string, str: string): number {
  if (str.length === 0 || text.length === 0) {
    return 0.0;
  }

  // The error budget equals `str.length`, so a match is expected to always be
  // reported for non-empty inputs. Guard anyway instead of dereferencing a
  // possibly missing first element.
  const [closest] = search(text, str, str.length);
  if (closest === undefined) {
    return 0.0;
  }

  // prettier-ignore
  return 1 - (closest.errors / str.length);
}

/** Information about where and in which surroundings a quote is expected. */
type Context = {
  /** Expected text before the quote */
  prefix?: string;

  /** Expected text after the quote */
  suffix?: string;

  /** Expected offset of match within text */
  hint?: number;
};

/**
 * Find the best approximate match for `quote` in `text`.
 *
 * Candidates are found by edit distance and then ranked by a weighted
 * combination of quote similarity, prefix/suffix similarity and proximity to
 * `context.hint`.
 *
 * @param text - Document text to search
 * @param quote - String to find within `text`
 * @param context - Context in which the quote originally appeared. This is
 *   used to choose the best match.
 * @returns The highest-scoring candidate (the earliest one on ties), or `null`
 *   if `quote` is empty or the search found no candidate within the allowed
 *   number of errors.
 */
export function matchQuote(
  text: string,
  quote: string,
  context: Context = {}
): Match | null {
  if (quote.length === 0) {
    return null;
  }

  // Choose the maximum number of errors to allow for the initial search.
  // This choice involves a tradeoff between:
  //
  //  - Recall (proportion of "good" matches found)
  //  - Precision (proportion of matches found which are "good")
  //  - Cost of the initial search and of processing the candidate matches [1]
  //
  // [1] Specifically, the expected-time complexity of the initial search is
  //     `O((maxErrors / 32) * text.length)`. See `approx-string-match` docs.
  const maxErrors = Math.min(256, quote.length / 2);

  // Find the closest matches for `quote` in `text` based on edit distance.
  const matches = search(text, quote, maxErrors);
  if (matches.length === 0) {
    return null;
  }

  const quoteWeight = 50; // Similarity of matched text to quote.
  const prefixWeight = 20; // Similarity of text before matched text to `context.prefix`.
  const suffixWeight = 20; // Similarity of text after matched text to `context.suffix`.
  const posWeight = 2; // Proximity to expected location. Used as a tie-breaker.
  const maxScore = quoteWeight + prefixWeight + suffixWeight + posWeight;

  /**
   * Compute a normalized score (at most 1.0) for a match candidate.
   */
  const scoreMatch = (match: StringMatch): number => {
    const quoteScore = 1 - match.errors / quote.length;

    // A missing (or empty) prefix/suffix does not penalize the candidate.
    const prefixScore = context.prefix
      ? textMatchScore(
          text.slice(
            Math.max(0, match.start - context.prefix.length),
            match.start
          ),
          context.prefix
        )
      : 1.0;
    const suffixScore = context.suffix
      ? textMatchScore(
          text.slice(match.end, match.end + context.suffix.length),
          context.suffix
        )
      : 1.0;

    // `text` is non-empty here because at least one match was found.
    // NOTE(review): the position score is not clamped, so a hint far outside
    // the text can make it negative — confirm callers pass in-range hints.
    let posScore = 1.0;
    if (typeof context.hint === 'number') {
      const offset = Math.abs(match.start - context.hint);
      posScore = 1.0 - offset / text.length;
    }

    const rawScore =
      quoteWeight * quoteScore +
      prefixWeight * prefixScore +
      suffixWeight * suffixScore +
      posWeight * posScore;
    return rawScore / maxScore;
  };

  // Rank candidates by similarity of actual and expected surrounding text and
  // by actual/expected offset, keeping the first one with the highest score.
  // The strict comparison preserves the earliest candidate on ties, which is
  // what a stable descending sort followed by taking the head would yield.
  let best: Match | null = null;
  for (const candidate of matches) {
    const score = scoreMatch(candidate);
    if (best === null || score > best.score) {
      best = { start: candidate.start, end: candidate.end, score };
    }
  }
  return best;
}