@elibrary-inno/bookreader
Version:
The Internet Archive BookReader.
634 lines (563 loc) • 21.4 kB
JavaScript
//@ts-check
import { createDIVPageLayer } from '../BookReader/PageContainer.js';
import { SelectionObserver } from '../BookReader/utils/SelectionObserver.js';
import { applyVariables } from '../util/strings.js';
/** @typedef {import('../util/strings.js').StringWithVars} StringWithVars */
/** @typedef {import('../BookReader/PageContainer.js').PageContainer} PageContainer */
// Base class is read from the global `window.BookReader`; the JSDoc cast types it
// as the default export of '../BookReader' for ts-check.
const BookReader = /** @type {typeof import('../BookReader').default} */(window.BookReader);
export const DEFAULT_OPTIONS = {
enabled: true,
/** @type {StringWithVars} The URL to fetch the entire DJVU xml. Supports options.vars */
fullDjvuXmlUrl: null,
/** @type {StringWithVars} The URL to fetch a single page of the DJVU xml. Supports options.vars. Also has {{pageIndex}} */
singlePageDjvuXmlUrl: null,
/** Whether to fetch the XML as a jsonp */
jsonp: false,
};
/** @typedef {typeof DEFAULT_OPTIONS} TextSelectionPluginOptions */
/**
 * Minimal bounded FIFO cache: once more than `maxSize` entries are stored,
 * the oldest entries are discarded first.
 * @template T
 */
export class Cache {
  /**
   * @param {number} [maxSize] Maximum number of entries to retain. Values of 0
   * or less mean nothing is retained.
   */
  constructor(maxSize = 10) {
    this.maxSize = maxSize;
    /** @type {T[]} Entries in insertion order, oldest first */
    this.entries = [];
  }

  /**
   * Stores an entry, evicting the oldest entries if the cache is over capacity.
   * @param {T} entry
   */
  add(entry) {
    this.entries.push(entry);
    // Trim after inserting so the size limit holds even for maxSize <= 0
    // (evicting before inserting always left at least one entry behind).
    const limit = Math.max(0, this.maxSize);
    while (this.entries.length > limit) {
      this.entries.shift();
    }
  }
}
/**
 * Fetches DJVU XML OCR data for pages and renders it as a `.BRtextLayer`
 * element inside the page container, with words positioned from the OCR
 * `coords` attributes so that page text can be selected and copied.
 */
export class TextSelectionPlugin {
  /**
   * @param {TextSelectionPluginOptions} options Plugin options; see DEFAULT_OPTIONS
   * @param {*} optionVariables Variables passed to applyVariables when expanding the URL options
   * @param {'lr' | 'rl'} pageProgression In the future this should be in the ocr file
   * since a book being right to left doesn't mean the ocr is right to left. But for
   * now we do make that assumption.
   */
  constructor(options = DEFAULT_OPTIONS, optionVariables, pageProgression = 'lr') {
    this.options = options;
    this.optionVariables = optionVariables;
    /**@type {PromiseLike<JQuery<HTMLElement>|undefined>} */
    this.djvuPagesPromise = null;
    /** Whether the book is right-to-left */
    this.rtl = pageProgression === 'rl';
    /** @type {Cache<{index: number, response: any}>} */
    this.pageTextCache = new Cache();
    /**
     * Sometimes there are too many words on a page, and the browser becomes near
     * unusable. For now don't render text layer for pages with too many words.
     */
    this.maxWordRendered = 2500;
    /** Whether a mouse button is currently held down on a text layer (tracked by the mode handlers) */
    this.mouseIsDown = false;
    this.selectionObserver = new SelectionObserver('.BRtextLayer', this._onSelectionChange);
  }

  /**
   * Switches the text layer between its two event-handling modes as a
   * selection starts or is cleared.
   * @param {'started' | 'cleared'} type
   * @param {HTMLElement} target
   */
  _onSelectionChange = (type, target) => {
    if (type === 'started') {
      this.textSelectingMode(target);
    } else if (type === 'cleared') {
      this.defaultMode(target);
    } else {
      throw new Error(`Unknown type ${type}`);
    }
  }

  /**
   * Attaches the selection observer and, unless a single-page URL is
   * configured, starts fetching the full DJVU XML.
   */
  init() {
    this.selectionObserver.attach();
    // Only fetch the full djvu xml if the single page url isn't there
    if (this.options.singlePageDjvuXmlUrl) return;
    // NOTE: a failed request can leave this promise rejected; getPageText
    // handles that when it awaits it.
    this.djvuPagesPromise = $.ajax({
      type: "GET",
      url: applyVariables(this.options.fullDjvuXmlUrl, this.optionVariables),
      dataType: this.options.jsonp ? "jsonp" : "html",
      cache: true,
      error: (e) => undefined
    }).then((res) => {
      try {
        const xmlMap = $.parseXML(res);
        return xmlMap && $(xmlMap).find("OBJECT");
      } catch (e) {
        return undefined;
      }
    });
  }

  /**
   * Resolves the OCR `OBJECT` element for a page, or `undefined` when the
   * XML could not be fetched or parsed. Never rejects: callers invoke this
   * without a catch handler, so fetch failures must not escape.
   * @param {number} index
   * @returns {Promise<HTMLElement|undefined>}
   */
  async getPageText(index) {
    if (this.options.singlePageDjvuXmlUrl) {
      const cachedEntry = this.pageTextCache.entries.find(x => x.index === index);
      if (cachedEntry) {
        return cachedEntry.response;
      }
      try {
        // The await must be inside the try: the `error` callback alone does
        // not stop a failed request from rejecting.
        const res = await $.ajax({
          type: "GET",
          url: applyVariables(this.options.singlePageDjvuXmlUrl, this.optionVariables, { pageIndex: index }),
          dataType: this.options.jsonp ? "jsonp" : "html",
          cache: true,
          error: (e) => undefined,
        });
        const xmlDoc = $.parseXML(res);
        const result = xmlDoc && $(xmlDoc).find("OBJECT")[0];
        this.pageTextCache.add({ index, response: result });
        return result;
      } catch (e) {
        return undefined;
      }
    } else {
      try {
        const XMLpagesArr = await this.djvuPagesPromise;
        return XMLpagesArr ? XMLpagesArr[index] : undefined;
      } catch (e) {
        // The full-XML request failed; treat as "no text for this page"
        return undefined;
      }
    }
  }

  /**
   * Intercept copied text to remove any styling applied to it
   * @param {JQuery} $container
   */
  interceptCopy($container) {
    $container[0].addEventListener('copy', (event) => {
      const selection = document.getSelection();
      event.clipboardData.setData('text/plain', selection.toString());
      event.preventDefault();
    });
  }

  /**
   * Applies mouse events when in default mode
   * @param {HTMLElement} textLayer
   */
  defaultMode(textLayer) {
    const $pageContainer = $(textLayer).closest('.BRpagecontainer');
    textLayer.style.pointerEvents = "none";
    $pageContainer.find("img").css("pointer-events", "auto");

    // Drop the handlers installed by whichever mode was active before
    $(textLayer).off(".textSelectPluginHandler");

    const startedMouseDown = this.mouseIsDown;
    let skipNextMouseup = this.mouseIsDown;
    if (startedMouseDown) {
      // The mouse is still held down: keep the layer receiving events so the
      // matching mouseup below is seen.
      textLayer.style.pointerEvents = "auto";
    }

    // Need to stop propagation to prevent DragScrollable from
    // blocking selection
    $(textLayer).on("mousedown.textSelectPluginHandler", (event) => {
      this.mouseIsDown = true;
      if ($(event.target).is(".BRwordElement, .BRspace")) {
        event.stopPropagation();
      }
    });

    $(textLayer).on("mouseup.textSelectPluginHandler", (event) => {
      this.mouseIsDown = false;
      textLayer.style.pointerEvents = "none";
      if (skipNextMouseup) {
        skipNextMouseup = false;
        event.stopPropagation();
      }
    });
  }

  /**
   * This mode is active while there is a selection on the given textLayer
   * @param {HTMLElement} textLayer
   */
  textSelectingMode(textLayer) {
    const $pageContainer = $(textLayer).closest('.BRpagecontainer');
    // Make text layer consume all events
    textLayer.style.pointerEvents = "all";
    // Block img from getting long-press to save while selecting
    $pageContainer.find("img").css("pointer-events", "none");

    $(textLayer).off(".textSelectPluginHandler");

    $(textLayer).on('mousedown.textSelectPluginHandler', (event) => {
      this.mouseIsDown = true;
      event.stopPropagation();
    });

    // Prevent page flip on click
    $(textLayer).on('mouseup.textSelectPluginHandler', (event) => {
      this.mouseIsDown = false;
      event.stopPropagation();
    });
  }

  /**
   * Initializes text selection modes if there is a text layer on the page
   * @param {JQuery} $container
   */
  stopPageFlip($container) {
    /** @type {JQuery<HTMLElement>} */
    const $textLayer = $container.find('.BRtextLayer');
    if (!$textLayer.length) return;
    $textLayer.each((i, s) => this.defaultMode(s));
    this.interceptCopy($container);
  }

  /**
   * Builds the text layer for a page container and appends it, unless the
   * container already has one, no OCR is available for the page, or the page
   * has more than `maxWordRendered` words.
   * @param {PageContainer} pageContainer
   */
  async createTextLayer(pageContainer) {
    const pageIndex = pageContainer.page.index;
    const $container = pageContainer.$container;
    const $textLayers = $container.find('.BRtextLayer');
    if ($textLayers.length) return;

    const XMLpage = await this.getPageText(pageIndex);
    if (!XMLpage) return;

    // Give every element without coords the bounding box of its children
    recursivelyAddCoords(XMLpage);

    const totalWords = $(XMLpage).find("WORD").length;
    if (totalWords > this.maxWordRendered) {
      console.log(`Page ${pageIndex} has too many words (${totalWords} > ${this.maxWordRendered}). Not rendering text layer.`);
      return;
    }

    const textLayer = createDIVPageLayer(pageContainer.page, 'BRtextLayer');
    // Scale from page dimensions to the container's current inline size
    const ratioW = parseFloat(pageContainer.$container[0].style.width) / pageContainer.page.width;
    const ratioH = parseFloat(pageContainer.$container[0].style.height) / pageContainer.page.height;
    textLayer.style.transform = `scale(${ratioW}, ${ratioH})`;
    textLayer.setAttribute("dir", this.rtl ? "rtl" : "ltr");

    const ocrParagraphs = $(XMLpage).find("PARAGRAPH[coords]").toArray();
    const paragEls = ocrParagraphs.map(p => {
      const el = this.renderParagraph(p);
      textLayer.appendChild(el);
      return el;
    });

    // Fix up paragraph positions
    const paragraphRects = determineRealRects(textLayer, '.BRparagraphElement');
    // Running total of top margins already applied; each one pushes every
    // following paragraph down by the same amount.
    let yAdded = 0;
    for (const [ocrParagraph, paragEl] of zip(ocrParagraphs, paragEls)) {
      const ocrParagBounds = $(ocrParagraph).attr("coords").split(",").map(parseFloat);
      const realRect = paragraphRects.get(paragEl);
      const [ocrLeft, , ocrRight, ocrTop] = ocrParagBounds;
      const newStartMargin = this.rtl ? (realRect.right - ocrRight) : (ocrLeft - realRect.left);
      const newTop = ocrTop - (realRect.top + yAdded);

      paragEl.style[this.rtl ? 'marginRight' : 'marginLeft'] = `${newStartMargin}px`;
      paragEl.style.marginTop = `${newTop}px`;
      yAdded += newTop;
      textLayer.appendChild(paragEl);
    }
    $container.append(textLayer);
    this.stopPageFlip($container);
  }

  /**
   * Renders one OCR paragraph as a `<p class="BRparagraphElement">` holding
   * one `BRlineElement` span per line and `BRwordElement` / `BRspace` spans
   * within each line, then adjusts letter spacing, padding and line height
   * so the rendered words line up with the OCR coordinates.
   * Coords are read as `left,bottom,right,top`.
   * @param {HTMLElement} ocrParagraph
   * @returns {HTMLParagraphElement}
   */
  renderParagraph(ocrParagraph) {
    const paragEl = document.createElement('p');
    paragEl.classList.add('BRparagraphElement');
    const [paragLeft, paragBottom, paragRight, paragTop] = $(ocrParagraph).attr("coords").split(",").map(parseFloat);
    const wordHeightArr = [];
    const lines = $(ocrParagraph).find("LINE[coords]").toArray();
    if (!lines.length) return paragEl;

    for (const [prevLine, line, nextLine] of lookAroundWindow(genMap(lines, augmentLine))) {
      const isLastLineOfParagraph = line.ocrElement == lines[lines.length - 1];
      const lineEl = document.createElement('span');
      lineEl.classList.add('BRlineElement');

      for (const [wordIndex, currWord] of line.words.entries()) {
        const [, bottom, right, top] = $(currWord).attr("coords").split(',').map(parseFloat);
        const wordHeight = bottom - top;
        wordHeightArr.push(wordHeight);

        if (wordIndex == 0 && prevLine?.lastWord.textContent.trim().endsWith('-')) {
          // ideally prefer the next line to determine the left position,
          // since the previous line could be the first line of the paragraph
          // and hence have an incorrectly indented first word.
          // E.g. https://archive.org/details/driitaleofdaring00bachuoft/page/360/mode/2up
          const [newLeft, , , ] = $((nextLine || prevLine).firstWord).attr("coords").split(',').map(parseFloat);
          $(currWord).attr("coords", `${newLeft},${bottom},${right},${top}`);
        }

        const wordEl = document.createElement('span');
        wordEl.setAttribute("class", "BRwordElement");
        wordEl.textContent = currWord.textContent.trim();

        if (wordIndex > 0) {
          const space = document.createElement('span');
          space.classList.add('BRspace');
          space.textContent = ' ';
          lineEl.append(space);

          // Edge ignores empty elements (like BRspace), so add another
          // space to ensure Edge's ReadAloud works correctly.
          lineEl.appendChild(document.createTextNode(' '));
        }

        lineEl.appendChild(wordEl);
      }

      const hasHyphen = line.lastWord.textContent.trim().endsWith('-');
      const lastWordEl = lineEl.children[lineEl.children.length - 1];
      if (hasHyphen && !isLastLineOfParagraph) {
        // Drop the trailing hyphen from the text and mark the word with a class instead
        lastWordEl.textContent = lastWordEl.textContent.trim().slice(0, -1);
        lastWordEl.classList.add('BRwordElement--hyphen');
      }

      paragEl.appendChild(lineEl);
      if (!isLastLineOfParagraph && !hasHyphen) {
        // Edge does not correctly have spaces between the lines.
        paragEl.appendChild(document.createTextNode(' '));
      }
    }

    // Font size is the 85th-percentile word height plus 4
    wordHeightArr.sort((a, b) => a - b);
    const paragWordHeight = wordHeightArr[Math.floor(wordHeightArr.length * 0.85)] + 4;
    paragEl.style.left = `${paragLeft}px`;
    paragEl.style.top = `${paragTop}px`;
    paragEl.style.width = `${paragRight - paragLeft}px`;
    paragEl.style.height = `${paragBottom - paragTop}px`;
    paragEl.style.fontSize = `${paragWordHeight}px`;

    // Fix up sizes - stretch/crush words as necessary using letter spacing
    let wordRects = determineRealRects(paragEl, '.BRwordElement');
    const ocrWords = $(ocrParagraph).find("WORD").toArray();
    const wordEls = paragEl.querySelectorAll('.BRwordElement');
    for (const [ocrWord, wordEl] of zip(ocrWords, wordEls)) {
      const realRect = wordRects.get(wordEl);
      const [left, , right ] = $(ocrWord).attr("coords").split(',').map(parseFloat);
      let ocrWidth = right - left;
      // Some books (eg theworksofplato01platiala) have a space _inside_ the <WORD>
      // element. That makes it impossible to determine the correct positioning
      // of everything, but to avoid the BRspace's being width 0, which makes selection
      // janky on Chrome Android, assume the space is the same width as one of the
      // letters.
      if (ocrWord.textContent.endsWith(' ')) {
        ocrWidth = ocrWidth * (ocrWord.textContent.length - 1) / ocrWord.textContent.length;
      }
      const diff = ocrWidth - realRect.width;
      // NOTE(review): for a one-character word the divisor is 0, giving a
      // non-finite value; presumably the browser discards the invalid
      // declaration - confirm.
      wordEl.style.letterSpacing = `${diff / (ocrWord.textContent.length - 1)}px`;
    }

    // Stretch/crush lines as necessary using line height
    // Recompute rects after letter spacing
    wordRects = determineRealRects(paragEl, '.BRwordElement');
    const spaceRects = determineRealRects(paragEl, '.BRspace');

    const ocrLines = $(ocrParagraph).find("LINE[coords]").toArray();
    const lineEls = Array.from(paragEl.querySelectorAll('.BRlineElement'));

    let ySoFar = paragTop;
    for (const [ocrLine, lineEl] of zip(ocrLines, lineEls)) {
      // shift words to the correct x position: padding on the first word of
      // the line, letter spacing on the BRspace before every other word
      const words = $(ocrLine).find("WORD").toArray();
      // const ocrLineLeft = Math.min(...words.map(w => parseFloat($(w).attr("coords").split(',')[0])));
      let xSoFar = this.rtl ? paragRight : paragLeft;
      for (const [ocrWord, wordEl] of zip(words, lineEl.querySelectorAll('.BRwordElement'))) {
        // start of line, need to compute the offset relative to the OCR words
        const wordRect = wordRects.get(wordEl);
        const [ocrLeft, , ocrRight ] = $(ocrWord).attr("coords").split(',').map(parseFloat);
        const diff = (this.rtl ? -(ocrRight - xSoFar) : ocrLeft - xSoFar);
        if (wordEl.previousElementSibling) {
          const space = wordEl.previousElementSibling;
          space.style.letterSpacing = `${diff - spaceRects.get(space).width}px`;
        } else {
          wordEl.style[this.rtl ? 'paddingRight' : 'paddingLeft'] = `${diff}px`;
        }
        if (this.rtl) xSoFar -= diff + wordRect.width;
        else xSoFar += diff + wordRect.width;
      }
      // And also fix y position
      const ocrLineTop = Math.min(...words.map(w => parseFloat($(w).attr("coords").split(',')[3])));
      const diff = ocrLineTop - ySoFar;
      if (lineEl.previousElementSibling) {
        lineEl.previousElementSibling.style.lineHeight = `${diff}px`;
        ySoFar += diff;
      }
    }

    // The last line will have a line height subtracting from the paragraph height
    lineEls[lineEls.length - 1].style.lineHeight = `${paragBottom - ySoFar}px`;

    // Edge does not include a newline for some reason when copying/pasting the <p> els
    paragEl.appendChild(document.createElement('br'));
    return paragEl;
  }
}
/**
 * BookReader subclass that wires the TextSelectionPlugin into initialisation
 * and page-container creation.
 */
export class BookreaderWithTextSelection extends BookReader {
  /**
   * Sets up the text selection plugin (when enabled) and then runs the base
   * class initialisation.
   */
  init() {
    const pluginOptions = { ...DEFAULT_OPTIONS, ...this.options.plugins.textSelection };

    if (pluginOptions.enabled) {
      this.textSelectionPlugin = new TextSelectionPlugin(pluginOptions, this.options.vars, this.pageProgression);
      // Store the merged options back so the plugin holds the source of truth
      // and BookReader only references it.
      this.options.plugins.textSelection = pluginOptions;
      this.textSelectionPlugin.init();

      const onSelectionEvent = (selectEvent) => {
        if (selectEvent == 'started') {
          // Usage tracking for text selection
          this.archiveAnalyticsSendEvent?.('BookReader', 'SelectStart');

          // Flag the page holding the selection so it is not hidden when zooming/etc
          const $previouslyFlagged = this.refs.$br.find('.BRpagecontainer--hasSelection');
          $previouslyFlagged.removeClass('BRpagecontainer--hasSelection');
          const $selectedPage = $(window.getSelection().anchorNode).closest('.BRpagecontainer');
          $selectedPage.addClass('BRpagecontainer--hasSelection');
        }
      };
      const analyticsObserver = new SelectionObserver('.BRtextLayer', onSelectionEvent);
      analyticsObserver.attach();
    }

    super.init();
  }

  /**
   * Creates the page container via the base class and, outside thumbnail
   * mode, asks the plugin to add a text layer to it.
   * @param {number} index
   */
  _createPageContainer(index) {
    const pageContainer = super._createPageContainer(index);
    // Thumb mode is skipped because it is too janky there;
    // .page can be null for the "pre-cover" region.
    const inThumbMode = this.mode === this.constModeThumb;
    if (!inThumbMode && pageContainer.page) {
      this.textSelectionPlugin?.createTextLayer(pageContainer);
    }
    return pageContainer;
  }
}
// Overwrite the global `window.BookReader` with the text-selection subclass,
// and expose the same class as this module's default export.
window.BookReader = BookreaderWithTextSelection;
export default BookreaderWithTextSelection;
/**
 * Measures the rendered rectangles of all descendants of `parentEl` matching
 * `selector`. To get untransformed, origin-relative measurements the element
 * is temporarily appended to `document.body`, hidden, absolutely positioned
 * at 0,0 with no transform; afterwards its inline styles are restored and it
 * is put back where it was (or left detached if it had no parent).
 * Cleanup runs even if measuring throws.
 * @param {HTMLElement} parentEl
 * @param {string} selector
 * @returns {Map<Element, Rect>} Rects in document coordinates (scroll offset added)
 */
function determineRealRects(parentEl, selector) {
  // Remember the original DOM position so measuring does not silently detach
  // an element that was already attached somewhere.
  const originalParent = parentEl.parentNode;
  const originalNextSibling = parentEl.nextSibling;
  const initials = {
    position: parentEl.style.position,
    visibility: parentEl.style.visibility,
    top: parentEl.style.top,
    left: parentEl.style.left,
    transform: parentEl.style.transform,
  };

  try {
    parentEl.style.position = 'absolute';
    parentEl.style.visibility = 'hidden';
    parentEl.style.top = '0';
    parentEl.style.left = '0';
    parentEl.style.transform = 'none';
    document.body.appendChild(parentEl);

    return new Map(
      Array.from(parentEl.querySelectorAll(selector), (el) => {
        const origRect = el.getBoundingClientRect();
        return [el, new Rect(
          origRect.left + window.scrollX,
          origRect.top + window.scrollY,
          origRect.width,
          origRect.height,
        )];
      })
    );
  } finally {
    if (parentEl.parentNode === document.body) {
      document.body.removeChild(parentEl);
    }
    Object.assign(parentEl.style, initials);
    if (originalParent) {
      originalParent.insertBefore(parentEl, originalNextSibling);
    }
  }
}
/**
 * Wraps an OCR line element together with its WORD descendants and quick
 * references to the first and last of them.
 * @param {HTMLElement} line
 */
function augmentLine(line) {
  const wordEls = $(line).find("WORD").toArray();
  const lastIndex = wordEls.length - 1;
  const augmented = {
    ocrElement: line,
    words: wordEls,
    firstWord: wordEls[0],
    lastWord: wordEls[lastIndex],
  };
  return augmented;
}
/**
 * Lazy `map` over any iterable: each item is transformed only when the
 * consumer asks for it.
 * @template TFrom, TTo
 * @param {Iterable<TFrom>} gen
 * @param {function(TFrom): TTo} fn
 * @returns {Iterable<TTo>}
 */
export function* genMap(gen, fn) {
  for (const item of gen) {
    const mapped = fn(item);
    yield mapped;
  }
}
/**
 * Generator that provides a sliding window of 3 elements,
 * prev, current, and next. `prev` is `undefined` for the first element and
 * `next` is `undefined` for the last one.
 * @template T
 * @param {Iterable<T>} gen
 * @returns {Iterable<[T | undefined, T, T | undefined]>}
 */
export function* lookAroundWindow(gen) {
  let prev = undefined;
  let cur = undefined;
  // Track "have we seen an element yet" explicitly: using `cur === undefined`
  // as the sentinel would drop elements whose value is itself undefined.
  let hasCur = false;

  for (const x of gen) {
    if (hasCur) {
      yield [prev, cur, x];
      prev = cur;
    }
    cur = x;
    hasCur = true;
  }

  if (hasCur) {
    yield [prev, cur, undefined];
  }
}
/**
 * Lazily pairs up the items of two iterables (a small stand-in for lodash's
 * zip). Both iterables must produce the same number of items.
 * @template T1, T2
 * @param {Iterable<T1>} gen1
 * @param {Iterable<T2>} gen2
 * @returns {Iterable<[T1, T2]>}
 * @throws {Error} If one iterable finishes before the other
 */
export function* zip(gen1, gen2) {
  const first = gen1[Symbol.iterator]();
  const second = gen2[Symbol.iterator]();

  for (;;) {
    const a = first.next();
    const b = second.next();

    if (!a.done && !b.done) {
      yield [a.value, b.value];
      continue;
    }
    if (a.done && b.done) {
      return;
    }
    // Exactly one side ran out: the inputs had different lengths
    throw new Error('zip: one of the iterators is done');
  }
}
/**
 * Computes the smallest box enclosing every given box.
 * Boxes are `[left, bottom, right, top]`, with bottom >= top (y grows downward).
 * For an empty input the result is `[Infinity, -Infinity, -Infinity, Infinity]`.
 * @param {Array<[number, number, number, number]>} bounds
 * @returns {[number, number, number, number]}
 */
function determineBounds(bounds) {
  const extent = {
    left: Infinity,
    bottom: -Infinity,
    right: -Infinity,
    top: Infinity,
  };

  for (const box of bounds) {
    const [boxLeft, boxBottom, boxRight, boxTop] = box;
    extent.left = Math.min(extent.left, boxLeft);
    extent.bottom = Math.max(extent.bottom, boxBottom);
    extent.right = Math.max(extent.right, boxRight);
    extent.top = Math.min(extent.top, boxTop);
  }

  return [extent.left, extent.bottom, extent.right, extent.top];
}
/**
 * Walks the XML tree depth-first and, for every element that has no `coords`
 * attribute of its own, sets `coords` to the bounding box of its children's
 * coords. Elements with no children, or whose children have no coords, are
 * left unchanged.
 * @param {Element} xmlEl
 */
function recursivelyAddCoords(xmlEl) {
  const $el = $(xmlEl);
  const alreadyHasCoords = Boolean($el.attr('coords'));
  if (alreadyHasCoords || !xmlEl.children) return;

  const childEls = $el.children().toArray();
  if (childEls.length === 0) return;

  // Children first, so their coords exist before this element's are derived
  childEls.forEach((childEl) => recursivelyAddCoords(childEl));

  const childCoords = childEls
    .map((childEl) => $(childEl).attr('coords'))
    .filter((coords) => Boolean(coords))
    .map((coords) => coords.split(',').map(parseFloat));

  const boundingCoords = determineBounds(childCoords);
  // An infinite left edge means no child contributed any coords
  if (Math.abs(boundingCoords[0]) !== Infinity) {
    $el.attr('coords', boundingCoords.join(','));
  }
}
/**
 * Lightweight rectangle with the same edge accessors as the native DOMRect
 * (effectively a polyfill for it).
 */
class Rect {
  /**
   * @param {number} x Left edge
   * @param {number} y Top edge
   * @param {number} width
   * @param {number} height
   */
  constructor(x, y, width, height) {
    Object.assign(this, { x, y, width, height });
  }

  /** Left edge (same as x) */
  get left() {
    return this.x;
  }

  /** Top edge (same as y) */
  get top() {
    return this.y;
  }

  /** Right edge: x + width */
  get right() {
    return this.x + this.width;
  }

  /** Bottom edge: y + height */
  get bottom() {
    return this.y + this.height;
  }
}