@elibrary-inno/bookreader
Version:
The Internet Archive BookReader.
102 lines (93 loc) • 3.17 kB
JavaScript
/**
 * A single 'chunk' of text on a page (approximately a paragraph), together
 * with the rectangles of the lines that make it up.
 */
export default class PageChunk {
  /**
   * @param {number} leafIndex index of the leaf the chunk belongs to
   * @param {number} chunkIndex position of the chunk within its leaf
   * @param {string} text text of the chunk
   * @param {DJVURect[]} lineRects rectangles associated with the chunk's lines
   */
  constructor(leafIndex, chunkIndex, text, lineRects) {
    Object.assign(this, { leafIndex, chunkIndex, text, lineRects });
  }

  /**
   * Request the text of one leaf from BookReaderGetTextWrapper.php (a JSONP
   * GET issued through jQuery) and turn the response into chunks.
   *
   * @param {string} server host name serving the book
   * @param {string} bookPath book path; `_djvu.xml` is appended to it
   * @param {number} leafIndex leaf to request, sent as the `page` parameter
   * @return {Promise<PageChunk[]>}
   */
  static async fetch(server, bookPath, leafIndex) {
    const request = {
      type: 'GET',
      url: `https://${server}/BookReader/BookReaderGetTextWrapper.php`,
      dataType:'jsonp',
      cache: true,
      data: {
        path: `${bookPath}_djvu.xml`,
        page: leafIndex
      }
    };
    const response = await $.ajax(request);
    return PageChunk._fromTextWrapperResponse(leafIndex, response);
  }

  /**
   * Convert the response from BookReaderGetTextWrapper.php into a list of
   * {@link PageChunk} instances, one per response entry. Each entry holds the
   * chunk text first, followed by the rects of its lines.
   *
   * *NOTE*: The rect arrays are shared with `chunksResponse`, so a rect that
   * gets corrected is also changed inside the response.
   *
   * @param {number} leafIndex
   * @param {Array<[string, ...DJVURect[]]>} chunksResponse
   * @return {PageChunk[]}
   */
  static _fromTextWrapperResponse(leafIndex, chunksResponse) {
    const toChunk = ([rawText, ...rawRects], chunkIndex) => {
      // Rects are corrected before the text, as two independent clean-up steps
      const lineRects = PageChunk._fixChunkRects(rawRects);
      const text = PageChunk._removeDanglingHyphens(rawText);
      return new PageChunk(leafIndex, chunkIndex, text, lineRects);
    };
    return chunksResponse.map(toChunk);
  }

  /**
   * @private
   * Sometimes the first rectangle of a chunk is ridiculously wide/tall. When
   * it is more than 30 times as wide as the second rectangle, its right edge
   * and its height are taken from the second rectangle instead.
   * *NOTE*: Modifies the given array (and its first rect) and returns it.
   * *NOTE*: This should probably be fixed on the petabox side, and then
   * removed here.
   * Known problems:
   * - A lone rect (e.g. the last rect on the page) cannot be fixed, because
   *   there is no second rect to compare with
   * - Because the correction relies on the second rect, the resulting width
   *   might not be the right one
   * @param {DJVURect[]} rects
   * @return {DJVURect[]}
   */
  static _fixChunkRects(rects) {
    if (rects.length >= 2) {
      const first = rects[0];
      const reference = rects[1];
      const firstWidth = first[2] - first[0];
      const referenceWidth = reference[2] - reference[0];

      if (firstWidth > referenceWidth * 30) {
        const referenceHeight = reference[1] - reference[3];
        // Align the right edge with the reference rect
        first[2] = reference[2];
        // Keep the bottom, move the top so the height matches the reference
        first[3] = first[1] - referenceHeight;
      }
    }
    return rects;
  }

  /**
   * Strip "dangling" hyphens (a hyphen followed by whitespace, as left behind
   * by a word broken across lines) from read aloud text, so that TTS does not
   * stutter on them.
   * @param {string} text
   * @return {string}
   */
  static _removeDanglingHyphens(text) {
    // Some books mis-OCR a dangling hyphen as a ¬ (mathematical not sign). In
    // math the not sign should not be followed by a space, so it is treated
    // the same way as a hyphen here.
    const danglingHyphen = /[-¬]\s+/g;
    return text.replace(danglingHyphen, '');
  }
}
/**
* @typedef {[number, number, number, number]} DJVURect
* Coordinates are in [left, bottom, right, top] order.
*/