mwoffliner
Version:
MediaWiki ZIM scraper
229 lines • 10.8 kB
JavaScript
import * as domino from 'domino';
import urlJoin from 'url-join';
import * as logger from '../Logger.js';
import { MobileRenderer } from './abstractMobile.render.js';
import { getStrippedTitleFromHtml } from '../util/misc.js';
// Matches the last path segment of a thumbnail URL when it has the form "/<digits>px-<name>",
// capturing the digits (the thumbnail width encoded in the URL).
const THUMB_WIDTH_REGEX = /\/(\d+)px-[^/]+$/;
// Size cap used by calculateImageDimensions() when it computes the scaled-down ("max") candidate.
const THUMB_MAX_DIMENSION = 320;
// Renderer for the 'https://{wikimedia-wiki}/api/rest_v1/page/mobile-html/' endpoint
export class WikimediaMobileRenderer extends MobileRenderer {
// No renderer-specific setup: the constructor only forwards to the MobileRenderer base class.
constructor() {
super();
}
getStrippedTitle(renderOpts) {
const { data, articleId } = renderOpts;
const strippedTitle = getStrippedTitleFromHtml(data);
return strippedTitle || articleId.replace('_', ' ');
}
async render(renderOpts) {
try {
const result = [];
const { data, articleId, _moduleDependencies, dump } = renderOpts;
const articleDetail = await renderOpts.articleDetailXId.get(articleId);
const displayTitle = this.getStrippedTitle(renderOpts);
if (data) {
const moduleDependenciesFiltered = super.filterWikimediaMobileModules(_moduleDependencies);
let mediaDependenciesVal;
let videoDependenciesVal;
let imageDependenciesVal;
let subtitlesVal;
const mobileHTML = domino.createDocument(data);
const finalHTMLMobile = await this.pipeMobileTransformations(mobileHTML, this.INTERNAL.convertLazyLoadToImages, this.removeEditContainer, this.removeHiddenClass, this.INTERNAL.unhideSections, async (doc) => {
const { finalHTML, subtitles, mediaDependencies, videoDependencies, imageDependencies } = await super.processHtml(doc.documentElement.outerHTML, dump, articleId, articleDetail, moduleDependenciesFiltered, super.templateMobileArticle.bind(this));
mediaDependenciesVal = mediaDependencies;
imageDependenciesVal = imageDependencies;
videoDependenciesVal = videoDependencies;
subtitlesVal = subtitles;
return domino.createDocument(finalHTML);
}, this.restoreLinkDefaults);
result.push({
articleId,
displayTitle,
html: finalHTMLMobile.documentElement.outerHTML,
mediaDependencies: mediaDependenciesVal,
videoDependencies: videoDependenciesVal,
imageDependencies: imageDependenciesVal,
moduleDependencies: moduleDependenciesFiltered,
staticFiles: this.staticFilesListMobile,
subtitles: subtitlesVal,
});
return result;
}
else {
throw new Error(`Cannot render [${data}] into an article`);
}
}
catch (err) {
logger.error(err.message);
throw err;
}
}
async pipeMobileTransformations(value, ...fns) {
let result = value;
for (const fn of fns) {
result = fn(await result);
}
return result;
}
removeEditContainer(doc) {
const editContainers = doc.querySelectorAll('.pcs-edit-section-link-container');
editContainers.forEach((elem) => {
elem.remove();
});
return doc;
}
calculateImageDimensions(span) {
// These are the attributes that were "prepared" for us by the mobile-html endpoint.
const preparedData = {
src: span.getAttribute('data-src'),
width: parseInt(span.getAttribute('data-width') || '0', 10),
height: parseInt(span.getAttribute('data-height') || '0', 10),
};
if (preparedData.width === 0 || preparedData.height === 0) {
return { preparedData: null, originalData: null, maxData: null };
}
// Calculate the ratio so we know if we're scaling down in the width or height dimension.
const widthHeightRatio = preparedData.width / preparedData.height;
const scaleUsingHeight = widthHeightRatio > 1.0;
// The data-data-file-original-src attribute is the URL of the image that was used in the original article.
// It is preferred over the data-src attribute, which is a "mobile" image that may be scaled up in order to
// be "full width" on mobile devices. However, if the mobile API didn't scale the image up, then the
// data-data-file-original-src attribute will be missing, and we should use the data-src.
// See https://github.com/openzim/mwoffliner/issues/1925.
let originalData;
const originalSrc = span.getAttribute('data-data-file-original-src');
if (originalSrc) {
// Try to match against an image URL with a width in it.
const match = THUMB_WIDTH_REGEX.exec(originalSrc);
if (match) {
const originalWidth = parseInt(match[1], 10);
originalData = {
src: originalSrc,
width: originalWidth,
height: Math.round(originalWidth / widthHeightRatio),
};
}
}
let maxData;
if (scaleUsingHeight) {
maxData = {
src: null,
width: Math.round(THUMB_MAX_DIMENSION * widthHeightRatio),
height: THUMB_MAX_DIMENSION,
};
}
else {
maxData = {
src: null,
width: THUMB_MAX_DIMENSION,
height: Math.round(THUMB_MAX_DIMENSION / widthHeightRatio),
};
}
return {
preparedData,
originalData,
maxData,
};
}
convertLazyLoadToImagesImpl(doc) {
const protocol = 'https://';
const spans = doc.querySelectorAll('.pcs-lazy-load-placeholder');
spans.forEach((span) => {
// Create a new img element that will replace the placeholder span.
const img = doc.createElement('img');
const { preparedData, originalData, maxData } = this.calculateImageDimensions(span);
if (preparedData === null) {
// The metadata for the lazy-loading span is something that we don't understand. Just copy the
// data-src attribute to the img element and hope for the best.
img.src = urlJoin(protocol, span.getAttribute('data-src'));
img.setAttribute('decoding', 'async');
img.className = span.getAttribute('data-class');
}
else {
const widthToData = {
[preparedData.width]: preparedData,
[maxData.width]: maxData,
[originalData?.width || 0]: originalData,
};
const minWidth = originalData ? Math.min(preparedData.width, maxData.width, originalData?.width) : Math.min(preparedData.width, maxData.width);
let selectedData = widthToData[minWidth];
if (selectedData === maxData) {
// We've decided to scale down the image. Use URL hacking to create an image that scales to the size we want.
if (originalData) {
const match = THUMB_WIDTH_REGEX.exec(originalData.src);
if (match) {
selectedData.src = originalData.src.replace(`${match[1]}px`, `${selectedData.width}px`);
}
}
else {
// No original src, or original src cannot be URL hacked.
const match = THUMB_WIDTH_REGEX.exec(preparedData.src);
if (match) {
selectedData.src = preparedData.src.replace(`${match[1]}px`, `${selectedData.width}px`);
}
}
}
if (selectedData.src === null) {
// We couldn't find a URL to hack, so use the smaller of the original or prepared data.
if (!originalData) {
selectedData = preparedData;
}
else {
const newMinWidth = Math.min(preparedData.width, originalData.width);
selectedData = widthToData[newMinWidth];
}
}
img.src = urlJoin(protocol, selectedData.src);
img.setAttribute('decoding', 'async');
img.width = selectedData.width;
img.height = selectedData.height;
img.className = span.getAttribute('data-class');
}
// Replace the span with the img element
span.parentNode.replaceChild(img, span);
});
return doc;
}
removeHiddenClass(doc) {
const pcsSectionHidden = 'pcs-section-hidden';
const hiddenSections = doc.querySelectorAll(`.${pcsSectionHidden}`);
hiddenSections.forEach((section) => {
section.classList.remove(pcsSectionHidden);
});
return doc;
}
unhideSectionsImpl(doc) {
const sections = doc.querySelectorAll('section');
Array.from(sections).forEach((section) => {
// Domino doesn't allow us to easily manipulate specific styles. Rather than trying to parse
// the style attribute and remove display: none, we just clobber the whole thing.
section.style = '';
});
return doc;
}
restoreLinkDefaults(doc) {
const supElements = doc.querySelectorAll('sup');
Array.from(supElements).forEach((sup) => {
const anchor = doc.createElement('a');
const mwRefLinkTextElement = sup.querySelector('.mw-reflink-text');
let mwRefLinkText = '';
if (mwRefLinkTextElement) {
mwRefLinkText = mwRefLinkTextElement.textContent || '';
}
const existedAnchor = sup.querySelector('.reference-link');
if (existedAnchor?.getAttribute('href')) {
anchor.setAttribute('href', existedAnchor.getAttribute('href'));
}
anchor.className = 'reference-link';
anchor.textContent = mwRefLinkText;
sup.innerHTML = '';
sup.appendChild(anchor);
});
return doc;
}
// Instance-bound versions of the *Impl methods. render() hands these to
// pipeMobileTransformations(), which calls each step as a plain function; binding here keeps
// `this` available inside convertLazyLoadToImagesImpl (it calls this.calculateImageDimensions).
INTERNAL = {
convertLazyLoadToImages: this.convertLazyLoadToImagesImpl.bind(this),
unhideSections: this.unhideSectionsImpl.bind(this),
};
}
//# sourceMappingURL=wikimedia-mobile.renderer.js.map