UNPKG

mwoffliner

Version:

MediaWiki ZIM scraper

github.com/openzim/mwoffliner

openzim/mwoffliner

476 lines • 16.7 kB

JavaScript

import crypto from 'crypto';
import domino from 'domino';
import Downloader from '../Downloader.js';
import countryLanguage from '@ladjs/country-language';
import fs from 'fs';
import path from 'path';
import { mkdirp } from 'mkdirp';
import os from 'os';
import pathParser from 'path';
import { StringItem } from '@openzim/libzim';
import { config } from '../config.js';
import * as logger from '../Logger.js';
import {
  LATEX_IMAGE_URL_REGEX,
  FANDOM_IMAGE_URL_REGEX,
  IMAGE_THUMB_URL_REGEX,
  FIND_HTTP_REGEX,
  BITMAP_IMAGE_MIME_REGEX,
  IMAGE_MIME_REGEX,
  WEBP_CANDIDATE_IMAGE_MIME_TYPE,
} from './const.js';
import { fileURLToPath } from 'url';
import { zimCreatorMutex } from '../mutex.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Lazily created by getTmpDirectory(); empty string means "not created yet".
let tmpDirectory = '';

/**
 * Check whether a string looks like an e-mail address.
 *
 * @param {string} email - Candidate address.
 * @returns {boolean} true when the address matches the validation pattern.
 */
export function isValidEmail(email) {
  const emailRegex =
    /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
  return emailRegex.test(email);
}

/**
 * Lower-case the first character of a value (coerced to string).
 *
 * @param {*} str - Value to convert.
 * @returns {string} The string form of `str` with its first character lower-cased.
 */
export function lcFirst(str) {
  const text = str + '';
  return text.charAt(0).toLowerCase() + text.slice(1);
}

/**
 * Upper-case the first character of a value (coerced to string).
 *
 * @param {*} str - Value to convert.
 * @returns {string} The string form of `str` with its first character upper-cased.
 */
export function ucFirst(str) {
  const text = str + '';
  return text.charAt(0).toUpperCase() + text.slice(1);
}

/**
 * Non-throwing variant of the global decodeURIComponent.
 *
 * @param {string} uri - Encoded URI component.
 * @returns {string} The decoded value, or `uri` unchanged (after logging a warning) when decoding fails.
 */
function _decodeURIComponent(uri) {
  try {
    return decodeURIComponent(uri);
  } catch (error) {
    logger.warn(error);
    return uri;
  }
}
export { _decodeURIComponent as decodeURIComponent };

/**
 * Set access and modification time of one or several files to "now".
 * The update is fire-and-forget: errors reported by fs.utimes are ignored.
 *
 * @param {string|string[]} paths - A single path or an array of paths.
 */
export function touch(paths) {
  // fs.utimes interprets bare numbers as epoch *seconds*; passing Date.now()
  // (milliseconds) would push the timestamps far into the future, so a Date is used.
  const now = new Date();
  const targets = Array.isArray(paths) ? paths : [paths];
  targets.forEach((filePath) => {
    fs.utimes(filePath, now, now, () => null);
  });
}

/**
 * Resolve a (possibly relative) URL against a base URL.
 *
 * @param {string} url - URL to resolve.
 * @param {string|URL} [baseUrl] - Base used when `url` is relative.
 * @returns {string} The absolute URL as a string.
 */
export function getFullUrl(url, baseUrl) {
  return new URL(url, baseUrl).toString();
}

/**
 * Extract size hints from an image URL.
 *
 * A `/<n>px-` or `-<n>px-` fragment yields `width`; otherwise a `-<x>x.` fragment yields `mult`.
 *
 * @param {string} url - Image URL.
 * @returns {{mult: (number|undefined), width: (number|undefined)}} At most one of the two fields is set.
 */
export function getSizeFromUrl(url) {
  let mult;
  let width;
  const widthMatch = url.match(/[/-]([0-9]+)px-/);
  if (widthMatch) {
    width = Number(widthMatch[1]);
  } else {
    const multMatch = url.match(/-([0-9.]+)x\./);
    if (multMatch) {
      mult = Number(multMatch[1]);
    }
  }
  return { mult, width };
}

/**
 * Build a random string of lower-case letters and digits.
 * Uses Math.random(), so it is not suitable for security-sensitive values.
 *
 * @param {number} len - Number of characters to generate.
 * @returns {string} Random string of length `len`.
 */
export function randomString(len) {
  let str = '';
  const charSet = 'abcdefghijklmnopqrstuvwxyz0123456789';
  for (let i = 0; i < len; i += 1) {
    const randomPoz = Math.floor(Math.random() * charSet.length);
    str += charSet.charAt(randomPoz);
  }
  return str;
}

/**
 * Create a directory and any missing parents.
 *
 * Declared async so that a synchronous throw from mkdirp surfaces as a
 * rejection instead of being handed back to the caller as a resolved value.
 *
 * @param {string} path - Directory to create.
 * @returns {Promise<*>} Whatever mkdirp resolves with.
 */
export async function mkdirPromise(path) {
  return mkdirp(path, { recursive: true });
}

/**
 * Promise wrapper around fs.writeFile.
 *
 * @param {string} path - Destination file.
 * @param {string|Buffer} content - Data to write.
 * @param {string} [encoding='utf8'] - Encoding passed to fs.writeFile.
 * @returns {Promise<null>} Resolves with null once the file is written.
 */
export function writeFilePromise(path, content, encoding = 'utf8') {
  return new Promise((resolve, reject) => {
    fs.writeFile(path, content, encoding, (err) => {
      if (err) {
        reject(err);
      } else {
        resolve(null);
      }
    });
  });
}

/**
 * Promise wrapper around fs.readFile.
 *
 * @param {string} path - File to read.
 * @param {string} [encoding='utf8'] - Encoding passed to fs.readFile.
 * @returns {Promise<string|Buffer>} The file content.
 */
export function readFilePromise(path, encoding = 'utf8') {
  return new Promise((resolve, reject) => {
    fs.readFile(path, encoding, (err, content) => {
      if (err) {
        reject(err);
      } else {
        resolve(content);
      }
    });
  });
}

/**
 * Strict-equality membership test.
 *
 * @param {Array} arr - Array to search.
 * @param {*} value - Value compared with `===`.
 * @returns {boolean} true when some element is strictly equal to `value`.
 */
export function contains(arr, value) {
  return arr.some((v) => v === value);
}

/*
 * Move 'from'.childNodes to 'to' adding them before 'beforeNode'
 * If 'beforeNode' is null, the nodes are appended at the end.
 */
export function migrateChildren(from, to, beforeNode) {
  if (beforeNode === undefined) {
    beforeNode = null;
  }
  // insertBefore detaches the node from `from`, so firstChild advances on every pass.
  while (from.firstChild) {
    to.insertBefore(from.firstChild, beforeNode);
  }
}

/**
 * Load the translation strings for a language, layered over a fallback language.
 * A missing or unreadable file only triggers a warning.
 *
 * @param {string} language - Primary language code.
 * @param {string} [fallbackLanguage='en'] - Language whose strings fill the gaps.
 * @returns {Object} Map of translation key to string (the '@metadata' entry is dropped).
 */
export function getStringsForLang(language, fallbackLanguage = 'en') {
  let strings = {};
  // Read fallbackLanguage first, so it initially populates the strings. Then, read the primary language file,
  // overriding default strings with the values from the primary language.
  for (const lang of [fallbackLanguage, language]) {
    try {
      const fileContents = fs.readFileSync(path.join(__dirname, `../../translation/${lang}.json`)).toString();
      const langStrings = JSON.parse(fileContents);
      delete langStrings['@metadata'];
      strings = { ...strings, ...langStrings };
    } catch {
      logger.warn(`Couldn't find strings file for [${lang}]`);
    }
  }
  return strings;
}

/**
 * Replace every `${name}` placeholder of a translation string with its value.
 *
 * Both the placeholder and the value are treated literally: keys are not
 * interpreted as regular expressions and values containing `$&`, `$1`, ...
 * are inserted verbatim.
 *
 * @param {string} str - Translation string containing `${name}` placeholders.
 * @param {Object} parameters - Map of placeholder name to replacement value.
 * @returns {string} The interpolated string.
 */
export function interpolateTranslationString(str, parameters) {
  let newString = str;
  for (const key of Object.keys(parameters)) {
    const placeholder = '${' + key + '}';
    newString = newString.split(placeholder).join(String(parameters[key]));
  }
  return newString;
}

/**
 * Add the bundled static resources (read from ../../res) to the ZIM file.
 *
 * All files are awaited before the function resolves; any failure is logged
 * and not re-thrown.
 *
 * @param {Iterable<string>} staticFiles - File names relative to the res directory.
 * @param {Object} zimCreator - Creator exposing `addItem`.
 * @returns {Promise<void>}
 */
export async function saveStaticFiles(staticFiles, zimCreator) {
  try {
    // forEach() would drop the promises returned by the async callback: the function
    // would resolve early and rejections would escape this try/catch.
    await Promise.all(
      Array.from(staticFiles, async (file) => {
        const staticFilesContent = await readFilePromise(pathParser.resolve(__dirname, `../../res/${file}`));
        let zimPath;
        let mimetype;
        if (file.endsWith('.ttf')) {
          zimPath = anyPath('ttf', file);
          mimetype = 'font/ttf';
        } else if (file.endsWith('.svg')) {
          zimPath = anyPath('svg', file);
          mimetype = 'image/svg+xml';
        } else if (file.endsWith('.css')) {
          zimPath = cssPath(file);
          mimetype = 'text/css';
        } else {
          zimPath = jsPath(file);
          mimetype = 'application/javascript';
        }
        const article = new StringItem(`${config.output.dirs.res}/${zimPath}`, mimetype, null, { FRONT_ARTICLE: 0 }, staticFilesContent);
        await zimCreatorMutex.runExclusive(() => zimCreator.addItem(article));
      }),
    );
  } catch (err) {
    logger.error(err);
  }
}

/**
 * Build the list of static file names from extension-less JS and CSS names.
 *
 * @param {string[]} jsStaticFiles - JS names without extension.
 * @param {string[]} cssStaticFiles - CSS names without extension.
 * @returns {string[]} JS names suffixed with '.js' followed by CSS names suffixed with '.css'.
 */
export function getStaticFiles(jsStaticFiles, cssStaticFiles) {
  jsStaticFiles = jsStaticFiles.map((jsFile) => jsFile.concat('.js'));
  cssStaticFiles = cssStaticFiles.map((cssFile) => cssFile.concat('.css'));
  return jsStaticFiles.concat(cssStaticFiles);
}

/**
 * Build a path that ends with exactly one `.ext`, optionally under a sub-directory.
 *
 * @param {string} ext - Extension without the leading dot.
 * @param {string} path - File name, with or without the extension.
 * @param {string} [subDirectory=''] - Optional directory prefix.
 * @returns {string} `[subDirectory/]path.ext`
 */
export function anyPath(ext, path, subDirectory = '') {
  const regex = new RegExp(`(\\.${ext})?$`);
  return `${subDirectory ? `${subDirectory}/` : ''}${path.replace(regex, '')}.${ext}`;
}

/**
 * anyPath() specialised for '.css' files.
 *
 * @param {string} css - Stylesheet name.
 * @param {string} [subDirectory=''] - Optional directory prefix.
 * @returns {string} Path ending in '.css'.
 */
export function cssPath(css, subDirectory = '') {
  return anyPath('css', css, subDirectory);
}

/**
 * Build the path of a JS file, rewriting node-module paths through normalizeModule().
 *
 * @param {string} js - Script name.
 * @param {string} [subDirectory=''] - Optional directory prefix.
 * @returns {string} Path ending in '.js'.
 */
export function jsPath(js, subDirectory = '') {
  const path = isNodeModule(js) ? normalizeModule(js) : js;
  return `${subDirectory ? `${subDirectory}/` : ''}${path.replace(/(\.js)?$/, '')}.js`;
}

/**
 * Generate a stylesheet `<link>` tag relative to an article.
 *
 * @param {Object} config - Unused here; kept for signature compatibility.
 * @param {string} css - Stylesheet name.
 * @param {string} articleId - Article id; each '/' it contains adds one '../' to the href.
 * @param {string} [subDirectory=''] - Optional directory prefix.
 * @returns {string} HTML `<link>` element.
 */
export function genHeaderCSSLink(config, css, articleId, subDirectory = '') {
  const slashesInUrl = articleId.split('/').length - 1;
  const upStr = slashesInUrl ? '../'.repeat(slashesInUrl) : './';
  return `<link href="${upStr}${cssPath(css, subDirectory)}" rel="stylesheet" type="text/css"/>`;
}

/**
 * Generate a `<script>` tag relative to an article.
 *
 * @param {Object} config - Unused here; kept for signature compatibility.
 * @param {string} js - Script name.
 * @param {string} articleId - Article id; each '/' it contains adds one '../' to the src.
 * @param {string} [subDirectory=''] - Optional directory prefix.
 * @param {string} [attributes=''] - Raw attributes inserted verbatim into the tag.
 * @returns {string} HTML `<script>` element.
 */
export function genHeaderScript(config, js, articleId, subDirectory = '', attributes = '') {
  const slashesInUrl = articleId.split('/').length - 1;
  const upStr = slashesInUrl ? '../'.repeat(slashesInUrl) : './';
  const path = isNodeModule(js) ? normalizeModule(js) : js;
  return `<script ${attributes} src="${upStr}${jsPath(path, subDirectory)}"></script>`;
}

/**
 * Generate the canonical `<link>` tag of an article.
 *
 * @param {Object} config - Unused here; kept for signature compatibility.
 * @param {string} webUrl - Base URL the encoded article id is appended to.
 * @param {string} articleId - Article id (URI-component encoded in the output).
 * @returns {string} HTML `<link rel="canonical">` element.
 */
export function genCanonicalLink(config, webUrl, articleId) {
  return `<link rel="canonical" href="${webUrl}${encodeURIComponent(articleId)}" />`;
}

/**
 * Normalise the `format` option into a list of dump flavours.
 *
 * - falsy value: `['']`
 * - array: same entries, with `true` replaced by `''`
 * - any other truthy value except `true`: `[format]`
 *
 * NOTE(review): when `format` is exactly `true` the function returns
 * `undefined`; kept as-is because callers are not visible from here.
 *
 * @param {*} format - Raw option value.
 * @returns {Array|undefined} List of flavours.
 */
export function getDumps(format) {
  let dumps;
  if (format) {
    if (format instanceof Array) {
      dumps = [];
      format.forEach((value) => {
        dumps.push(value === true ? '' : value);
      });
    } else if (format !== true) {
      dumps = [format];
    }
  } else {
    dumps = [''];
  }
  return dumps;
}

/**
 * Look up the ISO 639-3 code of a language.
 *
 * @param {string} langIso2 - Language code handed to countryLanguage.getLanguage.
 * @returns {Promise<string>} Resolves with the `iso639_3` field of the lookup result.
 *   Rejects with the lookup error, or with an Error when no code is available.
 */
export function getIso3(langIso2) {
  return new Promise((resolve, reject) => {
    countryLanguage.getLanguage(langIso2, (error, language) => {
      // `language` may be missing: optional chaining avoids a TypeError thrown inside the callback.
      const iso3 = language?.iso639_3;
      if (error || !iso3) {
        reject(error || new Error(`No ISO 639-3 code found for language [${langIso2}]`));
      } else {
        resolve(iso3);
      }
    });
  });
}

/* Internal path/url functions */

/**
 * Compute the in-ZIM path of a media file from its URL.
 *
 * The URL is matched in turn against the thumbnail, LaTeX and Fandom patterns
 * imported from ./const.js; when none matches, the file name is the MD5 of the
 * decoded URL plus the extension of the URL path. A non-empty directory part
 * is replaced by its MD5 digest.
 *
 * @param {string} url - Media URL (must be absolute when no pattern matches).
 * @param {boolean} [escape] - When truthy the file name is URI-component encoded.
 * @returns {string} `<assets dir>/[<md5(dir)>/]<filename>`
 */
export function getMediaBase(url, escape) {
  const decodedUrl = decodeURI(url);
  let parts;
  let filename;
  let filedir = '';
  // Image thumbs
  if ((parts = IMAGE_THUMB_URL_REGEX.exec(decodedUrl)) !== null) {
    // Remove trailing / in parts[4] if possible
    parts[4] = parts[4] ? parts[4].substring(0, parts[4].length - 1) : '';
    // Most common case
    if (!parts[4] || parts[4].length <= parts[6].length) {
      filename = parts[6];
    }
    // To handle /...px-thumbnail.jpg use case
    else {
      filename = parts[4] + (parts[7] || '');
    }
    filedir = parts[1];
  }
  // Latex (equations)
  else if ((parts = LATEX_IMAGE_URL_REGEX.exec(decodedUrl)) !== null) {
    filename = parts[2] + '.svg';
    filedir = parts[1];
  }
  // Fandom has even an other URL scheme
  else if ((parts = FANDOM_IMAGE_URL_REGEX.exec(decodedUrl)) !== null) {
    filename = parts[2];
    filedir = parts[1];
  }
  // Default behaviour (make a hash of the URL)
  else {
    filename = crypto.createHash('md5').update(decodedUrl).digest('hex') + path.extname(new URL(url).pathname);
  }
  if (filedir) {
    filedir = crypto.createHash('md5').update(filedir).digest('hex') + '/';
  }
  if (escape) {
    filename = encodeURIComponent(filename);
  }
  return `${config.output.dirs.assets}/${filedir}${filename}`;
}

/**
 * This function extracts the title from an HTML string and returns it stripped of any HTML tags.
 *
 * @param {string} html - The `html` parameter is a string that represents an HTML document. The
 * function extracts the title of the document from this HTML string.
 *
 * @returns a string that represents the title of an HTML document with all HTML tags removed. If the
 * title cannot be found in the input HTML string, an empty string is returned.
 */
export function getStrippedTitleFromHtml(html) {
  let [, , title = ''] = html.match(/<title( [^>]*)?>(.*)<[/]title>/i) || [];
  if (!title) {
    // Regex found nothing: fall back to a real DOM parse.
    const doc = domino.createDocument(html);
    const titleEl = doc.querySelector('title');
    title = titleEl ? titleEl.textContent : '';
  }
  return title.replace(/<[^>]*>?/gm, '');
}

/**
 * Combine several arrays index by index.
 *
 * NOTE(review): `','.repeat(len).split(',')` yields `len + 1` entries, so the
 * result carries one extra trailing row filled with `undefined`. This is
 * preserved because callers are not visible from here - confirm before changing.
 *
 * @param {...Array} args - Arrays to combine.
 * @returns {Array<Array>} Rows `[args[0][i], args[1][i], ...]`.
 */
export function zip(...args) {
  const len = Math.max(...args.map((arr) => arr.length));
  return ','
    .repeat(len)
    .split(',')
    .map((_, i) => {
      return args.map((arr) => arr[i]);
    });
}

/**
 * Remove entries that share the same key.
 *
 * The input is left untouched; the result is ordered by key.
 *
 * @param {Array} _arr - Items to de-duplicate.
 * @param {function(*): *} getter - Extracts the comparison key of an item.
 * @returns {Array} Sorted items, one per distinct key (the last of each run of equal keys).
 */
export function deDup(_arr, getter) {
  // Sort a copy: Array.prototype.sort works in place and would reorder the caller's array.
  const arr = [..._arr].sort((a, b) => (getter(a) < getter(b) ? -1 : 1));
  return arr.filter((item, index, arr) => {
    if (index + 1 === arr.length) {
      return true;
    }
    return getter(item) !== getter(arr[index + 1]);
  });
}

/**
 * Build the path of a file relative to an article.
 *
 * @param {string} parentArticleId - Article id; each '/' it contains adds one '../'.
 * @param {string} fileBase - Path of the file from the root.
 * @returns {string} Relative path.
 */
export function getRelativeFilePath(parentArticleId, fileBase) {
  const slashesInUrl = parentArticleId.split('/').length - 1;
  const upStr = slashesInUrl ? '../'.repeat(slashesInUrl) : './';
  return upStr + fileBase;
}

/**
 * Rewrite the first '../node_modules' of a path into 'node_module'.
 *
 * @param {string} path - Module path.
 * @returns {string} Rewritten path.
 */
export function normalizeModule(path) {
  return path.replace('../node_modules', 'node_module');
}

/**
 * Tell whether a path starts with '../node_module'.
 *
 * @param {string} path - Path to test.
 * @returns {boolean}
 */
export function isNodeModule(path) {
  return path.startsWith('../node_module');
}

/**
 * Serialise the own enumerable properties of an object as a URL query string.
 * Properties whose value is `undefined` are skipped.
 *
 * @param {Object} obj - Parameters.
 * @returns {string} `key=value` pairs joined with '&', both sides URI-component encoded.
 */
export function objToQueryString(obj) {
  const str = [];
  for (const p in obj) {
    // Called through Object.prototype so objects without a prototype are supported too.
    if (Object.prototype.hasOwnProperty.call(obj, p) && typeof obj[p] !== 'undefined') {
      str.push(encodeURIComponent(p) + '=' + encodeURIComponent(obj[p]));
    }
  }
  return str.join('&');
}

/**
 * Replace each of the characters & < > " ' * = / with a space.
 *
 * @param {string} str - Input string.
 * @returns {string} Sanitised string.
 */
export function sanitizeString(str) {
  return str.replace(/[&<>"'*=//]/g, ' ');
}

// We will need the encoded URL on article load so that we can set the hrefs of anchor tag correctly,
// but we must not encode the '/' character or else relative links may fail
export function encodeArticleIdForZimHtmlUrl(articleId) {
  return articleId && encodeURIComponent(articleId.startsWith('/') ? `./${articleId}` : articleId).replace(/%2F/g, '/');
}

/**
 * Append a trailing character when the input does not already end with it.
 * An empty input is returned unchanged.
 *
 * @param {string} input - String to complete.
 * @param {string} trailingChar - Character to enforce (it is backslash-escaped inside a character class).
 * @returns {string}
 */
export function ensureTrailingChar(input, trailingChar) {
  const pattern = `([^\\${trailingChar}])$`;
  const rx = new RegExp(pattern);
  return input.replace(rx, '$1' + trailingChar);
}

/**
 * Remove whatever FIND_HTTP_REGEX matches from a URL.
 *
 * @param {string} url - URL to strip.
 * @returns {string}
 */
export function stripHttpFromUrl(url) {
  return url.replace(FIND_HTTP_REGEX, '');
}

/** @returns {boolean} true when `mimeType` matches IMAGE_MIME_REGEX. */
export function isImageMimeType(mimeType) {
  return IMAGE_MIME_REGEX.test(mimeType);
}

/** @returns {boolean} true when `mimeType` matches BITMAP_IMAGE_MIME_REGEX. */
export function isBitmapImageMimeType(mimeType) {
  return BITMAP_IMAGE_MIME_REGEX.test(mimeType);
}

/** @returns {boolean} true when `content_type` matches WEBP_CANDIDATE_IMAGE_MIME_TYPE. */
export function isWebpCandidateImageMimeType(content_type) {
  return WEBP_CANDIDATE_IMAGE_MIME_TYPE.test(content_type);
}

/**
 * Reduce an axios-style error to a small plain object.
 *
 * @param {Object} err - Error with optional `config` and `response` properties.
 * @returns {{name, message, url, status, responseType, data}}
 */
export function cleanupAxiosError(err) {
  return {
    name: err.name,
    message: err.message,
    url: err.config?.url,
    status: err.response?.status,
    responseType: err.config?.responseType,
    data: err.response?.data,
  };
}

/**
 * Download a URL into the temporary directory.
 *
 * @param {string} url - URL to fetch; its last path segment is used as the file name.
 * @returns {Promise<string>} Path of the downloaded file.
 */
async function downloadListByUrl(url) {
  const fileName = url.split('/').slice(-1)[0];
  const { data: contentStream } = await Downloader.request({ url, method: 'GET', ...Downloader.streamRequestOptions });
  const filePath = path.join(await getTmpDirectory(), fileName);
  const writeStream = fs.createWriteStream(filePath);
  await new Promise((resolve, reject) => {
    contentStream
      .pipe(writeStream)
      .on('error', (err) => reject(err))
      .on('close', resolve);
  });
  return filePath;
}

/**
 * Expand the `articleList` option into a flat list of entries.
 *
 * The value is split on commas. Each part starting with 'http' that parses as a
 * URL is downloaded first; each part (or downloaded file) that exists on disk
 * is replaced by its non-empty lines; anything else is kept verbatim.
 *
 * @param {string} articleList - Comma-separated entries, file paths and/or URLs.
 * @returns {Promise<string[]>} Flattened list of entries.
 * @throws {Error} When a list URL cannot be downloaded.
 */
export async function extractArticleList(articleList) {
  const list = await Promise.all(
    articleList
      .split(',')
      .filter((n) => n)
      .map(async (part) => {
        let item = part.trim();
        if (item.indexOf('http') === 0) {
          let url;
          try {
            url = new URL(item);
          } catch {
            // URL is not valid. Continue processing
          }
          if (url && url.href) {
            try {
              item = await downloadListByUrl(url.href);
            } catch (err) {
              // Keep the underlying failure reachable for diagnostics.
              throw new Error(`Failed to read articleList from URL: ${url.href}`, { cause: err });
            }
          }
        }
        if (fs.existsSync(item)) {
          item = fs
            .readFileSync(item)
            .toString()
            .split('\n')
            .map((a) => a.replace(/\r/gm, ''))
            .filter((a) => a);
        }
        return item;
      }),
  );
  return list.flat(1);
}

/**
 * Return the temporary directory of this run, creating it on first use.
 *
 * @returns {Promise<string>} Absolute path of the directory.
 * @throws Whatever the directory creation fails with (after logging it).
 */
export async function getTmpDirectory() {
  if (!tmpDirectory) {
    tmpDirectory = path.resolve(os.tmpdir(), `mwoffliner-${Date.now()}`);
    try {
      logger.info(`Creating temporary directory [${tmpDirectory}]`);
      await mkdirPromise(tmpDirectory);
    } catch (err) {
      logger.error('Failed to create temporary directory, exiting', err);
      throw err;
    }
  }
  return tmpDirectory;
}

/**
 * Truncate text so that its UTF-8 encoding fits in `maxBytes` bytes, without
 * ever cutting in the middle of a character.
 *
 * @param {string} text - Text to truncate.
 * @param {number} maxBytes - Maximum size of the UTF-8 encoded result.
 * @returns {string} `text` itself when it already fits, otherwise its longest fitting prefix.
 */
export function truncateUtf8Bytes(text, maxBytes) {
  const encoder = new TextEncoder();
  if (encoder.encode(text).length <= maxBytes) {
    return text;
  }
  // Binary search for the first prefix length whose encoding exceeds maxBytes.
  let low = 0;
  let high = text.length;
  while (low < high) {
    const mid = Math.floor((low + high) / 2);
    const sliceBytes = encoder.encode(text.slice(0, mid)).length;
    if (sliceBytes <= maxBytes) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  // Clamp so that a negative maxBytes cannot turn into slice(0, -1).
  let end = Math.max(0, low - 1);
  // A lone high surrogate encodes to 3 bytes, one less than the full pair, so the
  // search may stop between the two halves of a pair: drop the dangling half.
  if (end > 0) {
    const lastUnit = text.charCodeAt(end - 1);
    const nextUnit = text.charCodeAt(end);
    const isHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    const isLowSurrogate = nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
    if (isHighSurrogate && isLowSurrogate) {
      end -= 1;
    }
  }
  return text.slice(0, end);
}

/**
 * Parse a Retry-After header into a timestamp (ms since epoch).
 * @param {string} headerValue The value of the Retry-After header.
 * @returns {number|null} Epoch time in ms at which you can retry, or null if invalid.
 */
export function parseRetryAfterHeader(headerValue) {
  if (!headerValue) return null;
  // Trim whitespace just in case
  const value = headerValue.trim();
  // Case 1: delta-seconds (numeric)
  if (/^\d+$/.test(value)) {
    return Date.now() + Number.parseInt(value, 10) * 1000;
  }
  // Case 2: HTTP-date
  const date = Date.parse(value);
  if (!Number.isNaN(date)) {
    return date;
  }
  // Could not parse
  return null;
}
//# sourceMappingURL=misc.js.map