UNPKG

mwoffliner

Version:
399 lines 14.3 kB
import crypto from 'crypto';
import domino from 'domino';
import countryLanguage from '@ladjs/country-language';
import fs from 'fs';
import path from 'path';
import mkdirp from 'mkdirp';
import os from 'os';
import pathParser from 'path';
import { ZimArticle } from '@openzim/libzim';
import { config } from '../config.js';
import * as logger from '../Logger.js';
import {
  LATEX_IMAGE_URL_REGEX,
  FANDOM_IMAGE_URL_REGEX,
  WIKIHIERO_IMAGE_URL_REGEX,
  IMAGE_THUMB_URL_REGEX,
  FIND_HTTP_REGEX,
  BITMAP_IMAGE_MIME_REGEX,
  IMAGE_MIME_REGEX,
  WEBP_CANDIDATE_IMAGE_MIME_TYPE,
} from './const.js';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Pending/settled creation of the process-wide temporary directory (see getTmpDirectory).
let tmpDirectoryPromise = null;

/**
 * Tells whether a string looks like an e-mail address.
 *
 * @param {string} email - Candidate address.
 * @returns {boolean} True when the address matches the validation pattern.
 */
export function isValidEmail(email) {
  const emailRegex =
    /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
  return emailRegex.test(email);
}

/**
 * Lower-cases the first character of a value (coerced to string).
 *
 * @param {*} str - Value to transform.
 * @returns {string} The string with its first character lower-cased.
 */
export function lcFirst(str) {
  const text = String(str);
  return text.charAt(0).toLowerCase() + text.slice(1);
}

/**
 * Upper-cases the first character of a value (coerced to string).
 *
 * @param {*} str - Value to transform.
 * @returns {string} The string with its first character upper-cased.
 */
export function ucFirst(str) {
  const text = String(str);
  return text.charAt(0).toUpperCase() + text.slice(1);
}

/**
 * decodeURIComponent() that never throws: on malformed input the error is
 * logged as a warning and the input is returned unchanged.
 *
 * @param {string} uri - Encoded URI component.
 * @returns {string} Decoded value, or `uri` itself when decoding fails.
 */
function _decodeURIComponent(uri) {
  try {
    return decodeURIComponent(uri);
  } catch (error) {
    logger.warn(error);
    return uri;
  }
}
export { _decodeURIComponent as decodeURIComponent };

/**
 * Sets access and modification time of one or several files to "now".
 * Best effort: failures of individual fs.utimes() calls are ignored.
 *
 * @param {string|string[]} paths - A path or an array of paths.
 */
export function touch(paths) {
  // fs.utimes() interprets plain numbers as *seconds*; pass a Date so the
  // millisecond value of "now" is not misread as a far-future timestamp.
  const now = new Date();
  const targets = Array.isArray(paths) ? paths : [paths];
  targets.forEach((path) => {
    fs.utimes(path, now, now, () => null);
  });
}

/**
 * Resolves a (possibly relative) URL against a base URL.
 *
 * @param {string} url - URL to resolve.
 * @param {string} baseUrl - Base used for relative URLs.
 * @returns {string} Absolute URL.
 */
export function getFullUrl(url, baseUrl) {
  return new URL(url, baseUrl).toString();
}

/**
 * Extracts the size hint encoded in an image URL: either a pixel width
 * ("/120px-" or "-120px-") or, failing that, a multiplier ("-1.5x.").
 *
 * @param {string} url - Image URL.
 * @returns {{mult: (number|undefined), width: (number|undefined)}} Size hint.
 */
export function getSizeFromUrl(url) {
  let mult;
  let width;
  const widthMatch = url.match(/[\/-]([0-9]+)px-/);
  if (widthMatch) {
    width = Number(widthMatch[1]);
  } else {
    const multMatch = url.match(/-([0-9.]+)x\./);
    if (multMatch) {
      mult = Number(multMatch[1]);
    }
  }
  return { mult, width };
}

/**
 * Builds a random string of lower-case letters and digits.
 * Uses Math.random(), so it is not suitable for security purposes.
 *
 * @param {number} len - Number of characters.
 * @returns {string} Random string of length `len`.
 */
export function randomString(len) {
  const charSet = 'abcdefghijklmnopqrstuvwxyz0123456789';
  let str = '';
  for (let i = 0; i < len; i += 1) {
    str += charSet.charAt(Math.floor(Math.random() * charSet.length));
  }
  return str;
}

/**
 * Creates a directory (and missing parents) through mkdirp.
 *
 * @param {string} path - Directory to create.
 * @returns {Promise<*>} Settles with mkdirp's result; rejects on failure
 *   (including synchronous failures, which were previously returned as a value).
 */
export async function mkdirPromise(path) {
  return mkdirp(path, { recursive: true });
}

/**
 * Promise wrapper around fs.writeFile().
 *
 * @param {string} path - Destination file.
 * @param {string|Buffer} content - Data to write.
 * @param {string} [encoding='utf8'] - Encoding passed to fs.
 * @returns {Promise<null>} Resolves with null once written.
 */
export function writeFilePromise(path, content, encoding = 'utf8') {
  return fs.promises.writeFile(path, content, encoding).then(() => null);
}

/**
 * Promise wrapper around fs.readFile().
 *
 * @param {string} path - File to read.
 * @param {string} [encoding='utf8'] - Encoding passed to fs.
 * @returns {Promise<string|Buffer>} File content.
 */
export function readFilePromise(path, encoding = 'utf8') {
  return fs.promises.readFile(path, encoding);
}

/**
 * Strict-equality (===) membership test.
 *
 * @param {Array} arr - Haystack.
 * @param {*} value - Needle.
 * @returns {boolean} True if some element is === value.
 */
export function contains(arr, value) {
  return arr.some((v) => v === value);
}

/*
 * Move 'from'.childNodes to 'to' adding them before 'beforeNode'
 * If 'beforeNode' is null, the nodes are appended at the end.
 */
export function migrateChildren(from, to, beforeNode = null) {
  while (from.firstChild) {
    to.insertBefore(from.firstChild, beforeNode);
  }
}

/**
 * Loads the translation strings for a language, layered over a fallback language.
 * A file that cannot be read or parsed is skipped with a warning.
 *
 * @param {string} language - Primary language code.
 * @param {string} [fallbackLanguage='en'] - Language providing default strings.
 * @returns {Object<string, string>} Merged strings (without the '@metadata' entry).
 */
export function getStringsForLang(language, fallbackLanguage = 'en') {
  let strings = {};
  // Read fallbackLanguage first, so it initially populates the strings. Then, read the primary language file,
  // overriding default strings with the values from the primary language.
  for (const lang of [fallbackLanguage, language]) {
    try {
      const fileContents = fs.readFileSync(path.join(__dirname, `../../translation/${lang}.json`)).toString();
      const langStrings = JSON.parse(fileContents);
      delete langStrings['@metadata'];
      strings = { ...strings, ...langStrings };
    } catch (err) {
      logger.warn(`Couldn't find strings file for [${lang}]`);
    }
  }
  return strings;
}

/**
 * Replaces every `${key}` placeholder of a translation string with the
 * matching parameter value.
 *
 * @param {string} str - Template containing `${key}` placeholders.
 * @param {Object<string, *>} parameters - Placeholder values by key.
 * @returns {string} Interpolated string.
 */
export function interpolateTranslationString(str, parameters) {
  let newString = str;
  for (const key of Object.keys(parameters)) {
    // Literal split/join: the key is not interpreted as a regular expression and
    // '$' sequences in the value are not interpreted as replacement patterns.
    newString = newString.split(`\${${key}}`).join(parameters[key]);
  }
  return newString;
}

/**
 * Reads static resource files from the `res` directory and adds them to the
 * ZIM under the '-' namespace. The returned promise settles only once every
 * file has been processed; a failing file is logged and does not prevent the
 * remaining ones from being added.
 *
 * @param {Iterable<string>} staticFiles - File names relative to `res/`.
 * @param {Object} zimCreator - Object exposing `addArticle(article)`.
 * @returns {Promise<void>}
 */
export async function saveStaticFiles(staticFiles, zimCreator) {
  for (const file of staticFiles) {
    try {
      const staticFilesContent = await readFilePromise(pathParser.resolve(__dirname, `../../res/${file}`));
      const article = new ZimArticle({
        url: file.endsWith('.css') ? cssPath(file) : jsPath(file),
        data: staticFilesContent,
        ns: '-',
      });
      await zimCreator.addArticle(article);
    } catch (err) {
      logger.error(err);
    }
  }
}

/**
 * Builds the list of static file names from extension-less JS and CSS names.
 *
 * @param {string[]} jsStaticFiles - JS names without extension.
 * @param {string[]} cssStaticFiles - CSS names without extension.
 * @returns {string[]} JS names suffixed with '.js' followed by CSS names suffixed with '.css'.
 */
export function getStaticFiles(jsStaticFiles, cssStaticFiles) {
  const jsFiles = jsStaticFiles.map((jsFile) => jsFile.concat('.js'));
  const cssFiles = cssStaticFiles.map((cssFile) => cssFile.concat('.css'));
  return jsFiles.concat(cssFiles);
}

/**
 * Builds the path of a stylesheet, guaranteeing exactly one '.css' suffix.
 *
 * @param {string} css - Stylesheet name, with or without '.css'.
 * @param {string} [subDirectory=''] - Optional directory prefix.
 * @returns {string} Stylesheet path.
 */
export function cssPath(css, subDirectory = '') {
  return `${subDirectory ? `${subDirectory}/` : ''}${css.replace(/(\.css)?$/, '')}.css`;
}

/**
 * Builds the path of a script, guaranteeing exactly one '.js' suffix.
 *
 * NOTE(review): when `subDirectory` is truthy the prefix used is
 * `config.output.dirs.mediawiki`, not `subDirectory` itself (unlike cssPath).
 * Kept as-is because callers may rely on it - confirm whether intended.
 *
 * @param {string} js - Script name, with or without '.js'.
 * @param {string} [subDirectory=''] - Only its truthiness is used.
 * @returns {string} Script path.
 */
export function jsPath(js, subDirectory = '') {
  const scriptPath = isNodeModule(js) ? normalizeModule(js) : js;
  return `${subDirectory ? `${config.output.dirs.mediawiki}/` : ''}${scriptPath.replace(/(\.js)?$/, '')}.js`;
}

/**
 * Generates the <link> tag loading a stylesheet from an article page.
 *
 * @param {Object} config - Unused; kept for interface compatibility.
 * @param {string} css - Stylesheet name.
 * @param {string} articleId - Id of the article embedding the tag (its '/' count sets the '../' depth).
 * @param {string} [subDirectory=''] - Directory prefix forwarded to cssPath.
 * @returns {string} HTML <link> element.
 */
export function genHeaderCSSLink(config, css, articleId, subDirectory = '') {
  const resourceNamespace = '-';
  const slashesInUrl = articleId.split('/').length - 1;
  const upStr = '../'.repeat(slashesInUrl + 1);
  return `<link href="${upStr}${resourceNamespace}/${cssPath(css, subDirectory)}" rel="stylesheet" type="text/css"/>`;
}

/**
 * Generates the <script> tag loading a script from an article page.
 *
 * @param {Object} config - Unused; kept for interface compatibility.
 * @param {string} js - Script name.
 * @param {string} articleId - Id of the article embedding the tag (its '/' count sets the '../' depth).
 * @param {string} [subDirectory=''] - Forwarded to jsPath.
 * @param {string} [attributes=''] - Raw attributes inserted in the tag.
 * @returns {string} HTML <script> element.
 */
export function genHeaderScript(config, js, articleId, subDirectory = '', attributes = '') {
  const resourceNamespace = '-';
  const slashesInUrl = articleId.split('/').length - 1;
  const upStr = '../'.repeat(slashesInUrl + 1);
  const scriptPath = isNodeModule(js) ? normalizeModule(js) : js;
  return `<script ${attributes} src="${upStr}${resourceNamespace}/${jsPath(scriptPath, subDirectory)}"></script>`;
}

/**
 * Generates the canonical <link> tag of an article.
 *
 * @param {Object} config - Unused; kept for interface compatibility.
 * @param {string} webUrl - URL prefix of the online wiki.
 * @param {string} articleId - Article id (URI-component encoded in the output).
 * @returns {string} HTML <link rel="canonical"> element.
 */
export function genCanonicalLink(config, webUrl, articleId) {
  return `<link rel="canonical" href="${webUrl}${encodeURIComponent(articleId)}" />`;
}

/**
 * Normalizes the `format` option into a list of dump flavours.
 * A falsy option yields `['']`; every `true` entry is mapped to `''`.
 *
 * @param {boolean|string|Array<boolean|string>} format - Raw option value.
 * @returns {Array<string>} List of dump flavours.
 */
export function getDumps(format) {
  if (!format) {
    return [''];
  }
  const toDump = (value) => (value === true ? '' : value);
  if (Array.isArray(format)) {
    return format.map(toDump);
  }
  // A bare `true` is handled like a `true` array entry instead of yielding undefined.
  return [toDump(format)];
}

/**
 * Looks up the ISO 639-3 code of a language.
 *
 * @param {string} langIso2 - Language code handed to countryLanguage.getLanguage().
 * @returns {Promise<string>} ISO 639-3 code; rejects when the lookup fails or has no such code.
 */
export function getIso3(langIso2) {
  return new Promise((resolve, reject) => {
    countryLanguage.getLanguage(langIso2, (error, language) => {
      if (error || !language?.iso639_3) {
        reject(error || new Error(`No ISO 639-3 code found for language [${langIso2}]`));
      } else {
        resolve(language.iso639_3);
      }
    });
  });
}

/* Internal path/url functions */

/**
 * Derives the file name under which a media URL is stored.
 * Note: decodeURI() and, in the fallback branch, new URL() throw on malformed input.
 *
 * @param {string} url - Media URL.
 * @param {boolean} escape - When truthy the result is URI-component encoded.
 * @returns {string} Media file name.
 */
export function getMediaBase(url, escape) {
  const decodedUrl = decodeURI(url);
  let parts;
  let filename;
  // Image thumbs
  if ((parts = IMAGE_THUMB_URL_REGEX.exec(decodedUrl)) !== null) {
    // Remove trailing / in parts[1] if possible
    parts[1] = parts[1] ? parts[1].substring(0, parts[1].length - 1) : '';
    // Most common case
    if (!parts[1] || parts[1].length <= parts[3].length) {
      filename = parts[3];
    }
    // To handle /...px-thumbnail.jpg use case
    else {
      filename = parts[1] + (parts[4] || '');
    }
  }
  // Latex (equations)
  else if ((parts = LATEX_IMAGE_URL_REGEX.exec(decodedUrl)) !== null) {
    filename = parts[1] + '.svg';
  }
  // WikiHiero hieroglyphs (betting there won't be a name conflict with main namespace pictures)
  else if ((parts = WIKIHIERO_IMAGE_URL_REGEX.exec(decodedUrl)) !== null) {
    filename = parts[1];
  }
  // Fandom has even an other URL scheme
  else if ((parts = FANDOM_IMAGE_URL_REGEX.exec(decodedUrl)) !== null) {
    filename = parts[1];
  }
  // Default behaviour (make a hash of the URL)
  else {
    filename = crypto.createHash('md5').update(decodedUrl).digest('hex') + path.extname(new URL(url).pathname);
  }
  return escape ? encodeURIComponent(filename) : filename;
}

/**
 * This function extracts the title from an HTML string and returns it stripped of any HTML tags.
 *
 * @param {string} html - The `html` parameter is a string that represents an HTML document. The
 * function extracts the title of the document from this HTML string.
 *
 * @returns a string that represents the title of an HTML document with all HTML tags removed. If the
 * title cannot be found in the input HTML string, an empty string is returned.
 */
export function getStrippedTitleFromHtml(html) {
  let [, , title = ''] = html.match(/<title( [^>]*)?>(.*)<[/]title>/i) || [];
  if (!title) {
    // The single-line regex found nothing: fall back to a real DOM parse.
    const doc = domino.createDocument(html);
    const titleEl = doc.querySelector('title');
    title = titleEl ? titleEl.textContent : '';
  }
  return title.replace(/<[^>]*>?/gm, '');
}

/**
 * Zips arrays together: row `i` of the result holds element `i` of every input.
 * The result has as many rows as the longest input; shorter inputs contribute
 * `undefined`. With no input at all the result is empty.
 *
 * @param {...Array} args - Arrays to zip.
 * @returns {Array<Array>} Zipped rows.
 */
export function zip(...args) {
  const len = Math.max(0, ...args.map((arr) => arr.length));
  return Array.from({ length: len }, (_, i) => args.map((arr) => arr[i]));
}

/**
 * Removes duplicates from an array, comparing items by the key returned by `getter`.
 * The result is sorted by that key; the input array is left untouched.
 *
 * @param {Array} _arr - Items to de-duplicate.
 * @param {function(*): *} getter - Returns the comparison key of an item.
 * @returns {Array} Sorted items with unique keys (last item of each run of equal keys is kept).
 */
export function deDup(_arr, getter) {
  // Sort a copy so the caller's array is not reordered as a side effect.
  const sorted = [..._arr].sort((a, b) => (getter(a) < getter(b) ? -1 : 1));
  return sorted.filter((item, index, all) => {
    if (index + 1 === all.length) {
      return true;
    }
    return getter(item) !== getter(all[index + 1]);
  });
}

/**
 * Builds the relative URL from an article to a file stored in another namespace.
 *
 * @param {string} parentArticleId - Id of the referring article (its '/' count sets the '../' depth).
 * @param {string} fileBase - File name inside the namespace.
 * @param {string} resourceNamespace - Target namespace.
 * @returns {string} Relative URL.
 */
export function getRelativeFilePath(parentArticleId, fileBase, resourceNamespace) {
  const slashesInUrl = parentArticleId.split('/').length - 1;
  const upStr = '../'.repeat(slashesInUrl + 1);
  return `${upStr}${resourceNamespace}/` + fileBase;
}

/**
 * Rewrites the first '../node_modules' of a path to 'node_module'.
 *
 * @param {string} path - Module path.
 * @returns {string} Normalized path.
 */
export function normalizeModule(path) {
  return path.replace('../node_modules', 'node_module');
}

/**
 * Tells whether a path starts with '../node_module'.
 *
 * @param {string} path - Path to test.
 * @returns {boolean}
 */
export function isNodeModule(path) {
  return path.startsWith('../node_module');
}

/**
 * Serializes the own enumerable properties of an object as a URL query string.
 * Properties whose value is `undefined` are skipped.
 *
 * @param {Object} obj - Key/value pairs.
 * @returns {string} Query string without leading '?'.
 */
export function objToQueryString(obj) {
  return Object.entries(obj ?? {})
    .filter(([, value]) => typeof value !== 'undefined')
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`)
    .join('&');
}

/**
 * Replaces each of the characters & < > " ' * = / by a space.
 *
 * @param {string} str - Input string.
 * @returns {string} Sanitized string.
 */
export function sanitizeString(str) {
  return str.replace(/[&<>"'*=/]/g, ' ');
}

// We will need the encoded URL on article load so that we can set the hrefs of anchor tag correctly,
// but we must not encode the '/' character or else relative links may fail
export function encodeArticleIdForZimHtmlUrl(articleId) {
  return articleId && encodeURIComponent(articleId.startsWith('/') ? `./${articleId}` : articleId).replace(/%2F/g, '/');
}

/**
 * Appends `trailingChar` to `input` unless it already ends with it.
 * An empty `input` is returned unchanged.
 *
 * @param {string} input - String to complete.
 * @param {string} trailingChar - Required suffix.
 * @returns {string} `input` ending with `trailingChar`.
 */
export function ensureTrailingChar(input, trailingChar) {
  // Plain suffix test: no regular expression is built from trailingChar, so
  // characters such as 'd' or 'w' are not mistaken for the classes \d or \w.
  if (input === '' || input.endsWith(trailingChar)) {
    return input;
  }
  return input + trailingChar;
}

/**
 * Removes the part of a URL matched by FIND_HTTP_REGEX.
 *
 * @param {string} url - Input URL.
 * @returns {string} URL without the matched part.
 */
export function stripHttpFromUrl(url) {
  return url.replace(FIND_HTTP_REGEX, '');
}

/**
 * @param {string} mimeType - MIME type to test.
 * @returns {boolean} Whether it matches IMAGE_MIME_REGEX.
 */
export function isImageMimeType(mimeType) {
  return IMAGE_MIME_REGEX.test(mimeType);
}

/**
 * @param {string} mimeType - MIME type to test.
 * @returns {boolean} Whether it matches BITMAP_IMAGE_MIME_REGEX.
 */
export function isBitmapImageMimeType(mimeType) {
  return BITMAP_IMAGE_MIME_REGEX.test(mimeType);
}

/**
 * @param {string} content_type - MIME type to test.
 * @returns {boolean} Whether it matches WEBP_CANDIDATE_IMAGE_MIME_TYPE.
 */
export function isWebpCandidateImageMimeType(content_type) {
  return WEBP_CANDIDATE_IMAGE_MIME_TYPE.test(content_type);
}

/**
 * Reduces an Axios error to a small plain object suitable for logging.
 *
 * @param {Object} err - Error raised by Axios.
 * @returns {{name: *, message: *, url: *, status: *, responseType: *, data: *}} Summary.
 */
export function cleanupAxiosError(err) {
  return {
    name: err.name,
    message: err.message,
    url: err.config?.url,
    status: err.response?.status,
    responseType: err.config?.responseType,
    data: err.response?.data,
  };
}

/**
 * Downloads a remote article list into the temporary directory.
 *
 * @param {string} url - URL of the list.
 * @param {Object} downloader - Object exposing `request()` and `streamRequestOptions`.
 * @returns {Promise<string>} Path of the downloaded file.
 */
async function downloadListByUrl(url, downloader) {
  const fileName = url.split('/').slice(-1)[0];
  const { data: contentStream } = await downloader.request({ url, method: 'GET', ...downloader.streamRequestOptions });
  const filePath = path.join(await getTmpDirectory(), fileName);
  const writeStream = fs.createWriteStream(filePath);
  await new Promise((resolve, reject) => {
    // pipe() does not forward source errors to the destination: observe them
    // here, otherwise a failing download would leave this promise pending.
    contentStream.on('error', (err) => {
      writeStream.destroy();
      reject(err);
    });
    contentStream
      .pipe(writeStream)
      .on('error', (err) => reject(err))
      .on('close', resolve);
  });
  return filePath;
}

/**
 * Expands the `articleList` option into a flat list of article titles.
 * Each comma-separated part may be a title, a path to a local file (one title
 * per line) or an http(s) URL of such a file.
 *
 * @param {string} articleList - Comma-separated option value.
 * @param {Object} downloader - Used to fetch remote lists.
 * @returns {Promise<string[]>} Flat list of entries.
 * @throws {Error} When a remote list cannot be downloaded.
 */
export async function extractArticleList(articleList, downloader) {
  const list = await Promise.all(
    articleList
      .split(',')
      .filter((n) => n)
      .map(async (part) => {
        let item = part.trim();
        if (item.indexOf('http') === 0) {
          let url;
          try {
            url = new URL(item);
          } catch (e) {
            // URL is not valid. Continue processing
          }
          if (url && url.href) {
            try {
              item = await downloadListByUrl(url.href, downloader);
            } catch (e) {
              throw new Error(`Failed to read articleList from URL: ${url.href}`, { cause: e });
            }
          }
        }
        if (fs.existsSync(item)) {
          item = fs
            .readFileSync(item)
            .toString()
            .split('\n')
            .map((a) => a.replace(/\r/gm, ''))
            .filter((a) => a);
        }
        return item;
      }),
  );
  return list.flat(1);
}

/**
 * Returns the process-wide temporary directory, creating it on first use.
 * Concurrent callers share the same creation promise, so the path is only
 * handed out once the directory exists; a failed creation is not cached.
 *
 * @returns {Promise<string>} Absolute path of the temporary directory.
 */
export async function getTmpDirectory() {
  if (!tmpDirectoryPromise) {
    const directory = path.resolve(os.tmpdir(), `mwoffliner-${Date.now()}`);
    logger.info(`Creating temporary directory [${directory}]`);
    tmpDirectoryPromise = mkdirPromise(directory).then(
      () => directory,
      (err) => {
        // Forget the failed attempt so that a later call can retry.
        tmpDirectoryPromise = null;
        logger.error('Failed to create temporary directory, exiting', err);
        throw err;
      },
    );
  }
  return tmpDirectoryPromise;
}
//# sourceMappingURL=misc.js.map