@akashacms/plugins-dlassets
Version:
Automatically download assets referenced in links, images, etc
318 lines (277 loc) • 11.7 kB
JavaScript
/**
*
* Copyright 2018-2025 David Herron
*
* This file is part of AkashaCMS-dlassets (http://akashacms.com/).
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import path from 'node:path';
import util from 'node:util';
import url from 'node:url';
import fs, { promises as fsp } from 'node:fs';
import got from 'got';
import mime from 'mime';
import bs58 from 'bs58';
import akasha, {
Configuration,
CustomElement,
Munger,
PageProcessor
} from 'akasharender';
const mahabhuta = akasha.mahabhuta;
const __dirname = import.meta.dirname;
const pluginName = "@akashacms/plugins-dlassets";
/**
 * AkashaCMS plugin which, when configured, registers the Mahabhuta
 * functions built by mahabhutaArray with the supplied configuration.
 */
export class DownloadAssetsPlugin extends akasha.Plugin {

    /** The configuration object most recently passed to configure(). */
    #config;

    constructor() {
        super(pluginName);
    }

    /** The configuration object recorded by configure(). */
    get config() {
        return this.#config;
    }

    /**
     * Record the configuration and options on this plugin, then add this
     * plugin's Mahabhuta function array to the configuration.
     *
     * @param config  Configuration object; its `akasha` property is stored
     *                on the plugin and its `addMahabhuta` method is called.
     * @param options Optional plugin options.  When falsy an empty object
     *                is stored instead.  The stored object receives a
     *                `config` property referring back to `config`.
     */
    configure(config, options) {
        this.#config = config;
        this.akasha = config.akasha;
        this.options = options || {};
        this.options.config = config;

        // The caller-supplied options value (not this.options) is what
        // gets handed to the MahafuncArray.
        const mahafuncs = mahabhutaArray(options, config, this.akasha, this);
        config.addMahabhuta(mahafuncs);
    }
}
/**
 * Build the MahafuncArray holding this plugin's three downloader
 * Mahafuncs: images first, then stylesheets, then JavaScript.
 *
 * @param options Passed through to the MahafuncArray constructor.
 * @param config  Passed to each downloader's constructor.
 * @param akasha  Passed to each downloader's constructor.
 * @param plugin  Passed to each downloader's constructor.
 * @returns The populated MahafuncArray.
 */
export function mahabhutaArray(
    options,
    config, // ?: Configuration,
    akasha, // ?: any,
    plugin  // ?: Plugin
) {
    const mahafuncs = new mahabhuta.MahafuncArray(pluginName, options);
    const downloaderClasses = [
        ExternalImageDownloader,
        ExternalStylesheetDownloader,
        ExternalJavaScriptDownloader,
    ];
    for (const Downloader of downloaderClasses) {
        mahafuncs.addMahafunc(new Downloader(config, akasha, plugin));
    }
    return mahafuncs;
}
// Cache of completed downloads.  downloadAsset looks up its href argument
// here before doing any work, and stores the { dlPath, pathRenderTo }
// object it returns here once a download has finished.
const hrefsDownloaded = new Map();
// TODO
//
// Handle outputMode binary vs utf8
//
// Reorganize into this process
//
// 1. compute dlDir = path.join('/___dlassets', dlpath_host)
// 2. ensure that directory exists
// 3. start fetch
// 4. Throw error if it fails
// 5. Hash uHref.path making sure it is a legit pathname
// 6. Depending on response.headers.get('content-type')
// append a file extension to the encoded path
// 7. compute dlPath = path.join(dlDir, encoded-path-and-extension)
// 8. stream response.body to that file ensuring it is in correct mode
/**
 * Download the asset named by `href` into the rendered site and report
 * where it was stored.  Results are cached in `hrefsDownloaded`, so a
 * given href is downloaded at most once per completed call.
 *
 * The file is stored under the URL path
 *
 *     /___dlassets/HOST_NAME/ENCODED-FN.ext
 *
 * HOST_NAME is the host from the URL with every '.' changed to '_'.
 *
 * ENCODED-FN exists because for some URLs the path string is complex and
 * not safe as a file name.  For example, Amazon uses file names like
 * 81yP%2B05t98L._SL1500_.jpg and the '%' causes problems when it is part
 * of a URL.  The pathname, search string and hash string are concatenated
 * and encoded in BASE58, which is safe for use in the file system.
 *
 * The extension is derived from the Content-Type of the response.
 *
 * @param config     Object whose `renderDestination` is the directory the
 *                   rendered site is written to.
 * @param options    Optional.  When `options.cachedir` is set the file is
 *                   written beneath that directory and then copied into
 *                   `config.renderDestination`.
 * @param href       The URL string as written in the document.  Used as
 *                   the cache key and, when absolute, as the URL to fetch.
 * @param uHref      URL object parsed from `href`.  Its host, pathname,
 *                   search and hash determine the stored file name.
 * @param outputMode Currently unused; see the TODO about handling
 *                   binary vs utf8.
 * @returns {Promise<{dlPath: string, pathRenderTo: string}>} `dlPath` is
 *          the site-relative URL path of the stored file, `pathRenderTo`
 *          the file's location beneath `config.renderDestination`.
 * @throws {Error} When the response is incomplete or its status is not
 *         200.  Errors from got and from the file system propagate.
 */
async function downloadAsset(config, options, href, uHref, outputMode) {

    if (hrefsDownloaded.has(href)) {
        return hrefsDownloaded.get(href);
    }

    // replaceAll converts every dot; chaining replace('.', '_') twice
    // only converted the first two, so hosts with three or more dots
    // were left partly unconverted.
    const dlpath_host = uHref.host
        ? uHref.host.replaceAll('.', '_')
        : "unknown-host";

    // Construct the full path string, then encode it as a string
    // which is safe to be used as a file name.
    const fullpath = uHref.pathname + uHref.search + uHref.hash;
    const bs58fn = bs58.encode(Buffer.from(fullpath));

    // dlDir ends up in HTML attributes, so it is built with POSIX
    // separators regardless of platform.
    const dlDir = path.posix.join('/___dlassets', dlpath_host);

    const dirRenderTo = path.join(config.renderDestination, dlDir);
    const dirWriteTo = (options && options.cachedir)
        ? path.join(options.cachedir, dlDir)
        : dirRenderTo;

    // A protocol-relative href ("//host/path") is not a URL got can
    // fetch as-is.  Resolve such a string against the scheme and host
    // already parsed into uHref.  An absolute href is fetched unchanged.
    // uHref.href is deliberately not used: callers may have rewritten
    // uHref.pathname solely to influence the stored file name.
    const fetchUrl = URL.canParse(href)
        ? href
        : new URL(href, `${uHref.protocol || 'http:'}//${uHref.host}`).href;

    await fsp.mkdir(dirWriteTo, { recursive: true });
    await fsp.mkdir(dirRenderTo, { recursive: true });

    // Start the download here
    //
    // The response object contains the Content-Type from which we get
    // the file name extension to use.
    const promise = got.get(fetchUrl, {
        responseType: 'buffer',
        followRedirect: true
    });
    const response = await promise;

    // The reason phrase is not compared: it is free-form text which a
    // server may omit or word differently, so the numeric status code
    // alone decides success.
    if (!(response.complete && response.statusCode === 200)) {
        throw new Error(`downloadAsset FAIL ${response.statusMessage} for ${href} ${util.inspect({
            ok: response.ok,
            complete: response.complete,
            statusCode: response.statusCode,
            status: response.status,
            statusMessage: response.statusMessage,
            headers: response.headers,
            contentType: response.headers['content-type'],
            url: response.url
        })}`);
    }

    // Prefer the extension implied by Content-Type.  When the header is
    // missing or names a type mime does not know, fall back to a plain
    // alphanumeric extension from the URL path, or to no extension at
    // all, rather than producing a file named "....null".
    const contentType = response.headers['content-type'];
    const mimeExt = contentType ? mime.getExtension(contentType) : null;
    const urlExt = path.posix.extname(uHref.pathname).slice(1);
    const ext = mimeExt || (/^[A-Za-z0-9]+$/.test(urlExt) ? urlExt : '');

    // NOTE(review): the encoded name is cut to 60 characters, so two
    // different long paths on one host could end up with the same file
    // name - confirm whether that can happen in practice.
    const stem = bs58fn.substring(0, 60);
    const dlFN = ext ? `${stem}.${ext}` : stem;

    const dlPath = path.posix.join(dlDir, dlFN);
    const pathWriteTo = path.join(dirWriteTo, dlFN);
    const pathRenderTo = path.join(dirRenderTo, dlFN);

    // I tried writing all this by doing a stream from Got
    // to a fs.createWriteStream but that didn't work.  This
    // solution of bringing the response into a Buffer etc is
    // less than ideal because of memory use.
    await fsp.writeFile(pathWriteTo, await promise.buffer());

    // With a cachedir the file was written there; copy it into the
    // rendered site.  dirRenderTo was already created above.
    if (pathWriteTo !== pathRenderTo) {
        await fsp.copyFile(pathWriteTo, pathRenderTo);
    }

    const ret = { dlPath, pathRenderTo };
    hrefsDownloaded.set(href, ret);
    return ret;
}
// Counter used to give each rewritten PlantUML image URL a distinct
// pathname (/image0.svg, /image1.png, ...).
let imgnum = 0;

/**
 * For each <img> in the body whose src refers to another site, download
 * the image via downloadAsset, point src at the downloaded copy, and
 * keep the original URL in data-orig-src.
 */
class ExternalImageDownloader extends Munger {

    get selector() { return 'html body img'; }

    /**
     * @param $        Not used by this method.
     * @param $img     The matched <img> element wrapper; read and updated
     *                 through its attr() and prop() methods.
     * @param metadata Not used by this method.
     * @param dirty    Not used by this method.
     * @returns {Promise<string>} Always "ok".
     * @throws {Error} When src is a www.plantuml.com/plantuml URL that is
     *         neither /plantuml/svg nor /plantuml/png.
     */
    async process($, $img, metadata, dirty) {
        const src = $img.attr('src');
        if (!src) return "ok";

        // There are various reasons to not download images.  For any
        // such instance, we simply return rather than proceeding with
        // calling downloadAsset.
        if (typeof $img.prop('nodownload') !== 'undefined') return "ok";

        // Relative URLs resolve against this placeholder base, so an
        // origin equal to the base marks a local URL.
        const uHref = new URL(src, 'http://example.com');
        if (uHref.origin === 'http://example.com') return "ok";

        // Only http(s) resources can be downloaded.  Sources such as
        // data: URIs are left untouched instead of being handed to
        // downloadAsset, where they would fail and be logged in full.
        if (uHref.protocol !== 'http:' && uHref.protocol !== 'https:') {
            return "ok";
        }

        // Special case, do not download favicons from Google's favicon service
        if (uHref.host === 'www.google.com'
         && uHref.pathname.startsWith('/s2/favicons')) {
            return "ok";
        }

        // For PlantUML URLs the pathname is replaced by a short numbered
        // name; downloadAsset derives the stored file name from it, while
        // the download itself still uses the original src string.
        if (uHref.host === 'www.plantuml.com'
         && uHref.pathname.startsWith('/plantuml')) {
            let ext;
            if (uHref.pathname.startsWith('/plantuml/svg')) ext = 'svg';
            else if (uHref.pathname.startsWith('/plantuml/png')) ext = 'png';
            else throw new Error(`Unknown plantuml image type in ${src}`);
            uHref.pathname = `/image${imgnum++}.${ext}`;
        }

        try {
            const { dlPath } = await downloadAsset(
                this.config, this.options, src, uHref, 'binary');
            $img.attr('src', dlPath);
            $img.attr('data-orig-src', src);
        } catch (e) {
            // Best effort: a failed download leaves the original src.
            console.log(`IGNORE ERROR akashacms-dlassets ExternalImageDownloader for URL ${src}: ${e.stack}`);
            $img.attr('src', src);
        }
        return "ok";
    }
}
/**
 * For each <link type="text/css"> in the head whose href refers to
 * another site, download the stylesheet via downloadAsset, point href at
 * the downloaded copy, and keep the original URL in data-orig-href.
 */
class ExternalStylesheetDownloader extends Munger {

    get selector() { return 'html head link'; }

    /**
     * @param $        Not used by this method.
     * @param $link    The matched <link> element wrapper; read and updated
     *                 through its attr() method.
     * @param metadata Not used by this method.
     * @param dirty    Not used by this method.
     * @returns {Promise<string>} Always "ok".
     */
    async process($, $link, metadata, dirty) {
        const type = $link.attr('type');
        const href = $link.attr('href');
        if (!href) return "ok";

        // NOTE(review): links without an explicit type="text/css" are
        // skipped, even if they are stylesheets - confirm this is intended.
        if (type !== 'text/css') return "ok";

        // Relative URLs resolve against this placeholder base, so an
        // origin equal to the base marks a local URL.
        const uHref = new URL(href, 'http://example.com');
        if (uHref.origin === 'http://example.com') return "ok";

        // Only http(s) resources can be downloaded.
        if (uHref.protocol !== 'http:' && uHref.protocol !== 'https:') {
            return "ok";
        }

        try {
            const { dlPath } = await downloadAsset(
                this.config, this.options, href, uHref, 'utf8');
            $link.attr('href', dlPath);
            $link.attr('data-orig-href', href);
        } catch (e) {
            // Best effort: a failed download leaves the original href.
            console.log(`IGNORE ERROR akashacms-dlassets ExternalStylesheetDownloader for URL ${href}: ${e.stack}`);
            $link.attr('href', href);
        }

        // Every path reports "ok", matching the image and JavaScript
        // downloaders; previously this method fell off the end.
        return "ok";
    }
}
/**
 * For each <script> in the head whose src refers to another site,
 * download the script via downloadAsset, point src at the downloaded
 * copy, and keep the original URL in data-orig-src.
 */
class ExternalJavaScriptDownloader extends Munger {

    get selector() { return 'html head script'; }

    /**
     * @param $        Not used by this method.
     * @param $script  The matched <script> element wrapper; read and
     *                 updated through its attr() method.
     * @param metadata Not used by this method.
     * @param dirty    Not used by this method.
     * @returns {Promise<string>} Always "ok".
     */
    async process($, $script, metadata, dirty) {
        const src = $script.attr('src');
        if (!src) return "ok";

        // Relative URLs resolve against this placeholder base, so an
        // origin equal to the base marks a local URL.
        const uHref = new URL(src, 'http://example.com');
        if (uHref.origin === 'http://example.com') return "ok";

        // Only http(s) resources can be downloaded.
        if (uHref.protocol !== 'http:' && uHref.protocol !== 'https:') {
            return "ok";
        }

        try {
            const { dlPath } = await downloadAsset(
                this.config, this.options, src, uHref, 'utf8');
            $script.attr('src', dlPath);
            $script.attr('data-orig-src', src);
        } catch (e) {
            // Best effort: a failed download leaves the original src.
            console.log(`IGNORE ERROR akashacms-dlassets ExternalJavaScriptDownloader for URL ${src}: ${e.stack}`);
            $script.attr('src', src);
        }
        return "ok";
    }
}