spider8831
Version:
A simple 88x31 banner scanner
182 lines (167 loc) • 5.64 kB
JavaScript
import { JSDOM } from "jsdom";
import { imageSize } from "image-size";
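/**
* Options accepted by the Spider8831 constructor.
* @typedef {object} Spider8831Options
* @property {number} [width] Required image width in pixels (defaults to 88).
* @property {number} [height] Required image height in pixels (defaults to 31).
* @property {RegExp | ((url: string) => boolean) | false} [follow] Pattern or predicate deciding which links are queued for crawling.
* @property {number} [depth] Depth limit; scan() returns empty results once its depth argument reaches this value (defaults to 5).
* @property {number} [timeout] Per-page scan timeout in milliseconds (defaults to 30000).
*/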
/**
* @typedef {object} Spider8831CallbackOpts
* @property {"image" | "link" | "error" | "timeout"} type
* @property {string} url
*/
/**
* @typedef {object} Spider8831Link
* @property {string} url
* @property {number} depth
*/
/**
* @typedef {object} Spider8831Image
* @property {string} url
* @property {Buffer} img
* @property {string} [link]
*/
/**
* @typedef {object} Spider8831Return
* @property {Spider8831Link[]} next
* @property {Spider8831Image[]} imgs
*/
/**
 * Resolves (with no value) once the given delay has elapsed.
 * @param {number} ms Delay in milliseconds.
 * @returns {Promise<void>}
 */
const wait = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
/**
 * Crawls web pages looking for images of an exact pixel size (88x31 banners
 * by default) and collects the links that should be crawled next.
 */
export default class Spider8831 {
  /** Link filter used when no `follow` option is given: neocities.org subdomains only. */
  static defaultFollow = /^https?:\/\/.+\.neocities\.org/;

  /**
   * Tells whether the path of a URL ends in a known image file extension.
   * @param {string} url Absolute URL; an unparsable URL makes `new URL` throw.
   * @returns {boolean}
   */
  static imgURL(url) {
    const { pathname } = new URL(url);
    return /\.(png|jpe?g|gif|webp|bmp)$/.test(pathname);
  }

  /**
   * @param {Spider8831Options} [options]
   */
  constructor(options = {}) {
    this.width = options.width ?? 88;
    this.height = options.height ?? 31;
    // The default must be the RegExp itself: wrapping it in a function made the
    // predicate return a (always truthy) RegExp object, so every link was followed.
    this.follow = options.follow ?? Spider8831.defaultFollow;
    this.depth = options.depth ?? 5;
    this.timeout = options.timeout ?? 30000;
    /** @type {string[]} */
    this.visited = [];
  }

  /**
   * Downloads a page and parses it as HTML.
   * @param {string | URL} url
   * @returns {Promise<JSDOM>}
   */
  static async fetchJSDOM(url) {
    const response = await fetch(url);
    const html = await response.text();
    // Give the document its real address (after redirects) so that relative
    // href/src attributes resolve against the page instead of about:blank.
    const documentURL = response.url || Spider8831.getURL(url);
    return new JSDOM(html, { contentType: "text/html", url: documentURL });
  }

  /**
   * Downloads a resource and returns its raw bytes.
   * @param {string | URL} url
   * @returns {Promise<Buffer>}
   */
  static async fetchImage(url) {
    const response = await fetch(url);
    return Buffer.from(await response.arrayBuffer());
  }

  /**
   * Reads the pixel dimensions of an encoded image.
   * @param {Buffer} buf
   * @returns {[number, number]} `[width, height]`
   */
  static getImageSize(buf) {
    const { width, height } = imageSize(buf);
    return [width, height];
  }

  /**
   * Normalises a URL argument to its string form.
   * @param {string | URL} url
   * @returns {string}
   */
  static getURL(url) {
    return url instanceof URL ? url.href : url;
  }

  /**
   * Decides whether a link should be queued: it must not have been visited
   * yet and must be accepted by the `follow` option.
   * @param {string} url
   * @returns {boolean}
   */
  checkURL(url) {
    if (this.visited.includes(url)) return false;
    const { follow } = this;
    // `follow: false` is a documented option meaning "never follow links".
    if (!follow) return false;
    if (follow instanceof RegExp) return url.match(follow) !== null;
    return Boolean(follow(url));
  }

  /**
   * Scans a single page: collects the links accepted by `checkURL` and every
   * image whose size matches `width` x `height`. The caller drives recursion
   * by scanning the returned `next` entries with their `depth`.
   * @param {string | URL} url
   * @param {number} [depth]
   * @param {(opts: Spider8831CallbackOpts) => void} [cb]
   * @returns {Promise<Spider8831Return>} Partial results if the scan timed out.
   */
  async scan(url, depth = 0, cb = (_opts) => {}) {
    if (depth >= this.depth) return { next: [], imgs: [] };
    const pageURL = Spider8831.getURL(url);
    this.visited.push(pageURL);

    let timedOut = false;
    /** @type {Spider8831Link[]} */
    const next = [];
    /** @type {Spider8831Image[]} */
    const imgs = [];

    // Resolve against the page URL (not just its origin) so relative paths on
    // pages inside sub-directories are correct; malformed references yield null
    // instead of aborting the whole scan.
    const resolve = (ref) => {
      try {
        return new URL(ref, pageURL).href;
      } catch {
        return null;
      }
    };

    const crawl = async () => {
      /** @type {JSDOM} */
      let dom;
      try {
        dom = await Spider8831.fetchJSDOM(pageURL);
      } catch (_) {
        cb?.({ type: "error", url: pageURL });
        return;
      }
      if (timedOut) return;

      const { document } = dom.window;

      for (const anchor of document.querySelectorAll("a")) {
        if (!anchor.href) continue;
        const href = resolve(anchor.href);
        if (href === null || !this.checkURL(href)) continue;
        cb?.({ type: "link", url: href });
        next.push({ depth: depth + 1, url: href });
      }

      for (const element of document.querySelectorAll("img")) {
        if (!element.src) continue;
        const src = resolve(element.src);
        if (src === null) continue;
        this.visited.push(src);

        // Remember where the banner points when it is directly wrapped in a link.
        const parent = element.parentElement;
        const link =
          parent && parent.tagName.toLowerCase() === "a" && parent.href
            ? resolve(parent.href)
            : null;

        try {
          const img = await Spider8831.fetchImage(src);
          if (timedOut) return;
          const [width, height] = Spider8831.getImageSize(img);
          if (width !== this.width || height !== this.height) continue;
          cb?.({ type: "image", url: src });
          /** @type {Spider8831Image} */
          const found = { img, url: src };
          if (link !== null) found.link = link;
          imgs.push(found);
        } catch (_) {
          cb?.({ type: "error", url: src });
        }
      }
    };

    let timer;
    const deadline = new Promise((expire) => {
      timer = setTimeout(() => {
        timedOut = true;
        cb?.({ type: "timeout", url: pageURL });
        expire();
      }, this.timeout);
    });

    try {
      await Promise.race([deadline, crawl()]);
    } finally {
      // Without this, a finished scan would still report a "timeout" later and
      // keep the process alive until the timer fired.
      clearTimeout(timer);
    }
    return { imgs, next };
  }
}