website-scrap-engine
Version:
Configurable website scraper in typescript
69 lines • 2.12 kB
JavaScript
import { ResourceType } from '../resource.js';
import { arrayToMap, isSiteMap } from '../util.js';
// Lookup table of lower-case file extensions (without the leading dot)
// that detectResourceType classifies as ResourceType.Binary.
// Immutable: the trailing `true` presumably asks arrayToMap to freeze
// the result - confirm against ../util.js.
export const binaryExtension = arrayToMap(
  [
    // raster images
    'gif',
    'jpg',
    'jpeg',
    'png',
    // scripts and plain data
    'js',
    'jsm',
    'json',
    'txt',
    // fonts
    'woff2',
    'ttf',
    'ttc',
    'xul',
    // archives and compressed files
    'jar',
    'zip',
    'rar',
    '7z',
    'tar',
    'gz',
    'bz2',
    'xz',
    // audio
    'mp3',
    'ogg',
    // video
    'mp4',
    'flv',
    'm4v',
    'mkv',
    'webm',
    // miscellaneous
    'msi',
    'xpi',
    'rdf',
    'pdf',
    'dia',
    'eot',
    'psd',
  ],
  true,
);
/**
 * Return the lower-cased extension of the path portion of a url or path:
 * the text after the last '.' in the last path segment, without the dot.
 * Everything from the first '?' or '#' onwards (query string and fragment)
 * is ignored, so extra '?', '#', '/' or '.' characters inside the query or
 * fragment never affect the result.
 *
 * The url is scanned as plain text rather than parsed, so for a host-only
 * url such as 'http://example.com' the text after the host's last '.'
 * ('com') is returned.
 *
 * @see path.extname
 * @param {string} url the url to evaluate.
 * @returns {string|undefined} the extension in lower case; an empty string
 * if the last path segment ends with '.'; undefined if the last path
 * segment contains no '.'.
 */
export function lowerCaseExtension(url) {
  // The path ends at the first '?' or '#', whichever comes first.
  // Later occurrences are part of the query or fragment, not delimiters.
  let pathEnd = url.length;
  for (const delimiter of ['?', '#']) {
    const index = url.indexOf(delimiter);
    if (index !== -1 && index < pathEnd) {
      pathEnd = index;
    }
  }
  const path = url.slice(0, pathEnd);
  const dotIndex = path.lastIndexOf('.');
  // No dot at all, or the last dot belongs to a parent segment
  // (it sits before the last '/'): there is no extension.
  if (dotIndex <= path.lastIndexOf('/')) {
    return undefined;
  }
  return path.slice(dotIndex + 1).toLowerCase();
}
/**
 * Refine the resource type of a url from its file extension.
 *
 * - A url recognised by isSiteMap is always ResourceType.SiteMap.
 * - Otherwise, only urls typed as ResourceType.Html are re-classified:
 *   an extension listed in binaryExtension gives ResourceType.Binary,
 *   'css' gives ResourceType.Css and 'svg' gives ResourceType.Svg.
 * - In every other case the given type is returned unchanged.
 *
 * @param {string} url the url of the resource.
 * @param type the resource type assumed so far.
 * @returns the refined resource type.
 */
export function detectResourceType(url, type) {
  if (isSiteMap(url)) {
    return ResourceType.SiteMap;
  }
  // Only resources typed as html are second-guessed by extension.
  if (type !== ResourceType.Html) {
    return type;
  }
  const extension = lowerCaseExtension(url);
  if (!extension) {
    return type;
  }
  // Own-property check: if binaryExtension is a plain object, a bare lookup
  // would find inherited members (e.g. 'constructor', '__proto__') and
  // misclassify such urls as binary.
  if (Object.prototype.hasOwnProperty.call(binaryExtension, extension) &&
    binaryExtension[extension]) {
    return ResourceType.Binary;
  }
  if (extension === 'css') {
    return ResourceType.Css;
  }
  if (extension === 'svg') {
    return ResourceType.Svg;
  }
  return type;
}
//# sourceMappingURL=detect-resource-type.js.map