website-scrap-engine
Version:
Configurable website scraper in typescript
73 lines (69 loc) • 2 kB
text/typescript
import {ResourceType} from '../resource.js';
import {arrayToMap, isSiteMap} from '../util.js';
/**
 * Lookup table of lower-case file extensions (without the leading '.')
 * that {@link detectResourceType} classifies as binary resources.
 *
 * Intended to be immutable: the second argument (`true`) presumably asks
 * `arrayToMap` to freeze the resulting map - confirm in ../util.js.
 */
export const binaryExtension = arrayToMap([
  // images
  'gif',
  'jpg',
  'jpeg',
  'png',
  // scripts, json and plain text
  'js',
  'jsm',
  'json',
  'txt',
  // fonts
  'woff2',
  'ttf',
  'ttc',
  // xul documents
  'xul',
  // archives and compressed files
  'jar',
  'zip',
  'rar',
  '7z',
  'tar',
  'gz',
  'bz2',
  'xz',
  // audio
  'mp3',
  'ogg',
  // video
  'mp4',
  'flv',
  'm4v',
  'mkv',
  'webm',
  // installers, packages and other documents
  'msi',
  'xpi',
  'rdf',
  'pdf',
  'dia',
  'eot',
  'psd'
], true);
/**
 * Return the lower-cased extension of a path or url,
 * without the leading '.'.
 *
 * Only the path part is inspected: everything from the first '?' (query)
 * or the first '#' (fragment), whichever comes first, is ignored.
 * The extension is the text after the last '.' in the last portion
 * of that path.
 * If there is no '.' in the last portion of the path,
 * then it returns undefined (not an empty string).
 *
 * Differences from path.extname: the '.' is not part of the result,
 * the result is lower-cased, and a leading dot counts as a separator
 * ('.htaccess' yields 'htaccess').
 *
 * NOTE: the scheme and host are not parsed, so a url without any path
 * such as 'http://example.com' yields 'com'.
 *
 * @see path.extname
 * @param url the url to evaluate.
 * @returns the lower-cased extension, or undefined if there is none.
 */
export function lowerCaseExtension(url: string): string | void {
  // The path ends at the FIRST '?' or '#'. Searching for the last one
  // would leak query/fragment text into the result whenever a delimiter
  // is repeated, e.g. 'a.png#foo#bar' or 'a.css?v=1.2?x'.
  const queryStart = url.indexOf('?');
  const fragmentStart = url.indexOf('#');
  let pathEnd = url.length;
  if (queryStart !== -1) {
    pathEnd = queryStart;
  }
  if (fragmentStart !== -1 && fragmentStart < pathEnd) {
    pathEnd = fragmentStart;
  }
  const path = url.slice(0, pathEnd);
  const dotIndex = path.lastIndexOf('.');
  // A dot located before the last '/' belongs to a directory
  // (or to the host), not to the last portion of the path.
  if (dotIndex !== -1 && dotIndex > path.lastIndexOf('/')) {
    return path.slice(dotIndex + 1).toLowerCase();
  }
  return undefined;
}
/**
 * Refine the type of a resource from its url.
 *
 * - A url accepted by `isSiteMap` is always a {@link ResourceType.SiteMap},
 *   regardless of the given type.
 * - A resource given as {@link ResourceType.Html} is re-classified from
 *   its lower-cased extension: extensions listed in `binaryExtension`
 *   become Binary, 'css' becomes Css and 'svg' becomes Svg.
 * - In every other case the given type is returned unchanged.
 *
 * @param url the url of the resource.
 * @param type the type the resource is currently assumed to have.
 * @returns the detected resource type.
 */
export function detectResourceType(
  url: string,
  type: ResourceType
): ResourceType {
  if (isSiteMap(url)) {
    return ResourceType.SiteMap;
  }
  // Only resources currently assumed to be html are refined.
  if (type !== ResourceType.Html) {
    return type;
  }
  const extension: string | void = lowerCaseExtension(url);
  if (!extension) {
    return type;
  }
  // The extension comes from the url, so look it up as an own property:
  // on a plain object a bare index would also find inherited members
  // such as 'constructor', turning 'foo.constructor' into a binary.
  if (Object.prototype.hasOwnProperty.call(binaryExtension, extension) &&
    binaryExtension[extension]) {
    return ResourceType.Binary;
  }
  if (extension === 'css') {
    return ResourceType.Css;
  }
  if (extension === 'svg') {
    return ResourceType.Svg;
  }
  return type;
}