contract-scraper
Version:
A customisable data scraper for the web based on JSON contracts
66 lines (65 loc) • 2.54 kB
JavaScript
;
/**
 * Compiler helper: exposes property `k` of `m` on `o` under the name `k2`
 * (falls back to `k` when `k2` is undefined).
 *
 * On engines with `Object.create`, the property is defined through a
 * descriptor. The source property's own descriptor is reused only when it is
 * an accessor on an `__esModule` object, or a data property that is neither
 * writable nor configurable; in every other case an enumerable getter that
 * reads `m[k]` at access time is installed instead. Without `Object.create`
 * the current value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (o, m, k, k2) {
        var targetKey = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        var reuseDescriptor;
        if (!descriptor) {
            reuseDescriptor = false;
        } else if ("get" in descriptor) {
            // Accessor property: only trusted as-is when the source is flagged as an ES module.
            reuseDescriptor = Boolean(m.__esModule);
        } else {
            // Data property: reusable only if it can never change.
            reuseDescriptor = !(descriptor.writable || descriptor.configurable);
        }
        if (!reuseDescriptor) {
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, targetKey, descriptor);
    }
    : function (o, m, k, k2) {
        o[k2 === undefined ? k : k2] = m[k];
    });
/**
 * Compiler helper: stores `v` as the `default` member of `o`.
 *
 * With `Object.create` available the member is defined as an enumerable
 * property via `Object.defineProperty` (so it is not writable or
 * configurable); otherwise it is a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (o, v) {
        var descriptor = { enumerable: true, value: v };
        Object.defineProperty(o, "default", descriptor);
    }
    : function (o, v) {
        o["default"] = v;
    });
/**
 * Compiler helper backing `import * as ns from "..."`.
 *
 * A truthy `mod` carrying `__esModule` is returned untouched. Anything else is
 * wrapped in a fresh object: every own enumerable key except "default" is
 * bound through `__createBinding`, and `mod` itself becomes the wrapper's
 * `default` via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        var hasOwn = Object.prototype.hasOwnProperty;
        for (var key in mod) {
            // Skip "default" (set separately below) and inherited keys.
            if (key === "default" || !hasOwn.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// Flag the exports object with `__esModule` (the same marker `__importStar` checks for).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. The referenced functions are declared further
// down; function declarations are hoisted, so assigning them here is safe.
exports.getContentTypeHeaders = getContentTypeHeaders;
exports.encodePageContents = encodePageContents;
exports.guessEncoding = guessEncoding;
const jschardet = __importStar(require("jschardet"));
const cheerio = __importStar(require("cheerio"));
const Iconv = __importStar(require("iconv"));
/**
 * Extracts the `charset` parameter from a Content-Type value.
 *
 * The parameter name is matched case-insensitively, and the value may be a
 * bare token or wrapped in double quotes; surrounding quotes and whitespace
 * are not part of the result.
 *
 * @param {string|undefined} contentType - e.g. `text/html; charset=utf-8`.
 * @returns {string|undefined} The charset label, or `undefined` when the
 *   value has no (non-empty) charset parameter.
 */
function getCharsetFromContentType(contentType) {
    // Non-string input is coerced the same way RegExp#exec would coerce it;
    // null/undefined simply yield no match.
    const value = String(contentType ?? '');
    const match = /charset\s*=\s*"?([^";\s]*)/i.exec(value);
    if (match === null || match[1] === '') {
        return undefined;
    }
    return match[1];
}
/**
 * Looks for a character-encoding declaration inside an HTML document.
 *
 * Two forms are recognised, in this order:
 *   1. `<meta charset="...">`
 *   2. `<meta http-equiv="Content-Type" content="...; charset=...">`
 *      (the `http-equiv` value is compared case-insensitively)
 *
 * @param {string|Buffer} contents - HTML handed to `cheerio.load`.
 * @returns {string|undefined} The declared charset, or `undefined` when the
 *   document declares none.
 */
function getContentTypeFromHTML(contents) {
    const $ = cheerio.load(contents);
    const metaCharset = $('meta[charset]').attr('charset');
    if (metaCharset) {
        return metaCharset;
    }
    // Legacy pragma form, still common on pages served in non-UTF-8 encodings.
    for (const element of $('meta[http-equiv]').toArray()) {
        const meta = $(element);
        if (String(meta.attr('http-equiv')).trim().toLowerCase() !== 'content-type') {
            continue;
        }
        const pragmaCharset = getCharsetFromContentType(meta.attr('content'));
        if (pragmaCharset) {
            return pragmaCharset;
        }
    }
    return undefined;
}
/**
 * Reads the Content-Type value from a plain headers object.
 *
 * The common spellings (`content-type`, `Content-type`, `Content-Type`) are
 * tried first; if none holds a truthy value, every own key is compared
 * case-insensitively so spellings such as `CONTENT-TYPE` are found too.
 *
 * @param {Object<string, string>|null|undefined} headers - Response headers.
 * @returns {string|undefined} The header value, or a falsy value when the
 *   header is absent or `headers` is null/undefined.
 */
function getContentTypeHeaders(headers) {
    if (headers == null) {
        return undefined;
    }
    const direct = headers['content-type'] || headers['Content-type'] || headers['Content-Type'];
    if (direct) {
        return direct;
    }
    for (const key of Object.keys(headers)) {
        if (key.toLowerCase() === 'content-type' && headers[key]) {
            return headers[key];
        }
    }
    // Nothing truthy found: hand back whatever the direct lookup produced.
    return direct;
}
/**
 * Converts page contents through an Iconv converter built from `encoding`
 * and `'UTF-8//IGNORE//TRANSLIT'`, returning the result as a string.
 *
 * Contents are returned untouched, without constructing a converter, when:
 *   - `encoding` is missing or not a string (nothing to convert with), or
 *   - `encoding` contains `windows-` (case-insensitive).
 *
 * @param {string|null|undefined} encoding - Encoding label for the converter.
 * @param {string|Buffer} contents - Page contents.
 * @returns {string|Buffer} The converted string, or `contents` itself on
 *   the passthrough paths.
 */
function encodePageContents(encoding, contents) {
    if (typeof encoding !== 'string' || encoding === '') {
        return contents;
    }
    if (encoding.toLowerCase().includes('windows-')) {
        return contents;
    }
    // Build the converter only once it is certain to be used.
    const Converter = Iconv['Iconv'];
    const converter = new Converter(encoding, 'UTF-8//IGNORE//TRANSLIT');
    return converter.convert(contents).toString('utf-8');
}
/**
 * Determines the encoding of a page, trying sources in priority order:
 *   1. the charset found in the Content-Type value,
 *   2. the charset declared in the HTML itself,
 *   3. the `encoding` reported by `jschardet.detect` on the contents.
 *
 * A source is skipped when it yields a falsy value.
 *
 * @param {string|undefined} contentType - Content-Type header value.
 * @param {string|Buffer} contents - Page contents.
 * @returns {*} The first truthy charset, otherwise whatever jschardet reports.
 */
function guessEncoding(contentType, contents) {
    return (
        getCharsetFromContentType(contentType) ||
        getContentTypeFromHTML(contents) ||
        jschardet.detect(Buffer.from(contents)).encoding
    );
}