UNPKG

contract-scraper

Version:

A customisable data scraper for the web based on JSON contracts

66 lines (65 loc) 2.54 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.getContentTypeHeaders = getContentTypeHeaders; exports.encodePageContents = encodePageContents; exports.guessEncoding = guessEncoding; const jschardet = __importStar(require("jschardet")); const cheerio = __importStar(require("cheerio")); const Iconv = __importStar(require("iconv")); function getCharsetFromContentType(contentType) { const regex = /(?<=charset=)[^;]*/gm; const charset = regex.exec(contentType); if (charset === null) return; return charset[0]; } function getContentTypeFromHTML(contents) { const $ = cheerio.load(contents); return $('meta[charset]').attr('charset'); } function getContentTypeHeaders(headers) { return headers['content-type'] || headers['Content-type'] || headers['Content-Type']; } function encodePageContents(encoding, contents) { const lib = Iconv; const converter = lib['Iconv']; const iconv = new converter(encoding, 'UTF-8//IGNORE//TRANSLIT'); if (encoding.toLowerCase().includes('windows-')) { return contents; } return iconv.convert(contents).toString('utf-8'); } function guessEncoding(contentType, contents) { const headerCharset = getCharsetFromContentType(contentType); if (headerCharset) { return headerCharset; } const metaCharset = getContentTypeFromHTML(contents); if (metaCharset) { return metaCharset; } return jschardet.detect(Buffer.from(contents)).encoding; }