url-metadata
Request a URL and scrape the metadata from its HTML using Node.js or the browser.
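// A minimal usage sketch, assuming this file is wrapped by the package's Node.js and
// browser entry points (they would inject a fetch implementation as `_fetch` and an
// agent helper as `useAgent`; that wiring is an assumption here, not shown in this file):
//
//   const urlMetadata = require('url-metadata')
//   urlMetadata('https://www.npmjs.com/package/url-metadata', { descriptionLength: 200 })
//     .then((metadata) => console.log(metadata))
//     .catch((err) => console.error(err))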
const extractCharset = require('./lib/extract-charset')
const parse = require('./lib/parse')
module.exports = function (url, options, _fetch, useAgent) {
if (!options || typeof options !== 'object') options = {}
const opts = Object.assign(
// defaults
{
requestHeaders: {
'User-Agent': 'url-metadata (+https://www.npmjs.com/package/url-metadata)',
From: 'example@example.com'
},
requestFilteringAgentOptions: undefined, // Node.js v18+ only, silently ignored by others
agent: undefined, // Node.js v6+ only; silently ignored by others
cache: 'no-cache', // Browser only
mode: 'cors', // Browser only
maxRedirects: 10,
timeout: 10000,
size: 0, // Node.js v6+ only; silently ignored by others
compress: true, // Node.js v6+ only; silently ignored by others
decode: 'auto',
descriptionLength: 750,
ensureSecureImageRequest: true,
includeResponseBody: false,
parseResponseObject: undefined
},
// user options override defaults
options
)
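// For example, a caller passing `{ timeout: 5000, decode: 'ISO-8859-1' }` overrides only
// those two keys and keeps every other default above, while a caller that already holds a
// fetch Response can pass `{ parseResponseObject: response }` with no url to skip the
// network request entirely (see the fetchData branch below).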
let requestUrl = ''
let destinationUrl = ''
let contentType
let charset
let currentResponse = null
async function fetchData (_url, redirectCount = 0) {
if (redirectCount > opts.maxRedirects) {
throw new Error('too many redirects')
}
if (!_url && opts.parseResponseObject) {
return opts.parseResponseObject
} else if (_url) {
// keep the originally requested url; the final destination after redirects is tracked separately as destinationUrl
requestUrl = url
const requestOpts = {
method: 'GET',
headers: opts.requestHeaders,
agent: opts.agent || useAgent(_url, opts.requestFilteringAgentOptions), // choose the agent per hop so redirects are covered too
cache: opts.cache,
mode: opts.mode,
redirect: 'manual',
timeout: opts.timeout,
size: opts.size,
compress: opts.compress
}
// Make the fetch request; because `redirect: 'manual'` is set above,
// 3xx responses are followed here by recursing on the resolved location
const response = await _fetch(_url, requestOpts)
if (response.status >= 300 && response.status < 400 && response.headers.get('location')) {
// resolve a relative `location` header against the url of the current hop
const newUrl = new URL(response.headers.get('location'), _url).href
return fetchData(newUrl, redirectCount + 1)
}
return response
} else {
// neither a url nor opts.parseResponseObject was provided
throw new Error('url parameter is missing')
}
}
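// Example of the recursion above (hypothetical URLs): a request to https://bit.ly/abc
// answered with 301 and `location: /article` recurses into
// fetchData('https://bit.ly/article', 1); the first non-3xx response is returned, and
// exceeding opts.maxRedirects hops throws 'too many redirects'.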
return new Promise((resolve, reject) => {
fetchData(url)
.then((response) => {
if (!response) {
return reject(new Error(`response is ${typeof response}`))
}
if (!response.ok) {
return reject(new Error(`response code ${response.status}`))
}
// Keep a reference to the response for the parse step below and for cleanup in .catch()
currentResponse = response
// disambiguate `requestUrl` from final destination url
// (ex: links shortened by bit.ly)
if (response.url) destinationUrl = response.url
// validate response content type
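// (e.g. 'text/html; charset=utf-8' passes the check below, while 'application/json'
// or 'image/png' is rejected as unsupported)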
contentType = response.headers.get('content-type')
const isText = contentType && contentType.startsWith('text')
const isHTML = contentType && contentType.includes('html')
if (!isText || !isHTML) {
return reject(new Error(`unsupported content type: ${contentType}`))
}
return response.arrayBuffer()
})
.then(async (responseBuffer) => {
// if the previous .then already rejected, there is no buffer to process
if (!responseBuffer) return
// handle optional user-specified charset
if (opts.decode !== 'auto') {
charset = opts.decode
} else {
// otherwise extract the charset in opts.decode='auto' mode
charset = extractCharset(contentType, responseBuffer)
}
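// For example, a `content-type: text/html; charset=ISO-8859-1` header should yield
// 'ISO-8859-1' here; when the header names no charset, extract-charset also receives the
// raw bytes, presumably so it can look for an in-document declaration such as a
// <meta charset> tag (./lib/extract-charset defines the exact fallback rules).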
try {
// decode with charset
const decoder = new TextDecoder(charset)
const responseDecoded = decoder.decode(responseBuffer)
// now parse the metadata!
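// `parse` (./lib/parse) builds the metadata object this promise resolves with; per the
// package description it is scraped from the HTML, e.g. the page title and description
// (the exact keys are defined in ./lib/parse, not here).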
resolve(parse(
requestUrl,
destinationUrl,
responseDecoded,
currentResponse.headers,
opts
))
} catch (e) {
return reject(new Error(`failed to decode response with charset: ${charset}`))
}
})
.catch(error => {
// Cleanup resources to avoid memory leaks
if (currentResponse && currentResponse.body) {
// `node-fetch` exposes a Node.js stream on `body`; destroy it to force-close the connection
if (typeof currentResponse.body.destroy === 'function') currentResponse.body.destroy()
// Modern browsers and Node.js 18+ have cancel() on the ReadableStream
else if (typeof currentResponse.body.cancel === 'function') currentResponse.body.cancel().catch(() => {})
// Fallback: consume the stream to close the connection
else if (typeof currentResponse.text === 'function') currentResponse.text().catch(() => {})
}
// Finally, reject
return reject(error)
})
})
}