UNPKG

url-metadata

Version:

Request a url and scrape the metadata from its HTML using Node.js or the browser.

42 lines (35 loc) 1.55 kB
module.exports = function (contentTypeHeader, responseBuffer) { // default to `utf-8` per html5 spec let charset = 'utf-8' // First, try extracting charset from `Content-Type` response header // Per spec, charset in header should take precedent // ex: "Content-Type": "text/html; charset=utf-8" if (contentTypeHeader) { const headerMatch = contentTypeHeader.match(/charset=([\w-]+)/) if (headerMatch && headerMatch[1]) { charset = headerMatch[1] return charset } } // If no charset found, try extracting charset from meta tags in html body // Convert the responseBuffer to a string temporarily using `utf-8` // Then use regex to get charset from meta tags const decoder = new TextDecoder('utf-8') const text = decoder.decode(responseBuffer) // <meta charset="windows-1251"> const charsetMetaRegex = /<meta\s+charset=['"]?([^\s'"]+)['"]?\s*\/?>/i const charsetMetaMatch = text.match(charsetMetaRegex) if (charsetMetaMatch && charsetMetaMatch[1]) { charset = charsetMetaMatch[1] return charset } // <meta http-equiv="Content-Type" content="text/html; charset=windows-1251" /> const httpEquivCharsetRegex = /<meta\s+[^>]*\bhttp-equiv\s*=\s*["']?Content-Type["']?[^>]*\bcharset\s*=\s*["']?([^'"\s>]+)["']?[^>]*>/i const httpEquivCharsetMatch = text.match(httpEquivCharsetRegex) if (httpEquivCharsetMatch && httpEquivCharsetMatch[1]) { charset = httpEquivCharsetMatch[1] return charset } // return default `utf-8` if none of the above found return charset }