UNPKG

unblocker

Version:

Web proxy for evading internet censorship & general-purpose library for rewriting remote websites.

192 lines (170 loc) 6.56 kB
"use strict"; var debug = require("debug")("unblocker:charsets"); var Transform = require("stream").Transform; var PassThrough = require("stream").PassThrough; var iconv = require("iconv-lite"); var contentTypes = require("./content-types.js"); // content-types that might possibly have the charset in a meta tag function mayContainMeta(type) { var types = ["text/html", "application/xml+xhtml", "application/xhtml+xml"]; return types.indexOf(type) != -1; } function charsets(config) { function decodeCharset(data) { if (contentTypes.shouldProcess(config, data)) { var charset = contentTypes.getCharset(data); if (iconv.encodingExists(charset)) { // happy case, we know the encoding right away, so we can just return decode/recode streams data.charset = charset; data.stream = data.stream.pipe(iconv.decodeStream(charset)); debug("decoding %s charset via iconv stream", charset); } else if (mayContainMeta(data.contentType)) { debug("decoding unknown charset via iconv html stream"); data.charsetDecoder = new IconvHtmlStream(); data.charsetDecoder.on("charset", function (charset) { // note: while the recode stream will accept content before this and just output utf-8, it shouldn't actually receive any data because the decode stream buffers until *after* this event data.charset = charset; }); data.stream = data.stream.pipe(data.charsetDecoder); } else { debug("no charset info available, assuming utf8"); // semi-happy case. we know the content needs parsed but have no way of knowing it's charset. Hopefully .toString() will be good enough. No recoding data.stream = data.stream.pipe( new PassThrough({ encoding: "utf8", }) ); } // in all cases, we output utf8, so we want to make sure any headers and meta tags match that contentTypes.setHeader(data); data.stream = data.stream.pipe(new MetaCharsetReplacerStream()); } } return decodeCharset; } module.exports = charsets; // based on https://github.com/ashtuchkin/iconv-lite/blob/master/lib/streams.js var re_charset_finder = /<\?xml[^>]+encoding="([^">]+)"|<meta [^>]*charset=['"]?([^ '">]+)['"]/i; // warning: making this global causes it to not include the matched value in the results :/ // == Decoder stream ======================================================= function IconvHtmlStream(options) { this.buff = Buffer.alloc(0); this.isBuffering = true; this.conv = null; options = options || {}; this.rewrite = options.rewrite !== false; this.inputEncoding = "utf8"; this.encoding = options.encoding = "utf8"; // this is the *output* encoding this.conv = iconv.getEncoder(this.inputEncoding); Transform.call(this, options); } IconvHtmlStream.prototype = Object.create(Transform.prototype, { constructor: { value: IconvHtmlStream, }, }); IconvHtmlStream.prototype._transform = function (chunk, encoding, done) { if (!Buffer.isBuffer(chunk)) return done( new Error("delayed decoding stream needs buffers as its input.") ); if (this.isBuffering) { this.bufferAndTest(chunk, encoding, done); } else { this.stream(chunk, encoding, done); } }; IconvHtmlStream.prototype.stream = function (chunk, encoding, done) { try { var res = this.conv.write(chunk); if (res && res.length) this.push(res, this.encoding); done(); } catch (e) { done(e); } }; IconvHtmlStream.prototype.bufferAndTest = function (chunk, encoding, done) { this.buff = Buffer.concat([this.buff, chunk]); var str = this.buff.toString(); var charsetMatch = str.match(re_charset_finder); // extract the charset from a meta tag or the opening <?xml tag var endOfHead = str.match(/<\/head>/); // todo: consider matching on some other tags such as |<div |<span <a | to avoid buffering entire html snippets if (charsetMatch) { this.startStreaming(charsetMatch[1] || charsetMatch[2], encoding, done); } else if (endOfHead) { // go with the safest guess for the charset // todo: try using something like https://www.npmjs.com/package/detect-character-encoding here (although probably not that one specifically since it doesn't work on windows or 32-bit *nix) this.startStreaming("utf8", encoding, done); } else { debug("buffering"); // otherwise just buffer the chunk. Call done() to ensure that we get the next one. done(); } }; IconvHtmlStream.prototype.startStreaming = function (charset, encoding, done) { // setup the decoder if (iconv.encodingExists(charset)) { this.inputEncoding = charset; this.conv = iconv.getDecoder(this.inputEncoding); } else { console.error( "unrecognized charset %s, decoding as utf8", this.inputEncoding ); } this.emit("charset", this.inputEncoding); this.isBuffering = false; // decode and forward our existing buffer this.stream(this.buff, encoding, done); // cleanup to ensure _flush doesn't accidentally send data twice this.buff = null; }; IconvHtmlStream.prototype._flush = function (done) { var res; try { if (this.buff) { res = this.conv.write(this.buff); if (res && res.length) this.push(res, this.encoding); this.buff = null; } res = this.conv.end(); if (res && res.length) this.push(res, this.encoding); done(); } catch (e) { done(e); } }; var re_charset_replacer = /<\?xml[^>]+encoding="([^">]+)"|<meta [^>]*charset=['"]?([^ '">]+)['"]/gi; // similar to the charset_finder, except global function MetaCharsetReplacerStream(options) { options = options || {}; this.encoding = options.encoding = "utf8"; // this is the *output* encoding options.decodeStrings = false; // don't turn my strings back into a buffer! Transform.call(this, options); } MetaCharsetReplacerStream.prototype = Object.create(Transform.prototype, { constructor: { value: MetaCharsetReplacerStream, }, }); MetaCharsetReplacerStream.prototype._transform = function ( chunk, encoding, done ) { done( null, chunk .toString() .replace( re_charset_replacer, function (subChunk, xmlCharset, metaCharset) { var oldCharset = xmlCharset || metaCharset; var newSubChunk = subChunk.replace(oldCharset, "UTF-8"); debug( "rewriting charset meta tag from %s to %s", subChunk, newSubChunk ); return newSubChunk; } ) ); };