UNPKG

getline

Version:

Classes for reading line-terminated data in files

412 lines (368 loc) 14.2 kB
/* Copyright (c) Chris Vine, 2011
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

var fs = require("fs");
var events = require("events");

// if line length exceeds or equals BUF_SIZE, then the file will be
// treated as a binary file and next() will throw an Error
// NOTE(review): 8092 is possibly a typo for 8192 - confirm before
// changing, since it alters the binary-file detection threshold
var BUF_SIZE = 8092;


// *** private helpers shared by GetlineSync and GetlineAsync ***

// Allocates the fixed-size read buffer.  Buffer.alloc() is used where
// available because the 'new Buffer(size)' constructor is deprecated
// and returns uninitialised memory; the constructor is retained only
// as a fallback for runtimes which do not provide Buffer.alloc()
function alloc_buffer() {
    if (typeof Buffer.alloc === "function") {
        return Buffer.alloc(BUF_SIZE);
    }
    return new Buffer(BUF_SIZE);
}

// Moves any part line held in [_pos, _end) of the getter's buffer to
// the front of the buffer, and resets _pos and _end accordingly.
// Buffer#copy() copes with overlapping source and target regions
function compact_buffer(getter) {
    var pending = getter._end > getter._pos ? getter._end - getter._pos : 0;
    if (pending > 0 && getter._pos > 0) {
        getter._buffer.copy(getter._buffer, 0, getter._pos, getter._end);
    }
    getter._pos = 0;
    getter._end = pending;
}

// Returns the position of the first '\n' byte in buffer at or after
// pos and before end, or the scan's stopping position (which is end
// whenever pos <= end) if none is found.  Only loaded bytes are
// examined, so stale data beyond end is never mistaken for a line end
function find_eol(buffer, pos, end) {
    var i = pos;
    for (; i < end; ++i) {
        if (buffer[i] === 0xA) { // '\n'
            break;
        }
    }
    return i;
}


// *** GetlineSync class ***

// The GetlineSync class uses synchronous open and read functions,
// which won't block on a file on a regular file system, and is useful
// for such things as reading configuration files on setting up a
// program.  For other uses, GetlineAsync will usually fit better into
// node.js's event based approach to program design, and is
// particularly important where reading might block the event loop
// because the file is, say, a file on a networked file system.  The
// constructor will throw an exception if the file name passed in
// doesn't exist or cannot be opened for some other reason.
//
// The class's next() method will cause a line of the file to be
// returned as a string (for which purpose it is assumed that the file
// is in utf8/ascii encoding).  Further calls to next() will advance
// through the file, line by line.  The terminating '\n' character is
// omitted.  The string "EOF" will be thrown by next() after
// end-of-file has been reached and the last line has been extracted,
// so that iteration can stop.
//
// The underlying file descriptor will be closed and the buffer
// released on end-of-file being reached or an error being thrown.  If
// the user finishes iterating before either of those events,
// GetlineSync's close() method can be called.
//
// The only public methods that GetlineSync has are next() and
// close().

exports.GetlineSync = GetlineSync;

// filename: path of the file to read.  Throws an Error if the file
// cannot be opened; the underlying error is attached as 'cause'
function GetlineSync(filename) {
    this._buffer = null;
    this._eof = false;
    try {
        this._fd = fs.openSync(filename, "r");
    }
    catch (e) {
        // make exception message more intelligible, but keep the
        // original error available for diagnosis
        var err = new Error("The following file cannot be opened for reading: " + filename);
        err.cause = e;
        throw err;
    }
    this._buffer = alloc_buffer();
    this._end = 0; // after any get, one past last loaded element in buffer
    this._pos = 0; // after any get, one past last '\n' found, or one
                   // past last byte in file
}

// Moves any part line to the front of the buffer and then loads more
// data after it.  Returns the number of bytes read (0 means EOF)
GetlineSync.prototype._refill = function () {
    compact_buffer(this);

    var res = fs.readSync(this._fd,             // file descriptor
                          this._buffer,         // byte buffer
                          this._end,            // buffer position of start of load
                          BUF_SIZE - this._end, // max length of load
                          null);                // OS file position: null = current pos
    this._end += res;
    return res;
};

// returns position in buffer of next '\n' byte in buffer, or
// this._end if none found.  It does not mutate any object data
GetlineSync.prototype._find_eol = function () {
    return find_eol(this._buffer, this._pos, this._end);
};

// Returns the next line of the file as a string, without its
// terminating '\n' (or "\r\n").  Throws the string "EOF" once the
// last line has been extracted, and an Error if the object has been
// closed, if reading fails or if a line reaches BUF_SIZE bytes
GetlineSync.prototype.next = function () {
    // check pre-conditions
    if (this._eof) {
        this.close();
        throw "EOF";
    }
    if (this._buffer === null) {
        throw new Error("GetlineSync file is closed");
    }

    var eol = this._find_eol();
    // unix read(), and so _refill(), can return less than the number
    // of bytes requested, so use a while loop and loop on _refill()
    // as necessary (possibly node guarantees a full read, but that is
    // not specified in the documentation and it is painless to use a
    // 'while' block rather than an 'if' block in case it doesn't)
    while (eol === this._end) {
        if (this._pos === 0 && this._end === BUF_SIZE) {
            // line length has exceeded or equalled BUF_SIZE
            this.close();
            throw new Error("Binary file detected: if this really is a line structured "
                            + "file, consider increasing BUF_SIZE");
        }

        var fetched;
        try {
            fetched = this._refill();
        }
        catch (e) {
            this.close();
            throw e;
        }

        if (fetched === 0) { // end-of-file
            // _refill() can reset _pos and _end, so we need to reset
            // eol even though fetched === 0
            eol = this._end;
            this._eof = true;
            break;
        }
        eol = this._find_eol();
    }

    var begin = this._pos;
    // don't return an empty string representing the phantom output of
    // the last line of the file with concluding '\n'
    if (this._eof === true && begin === eol) {
        this.close();
        throw "EOF";
    }

    // swallow '\n' for next read: we don't need to check whether
    // this._pos < this._end, because that could only happen if we
    // have reached end-of-file with no terminating '\n', and any
    // further attempts to call this method will throw "EOF" in case
    // of end-of-file
    this._pos = eol + 1;

    // also cater for DOS style line endings: only look at a byte
    // which belongs to this line
    if (eol > begin && this._buffer[eol - 1] === 0xD) { // '\r'
        --eol;
    }

    return this._buffer.toString("utf8", begin, eol);
};

// Closes the file descriptor and releases the buffer.  It is safe to
// call this more than once
GetlineSync.prototype.close = function () {
    if (this._buffer !== null) {
        // closing is deliberately best-effort: there is nothing
        // useful the caller could do with a failure to close
        try {
            fs.closeSync(this._fd);
        }
        catch (e) {}
        this._buffer = null;
    }
};


// *** GetlineAsync class ***

// The GetlineAsync class is an asynchronous version of GetlineSync,
// and often fits better into node.js's event based approach to
// program design than GetlineSync.  The constructor of this class
// takes a file descriptor, not a filename, otherwise a user may be
// presented with a partially constructed object if implemented with
// asynchronous fs.open().  Use fs.open()/fs.openSync() to obtain a
// file descriptor to pass in, as appropriate to the usage case.
//
// As in the case of GetlineSync, the file concerned must be in utf8
// or ascii encoding.
//
// Like GetlineSync, GetlineAsync has a next() method to iterate
// through the file line by line.  However, next() does not return
// anything.  Instead, GetlineAsync derives from EventEmitter, and
// emits "line" following a call to next(), once the line has been
// fully received and assembled.  The "line" event callback is passed
// three arguments: first an Error value (which will be null if there
// is no Error), secondly a boolean value indicating whether
// end-of-file has been reached, as end-of-file is not an error (in
// which case the third argument can be ignored, as it will be an
// empty string), and thirdly a string containing the line extracted,
// without terminating '\n'
//
// After the initial call to the next() method of GetlineAsync,
// further calls to that method should only be made once the "line"
// event callback for the previous call has begun executing: in other
// words, further calls should be made within the "line" event
// callback function or, say, in a callback chained to asynchronous
// calls made by that function.
//
// The underlying file descriptor will be closed and the buffer
// released on end-of-file being reached or an error arising and, to
// release other resources, any "line" event callbacks will
// automatically be removed once relevant emissions have been made
// informing listeners of the end-of-file or error.  If the user
// finishes iterating before either of those events, GetlineAsync's
// close() method can be called to achieve the same result.
//
// The only public methods that GetlineAsync has are next() and
// close().

exports.GetlineAsync = GetlineAsync;

// fd: an open file descriptor, as obtained from fs.open() or
// fs.openSync()
function GetlineAsync(fd) {
    events.EventEmitter.call(this);
    this._fd = fd;
    this._buffer = alloc_buffer();
    this._end = 0; // after any get, one past last loaded element in buffer
    this._pos = 0; // after any get, one past last '\n' found, or one
                   // past last byte in file
    this._eof = false;
    this._reading = false; // guard: true while a next() call is in course
}

// this does the same as util.inherits, except that
// GetlineAsync.prototype.constructor is enumerable on the prototype
// object, which I prefer in these cases, and this approach also makes
// it more obvious what is happening, which makes me happier
GetlineAsync.prototype = Object.create(events.EventEmitter.prototype);
GetlineAsync.prototype.constructor = GetlineAsync;

// Moves any part line to the front of the buffer and then starts an
// asynchronous load of more data after it.  The callback is passed
// to fs.read(), and has three arguments.  Unlike
// GetlineSync.prototype._refill(), this does not advance _end: that
// is for the callback to do
GetlineAsync.prototype._refill = function (cb) {
    compact_buffer(this);

    fs.read(this._fd,             // file descriptor
            this._buffer,         // byte buffer
            this._end,            // buffer position of start of load
            BUF_SIZE - this._end, // max length of load
            null,                 // OS file position: null = current pos
            cb);
};

// returns position in buffer of next '\n' byte in buffer, or
// this._end if none found.  It does not mutate any object data
GetlineAsync.prototype._find_eol = function () {
    return find_eol(this._buffer, this._pos, this._end);
};

// Emits the line running from _pos to eol (or end-of-file if there is
// no such line left), and releases the async guard
GetlineAsync.prototype._emit_line = function (eol) {
    var begin = this._pos;
    this._reading = false;

    // don't return an empty string representing the phantom output of
    // the last line of the file with concluding '\n'
    if (this._eof === true && begin === eol) {
        this.emit("line", null, true, "");
        this.close();
        return;
    }

    // swallow '\n' for next read: we don't need to check whether
    // this._pos < this._end, because that could only happen if we
    // have reached end-of-file with no terminating '\n', and any
    // further attempts to call next() after end-of-file will only
    // cause an end-of-file argument to be emitted
    this._pos = eol + 1;

    // also cater for DOS style line endings: only look at a byte
    // which belongs to this line
    if (eol > begin && this._buffer[eol - 1] === 0xD) { // '\r'
        --eol;
    }

    // Emit via process.nextTick() to avoid excessive recursion on
    // next() calls where asynchronous _refill() isn't called
    var self = this;
    var line = this._buffer.toString("utf8", begin, eol);
    process.nextTick(function () {
        self.emit("line", null, false, line);
    });
};

// Requests the next line of the file, which is delivered by a "line"
// event.  Errors and end-of-file are also reported by that event
GetlineAsync.prototype.next = function () {
    // check pre-conditions
    if (this._reading) {
        // next() has been called outside the "line" event callback
        this.emit("line",
                  new Error("GetlineAsync read operation already in course"),
                  false,
                  "");
        this.close(); // also sets _reading to false
        return;
    }
    if (this._eof) {
        this.emit("line", null, true, "");
        this.close();
        return;
    }
    if (this._buffer === null) {
        this.emit("line", new Error("GetlineAsync file is closed"), false, "");
        return;
    }

    this._reading = true;

    // unix read(), and so _refill(), can return less than the number
    // of bytes requested, so loop on _refill() as necessary (possibly
    // node guarantees a full read, but that is not specified in the
    // documentation so call a nested loop function in case it
    // doesn't)
    var self = this;

    function on_read(err, fetched) {
        // check whether we still hold our async guard and _buffer
        // still exists, because if the user has called next()
        // incorrectly so the first pre-condition above has not been
        // met, an error will have been propagated and the buffer
        // released (and similarly if close() has been called outside
        // a "line" event callback).  In that case nothing further is
        // reported, including any error from the abandoned read
        if (!self._reading) {
            return;
        }
        if (err) {
            self.emit("line", err, false, "");
            self.close();
            return;
        }
        if (fetched === 0) { // end-of-file
            self._eof = true;
            // as we have called _refill(), which resets _pos and
            // _end, eol may no longer hold the same value as _end, so
            // just pass _end instead of calling _find_eol() again
            // (which would give the same result less efficiently)
            self._emit_line(self._end);
            return;
        }
        self._end += fetched;
        loop(); // loop to see if we have read enough bytes to find eol
    }

    function loop() {
        var eol = self._find_eol();
        if (eol !== self._end) {
            self._emit_line(eol);
            return;
        }
        if (self._pos === 0 && self._end === BUF_SIZE) {
            // line length has exceeded or equalled BUF_SIZE
            self.emit("line",
                      new Error("Binary file detected: if this really is a line structured "
                                + "file, consider increasing BUF_SIZE"),
                      false,
                      "");
            self.close();
            return;
        }
        self._refill(on_read);
    }

    loop(); // initiate loop
};

// Closes the file descriptor, releases the buffer and the async
// guard, and removes all listeners.  It is safe to call this more
// than once
GetlineAsync.prototype.close = function () {
    if (this._buffer !== null) {
        // a callback must be supplied, as some versions of node throw
        // if it is omitted.  Closing is deliberately best-effort, so
        // any error passed to the callback is ignored
        fs.close(this._fd, function () {});
        this._buffer = null;
        this._reading = false;
        // go via event loop to allow a set of error/eof messages
        // before disconnecting listeners
        var self = this;
        process.nextTick(function () {
            self.removeAllListeners();
        });
    }
};