// getline
// Classes for reading line-terminated data in files
/* Copyright (c) Chris Vine, 2011
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
var fs = require("fs");
var events = require("events");
// Size in bytes of the read buffer allocated by each GetlineSync and
// GetlineAsync object.
// if line length exceeds or equals BUF_SIZE, then the file will be
// treated as a binary file and next() will throw an Error (GetlineSync)
// or pass an Error to the "line" event callback (GetlineAsync)
// NOTE(review): 8092 may have been intended as 8192 (8 KiB) - confirm
// before changing
var BUF_SIZE = 8092;
// *** GetlineSync class ***
// The GetlineSync class uses synchronous open and read functions,
// which won't block on a file on a regular file system, and is useful
// for such things as reading configuration files on setting up a
// program. For other uses, GetlineAsync will usually fit better into
// node.js's event based approach to program design, and is
// particularly important where reading might block the event loop
// because the file is, say, a file on a networked file system. The
// constructor will throw an exception if the file name passed in
// doesn't exist or cannot be opened for some other reason.
//
// The class's next() method will cause a line of the file to be
// returned as a string (for which purpose it is assumed that the file
// is in utf8/ascii encoding). Further calls to next() will advance
// through the file, line by line. The terminating '\n' character is
// omitted. The string "EOF" will be thrown by next() after
// end-of-file has been reached and the last line has been extracted,
// so that iteration can stop.
//
// The underlying file descriptor will be closed and the buffer
// released on end-of-file being reached or an error being thrown. If
// the user finishes iterating before either of those events,
// GetlineSync's close() method can be called.
//
// The only public methods that GetlineSync has are next() and
// close().
exports.GetlineSync = GetlineSync;
// Constructor. Opens 'filename' synchronously for reading and sets up
// an empty read buffer of BUF_SIZE bytes for next() to fill.
//
// filename: path of the file to read.
//
// Throws an Error naming the file if it cannot be opened. The
// underlying error raised by fs.openSync() is attached to the thrown
// Error as its 'cause' property.
function GetlineSync(filename) {
    try {
        this._fd = fs.openSync(filename, "r");
    }
    catch (e) {
        this._buffer = null;
        this._eof = false;
        // make exception message more intelligible, but keep the
        // original error reachable for diagnostics
        var openError = new Error("The following file cannot be opened for reading: "
                                  + filename);
        openError.cause = e;
        throw openError;
    }
    // Buffer.alloc() is the supported replacement for the deprecated
    // 'new Buffer(size)' constructor, and always zero-fills
    this._buffer = Buffer.alloc(BUF_SIZE);
    this._end = 0;   // after any get, one past last loaded element in buffer
    this._pos = 0;   // after any get, one past last '\n' found, or one
                     // past last byte in file
    this._eof = false;
}
// Compacts the buffer and tops it up from the file.
//
// Any bytes not yet consumed (those in [_pos, _end)) are slid down to
// the start of the buffer, _pos is reset to 0, and the free space
// after them is then filled with a single fs.readSync() call at the
// current OS file position.
//
// Returns the number of bytes read by that call (0 means end-of-file).
// Errors thrown by fs.readSync() propagate to the caller.
GetlineSync.prototype._refill = function () {
    var buf = this._buffer;
    // number of unconsumed bytes being carried over to the front
    var carried = Math.max(this._end - this._pos, 0);
    // copyWithin() copes with overlapping source and destination
    buf.copyWithin(0, this._pos, this._end);
    this._pos = 0;
    this._end = carried;
    var bytesRead = fs.readSync(this._fd,            // file descriptor
                                buf,                 // byte buffer
                                carried,             // buffer position of start of load
                                BUF_SIZE - carried,  // max length of load
                                null);               // OS file position: null = current pos
    this._end += bytesRead;
    return bytesRead;
};
// Locates the next '\n' byte in the loaded part of the buffer,
// searching from _pos. Returns its buffer position, or _end if there
// is none before _end (or _pos itself if _pos is already at or past
// _end). It does not mutate any object data
GetlineSync.prototype._find_eol = function () {
    var start = this._pos;
    var limit = this._end;
    if (start >= limit) {
        return start;   // nothing loaded beyond the current position
    }
    var hit = this._buffer.indexOf(0xA, start);   // '\n'
    // a match at or beyond _end lies in stale bytes and does not count
    return (hit === -1 || hit >= limit) ? limit : hit;
};
// Returns the next line of the file as a string, decoded as utf8 and
// without its terminating '\n' (or "\r\n").
//
// Throws the string "EOF" once end-of-file has been reached and the
// last line has already been handed out. Throws an Error if the
// object has been closed, if a line is BUF_SIZE bytes or longer (the
// file is then treated as binary), and rethrows anything thrown by
// _refill(). The object is closed before "EOF", the binary-file Error
// or a _refill() error is thrown.
GetlineSync.prototype.next = function () {
    // pre-conditions: end-of-file is checked before the closed state
    if (this._eof) {
        this.close();
        throw "EOF";
    }
    if (this._buffer === null) {
        throw new Error("GetlineSync file is closed");
    }

    var lineEnd = this._find_eol();
    // A read can deliver fewer bytes than asked for, so keep
    // refilling until a '\n' turns up or the file is exhausted
    while (lineEnd === this._end) {
        var wholeBufferIsOneLine = (this._pos === 0 && this._end === BUF_SIZE);
        if (wholeBufferIsOneLine) {
            // line length has exceeded or equalled BUF_SIZE
            this.close();
            throw new Error("Binary file detected: if this really is a line structured "
                            + "file, consider increasing BUF_SIZE");
        }

        var bytesRead;
        try {
            bytesRead = this._refill();
        }
        catch (readError) {
            this.close();
            throw readError;
        }

        if (bytesRead === 0) {   // end-of-file
            this._eof = true;
            // _refill() has moved the pending bytes to the front, so
            // the previously computed position is stale
            lineEnd = this._end;
            break;
        }
        lineEnd = this._find_eol();
    }

    var lineStart = this._pos;
    // a file ending in '\n' must not yield a phantom empty last line
    if (this._eof === true && lineStart === lineEnd) {
        this.close();
        throw "EOF";
    }

    // Step over the '\n' for the next call. This can leave _pos beyond
    // _end only when end-of-file was hit without a final '\n', and in
    // that case the next call throws "EOF" before _pos is used
    this._pos = lineEnd + 1;

    // also cater for DOS style line endings by dropping a '\r'
    // immediately before the line end
    var textEnd = (lineEnd && this._buffer[lineEnd - 1] === 0xD) ? lineEnd - 1 : lineEnd;
    return this._buffer.toString("utf8", lineStart, textEnd);
};
// Closes the underlying file descriptor and releases the buffer.
// Calling it again once the buffer has been released does nothing.
// Any error thrown by fs.closeSync() is deliberately ignored, so this
// method itself does not throw.
GetlineSync.prototype.close = function () {
    if (this._buffer === null) {
        return;   // already closed
    }
    try {
        fs.closeSync(this._fd);
    }
    catch (ignored) {
        // best-effort close: nothing useful can be done on failure
    }
    this._buffer = null;
};
// *** GetlineAsync class ***
// The GetlineAsync class is an asynchronous version of GetlineSync,
// and often fits better into node.js's event based approach to
// program design than GetlineSync. The constructor of this class
// takes a file descriptor, not a filename, otherwise a user may be
// presented with a partially constructed object if implemented with
// asynchronous fs.open(). Use fs.open()/fs.openSync() to obtain a
// file descriptor to pass in, as appropriate to the usage case.
//
// As in the case of GetlineSync, the file concerned must be in utf8
// or ascii encoding.
//
// Like GetlineSync, GetlineAsync has a next() method to iterate
// through the file line by line. However, next() does not return
// anything. Instead, GetlineAsync derives from EventEmitter, and
// emits "line" following a call to next(), once the line has been
// fully received and assembled. The "line" event callback is passed
// three arguments: first an Error value (which will be null if there
// is no Error), secondly a boolean value indicating whether
// end-of-file has been reached, as end-of-file is not an error (in
// which case the third argument can be ignored, as it will be an
// empty string), and thirdly a string containing the line extracted,
// without terminating '\n'.
//
// After the initial call to the next() method of GetlineAsync,
// further calls to that method should only be made once the "line"
// event callback for the previous call has begun executing: in other
// words, further calls should be made within the "line" event
// callback function or, say, in a callback chained to asynchronous
// calls made by that function.
//
// The underlying file descriptor will be closed and the buffer
// released on end-of-file being reached or an error arising and, to
// release other resources, any "line" event callbacks will
// automatically be removed once relevant emissions have been made
// informing listeners of the end-of-file or error. If the user
// finishes iterating before either of those events, GetlineAsync's
// close() method can be called to achieve the same result.
//
// The only public methods that GetlineAsync has are next() and
// close().
exports.GetlineAsync = GetlineAsync;
// Constructor. Wraps an already-open file descriptor in an
// asynchronous line reader with an empty read buffer of BUF_SIZE
// bytes. Nothing is read until next() is called.
//
// fd: file descriptor to read from, which is later closed by close().
function GetlineAsync(fd) {
    events.EventEmitter.call(this);
    this._fd = fd;
    // Buffer.alloc() is the supported replacement for the deprecated
    // 'new Buffer(size)' constructor, and always zero-fills
    this._buffer = Buffer.alloc(BUF_SIZE);
    this._end = 0;   // after any get, one past last loaded element in buffer
    this._pos = 0;   // after any get, one past last '\n' found, or one
                     // past last byte in file
    this._eof = false;
    this._reading = false;   // true while a next() call is awaiting its line
}
// this does the same as util.inherits, except that
// GetlineAsync.prototype.constructor is enumerable on the prototype
// object, which I prefer in these cases, and this approach also makes
// it more obvious what is happening, which makes me happier
GetlineAsync.prototype = Object.create(events.EventEmitter.prototype);
GetlineAsync.prototype.constructor = GetlineAsync;
// Compacts the buffer and starts an asynchronous read to top it up.
//
// Any bytes not yet consumed (those in [_pos, _end)) are slid down to
// the start of the buffer and _pos is reset to 0 before this method
// returns. A single fs.read() is then issued for the free space after
// them, at the current OS file position. _end is NOT advanced by the
// number of bytes read: the callback has to do that.
//
// cb: passed straight to fs.read(), which calls it with three
// arguments (error, number of bytes read, buffer).
GetlineAsync.prototype._refill = function (cb) {
    var buf = this._buffer;
    // number of unconsumed bytes being carried over to the front
    var carried = Math.max(this._end - this._pos, 0);
    // copyWithin() copes with overlapping source and destination
    buf.copyWithin(0, this._pos, this._end);
    this._pos = 0;
    this._end = carried;
    fs.read(this._fd,            // file descriptor
            buf,                 // byte buffer
            carried,             // buffer position of start of load
            BUF_SIZE - carried,  // max length of load
            null,                // OS file position: null = current pos
            cb);
};
// Locates the next '\n' byte in the loaded part of the buffer,
// searching from _pos. Returns its buffer position, or _end if there
// is none before _end (or _pos itself if _pos is already at or past
// _end). It does not mutate any object data
GetlineAsync.prototype._find_eol = function () {
    var start = this._pos;
    var limit = this._end;
    if (start >= limit) {
        return start;   // nothing loaded beyond the current position
    }
    var hit = this._buffer.indexOf(0xA, start);   // '\n'
    // a match at or beyond _end lies in stale bytes and does not count
    return (hit === -1 || hit >= limit) ? limit : hit;
};
// Finishes a next() call: clears the _reading guard and reports the
// line running from _pos up to (not including) buffer position 'eol'.
//
// If end-of-file has been flagged and there are no bytes left, "line"
// is emitted synchronously with its end-of-file argument true and the
// object is closed. Otherwise _pos is moved past the line end and
// "line" is emitted with the decoded text on the next tick.
//
// eol: buffer position of the terminating '\n', or _end at end-of-file.
GetlineAsync.prototype._emit_line = function (eol) {
    var lineStart = this._pos;
    this._reading = false;

    // a file ending in '\n' must not yield a phantom empty last line
    if (this._eof === true && lineStart === eol) {
        this.emit("line", null, true, "");
        this.close();
        return;
    }

    // Step over the '\n' for the next read. This can leave _pos beyond
    // _end only when end-of-file was hit without a final '\n', and
    // next() then only reports end-of-file
    this._pos = eol + 1;

    // also cater for DOS style line endings by dropping a '\r'
    // immediately before the line end
    var textEnd = (eol && this._buffer[eol - 1] === 0xD) ? eol - 1 : eol;
    var text = this._buffer.toString("utf8", lineStart, textEnd);

    // Deliver through process.nextTick() so that a listener calling
    // next() again does not recurse when lines are served straight
    // from the buffer without an asynchronous _refill()
    var emitter = this;
    process.nextTick(function () {
        emitter.emit("line", null, false, text);
    });
};
// Requests the next line of the file. Nothing is returned: the result
// is delivered through a "line" emission with the arguments (error,
// end-of-file flag, line text).
//
// An Error is emitted, with an empty line, if next() is called while
// a previous request is still in progress, if the object is closed,
// if a line is BUF_SIZE bytes or longer (the file is then treated as
// binary) or if the read fails. Except in the already-closed case,
// the object is closed after such an Error is emitted. Once
// end-of-file has been flagged, each call emits the end-of-file
// notification and closes the object.
GetlineAsync.prototype.next = function () {
    var reader = this;

    // emit an error to the "line" listeners, then release resources
    function fail(error) {
        reader.emit("line", error, false, "");
        reader.close();   // also sets _reading to false
    }

    // completion callback for _refill()
    function onRefilled(err, bytesRead) {
        if (err) {
            fail(err);
            return;
        }
        // Only carry on if this request still holds the async guard.
        // The guard is dropped, and the buffer released, if next() was
        // wrongly called again while the read was pending or close()
        // was called in the meantime
        if (!reader._reading) {
            return;
        }
        if (bytesRead === 0) {   // end-of-file
            reader._eof = true;
            // _refill() has moved the pending bytes to the front, so
            // whatever is left runs up to _end
            reader._emit_line(reader._end);
            return;
        }
        reader._end += bytesRead;
        scan();   // see whether enough bytes have now arrived
    }

    // Look for a complete line in the buffer, reading more if needed.
    // A read can deliver fewer bytes than asked for, so this may run
    // several times for one line
    function scan() {
        var lineEnd = reader._find_eol();
        if (lineEnd !== reader._end) {
            reader._emit_line(lineEnd);
            return;
        }
        if (reader._pos === 0 && reader._end === BUF_SIZE) {
            // line length has exceeded or equalled BUF_SIZE
            fail(new Error("Binary file detected: if this really is a line structured "
                           + "file, consider increasing BUF_SIZE"));
            return;
        }
        reader._refill(onRefilled);
    }

    // pre-conditions, checked in this order
    if (this._reading) {   // next() has been called outside the "line" event callback
        fail(new Error("GetlineAsync read operation already in course"));
        return;
    }
    if (this._eof) {
        this.emit("line", null, true, "");
        this.close();
        return;
    }
    if (this._buffer === null) {
        // already closed, so there is nothing further to release
        this.emit("line", new Error("GetlineAsync file is closed"), false, "");
        return;
    }

    this._reading = true;
    scan();
};
// Closes the underlying file descriptor, releases the buffer, clears
// the _reading guard and, on the next tick, removes all listeners
// from this object. Calling it again once the buffer has been
// released does nothing.
//
// The close is best-effort, as in GetlineSync.close(): any error
// reported by fs.close() is ignored.
GetlineAsync.prototype.close = function () {
    if (this._buffer !== null) {
        // Always hand fs.close() a callback. Depending on the node
        // version, omitting it either throws a TypeError or installs a
        // default callback that rethrows a failed close as an uncaught
        // exception
        fs.close(this._fd, function (ignoredError) {});
        this._buffer = null;
        this._reading = false;
        // go via event loop to allow a set of error/eof messages
        // before disconnecting listeners
        var self = this;
        process.nextTick(function () {self.removeAllListeners();});
    }
};