kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
79 lines (69 loc) • 2.5 kB
JavaScript
"use strict";
/**
 * String wrapper that is indexed by Unicode code point instead of by UTF-16
 * code unit, so that a surrogate pair (4 bytes in UTF-16) counts as a single
 * character.
 *
 * Instances expose:
 *  - `str`: the wrapped string, unchanged
 *  - `index_mapping`: for each character, the code-unit offset in `str`
 *    at which that character starts
 *  - `length`: the number of characters (surrogate aware)
 *
 * @param {string} str String to wrap
 * @constructor
 */
function SurrogateAwareString(str) {
    this.str = str;
    this.index_mapping = [];
    for (let pos = 0; pos < str.length; pos++) {
        this.index_mapping.push(pos);
        // codePointAt() only yields a value above 0xFFFF when `pos` holds a
        // high surrogate that is immediately followed by a low surrogate.
        // Skipping the next unit only in that case keeps an unpaired high
        // surrogate from swallowing the ordinary character that follows it.
        if (str.codePointAt(pos) > 0xFFFF) {
            pos++;
        }
    }
    // Surrogate aware length
    this.length = this.index_mapping.length;
}

/**
 * Return the remainder of the wrapped string starting at a character index.
 * @param {number} index Surrogate-aware start index; expected to be a
 *     non-negative integer
 * @returns {string} Plain (unwrapped) substring from that character to the
 *     end, or "" when `index` is at or beyond the surrogate-aware length
 */
SurrogateAwareString.prototype.slice = function (index) {
    if (this.index_mapping.length <= index) {
        return "";
    }
    return this.str.slice(this.index_mapping[index]);
};

/**
 * Return the character at a surrogate-aware index.
 * @param {number} index Surrogate-aware character index
 * @returns {string} The character (two code units long for a surrogate
 *     pair), or "" when `index` does not address a character
 */
SurrogateAwareString.prototype.charAt = function (index) {
    // Validate against the code-point mapping rather than str.length (UTF-16
    // code units). Any index without a mapping entry -- past the end or
    // negative -- must yield "": passing an undefined start to
    // String.prototype.slice would otherwise return the whole string.
    const start = this.index_mapping[index];
    if (start === undefined) {
        return "";
    }
    // The last character has no following entry; an undefined end makes
    // slice() run to the end of the string.
    return this.str.slice(start, this.index_mapping[index + 1]);
};

/**
 * Return the Unicode code point of the character at a surrogate-aware index.
 * @param {number} index Surrogate-aware character index
 * @returns {number} The full code point for a surrogate pair, the UTF-16 code
 *     unit otherwise (including an unpaired surrogate), or NaN when `index`
 *     does not address a character
 */
SurrogateAwareString.prototype.charCodeAt = function (index) {
    // Without this guard an unmapped index would reach the native call as
    // `undefined`, which is treated as position 0.
    const start = this.index_mapping[index];
    if (start === undefined) {
        return NaN;
    }
    // codePointAt() combines a high surrogate with a following low surrogate
    // and returns the bare code unit in every other case.
    return this.str.codePointAt(start);
};

/**
 * Return the wrapped string.
 * @returns {string} The original string passed to the constructor
 */
SurrogateAwareString.prototype.toString = function () {
    return this.str;
};

/**
 * Test whether a string begins with a UTF-16 high (leading) surrogate, i.e.
 * the first half of a surrogate pair. Only the first code unit is inspected;
 * whether a low surrogate follows is not checked.
 * @param {string} ch String whose first code unit is examined
 * @returns {boolean} true if the first code unit is in 0xD800-0xDBFF; false
 *     otherwise (including for the empty string)
 */
SurrogateAwareString.isSurrogatePair = function (ch) {
    const utf16_code = ch.charCodeAt(0);
    return utf16_code >= 0xD800 && utf16_code <= 0xDBFF;
};
// CommonJS export: the SurrogateAwareString constructor is this module's sole export.
module.exports = SurrogateAwareString;