chonkie
Version:
🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.
218 lines • 8.32 kB
JavaScript
;
/**
 * Drives a generator function to completion and returns a promise for its result.
 * An existing `this.__awaiter` is reused when present; otherwise the function below is used.
 * NOTE(review): this has the shape of the helper TypeScript emits for down-levelled
 * async functions — confirm against the build configuration before editing by hand.
 *
 * @param {*} thisArg - `this` value the generator function is applied with.
 * @param {ArrayLike<*>} [_arguments] - Arguments forwarded to the generator function; `[]` when falsy.
 * @param {Function} [P] - Promise constructor to use; falls back to the global `Promise` when falsy.
 * @param {Function} generator - Generator function whose yielded values are waited on.
 * @returns {Promise<*>} Resolves with the generator's final value; rejects when stepping the generator throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Pass through values that are already instances of P; wrap anything else in a resolved P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; an exception thrown while resuming rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its own try/catch (if any) can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// When the generator is done, resolve with its value; otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator (the parameter is rebound from function to iterator) and take the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecursiveChunk = exports.RecursiveRules = exports.RecursiveLevel = void 0;
const base_1 = require("./base");
/** Class to represent recursive chunking rules at a specific level
*
* @class RecursiveLevel
* @property {string | string[]} [delimiters] - The delimiters to use for chunking.
* @property {boolean} [whitespace] - Whether to use whitespace as a delimiter.
* @property {IncludeDelim} [includeDelim] - Whether to include the delimiter in the previous or next chunk.
*/
class RecursiveLevel {
  /**
   * Constructs a new RecursiveLevel object and validates it immediately.
   *
   * @param {RecursiveLevelData} [data={}] - The data to construct the RecursiveLevel from.
   * @throws {Error} If the delimiter/whitespace combination is invalid (see validate()).
   */
  constructor(data = {}) {
    var _a, _b;
    this.delimiters = data.delimiters;
    // Only null/undefined fall back to the defaults; any other supplied value is kept as given.
    this.whitespace = (_a = data.whitespace) !== null && _a !== void 0 ? _a : false;
    this.includeDelim = (_b = data.includeDelim) !== null && _b !== void 0 ? _b : 'prev';
    this.validate();
  }
  /**
   * Validates the RecursiveLevel object.
   *
   * Rules enforced when `delimiters` is defined:
   *  - `whitespace` must not also be enabled;
   *  - a string delimiter must be non-empty and must not be the single space `' '`;
   *  - every entry of an array of delimiters must be a non-empty string, and none may be `' '`.
   *
   * @private
   * @throws {Error} If any of the rules above is violated.
   */
  validate() {
    if (this.delimiters === undefined) {
      return;
    }
    if (this.whitespace) {
      throw new Error('Cannot use whitespace as a delimiter and also specify custom delimiters.');
    }
    if (typeof this.delimiters === 'string') {
      if (this.delimiters.length === 0) {
        throw new Error('Custom delimiters cannot be an empty string.');
      }
      // Reject the bare-space string the same way [' '] is rejected, so both spellings
      // of the same delimiter are steered towards `whitespace: true`.
      if (this.delimiters === ' ') {
        throw new Error('Custom delimiters cannot be whitespace only. Set whitespace to true instead.');
      }
      return;
    }
    if (Array.isArray(this.delimiters)) {
      if (this.delimiters.some(delim => typeof delim !== 'string' || delim.length === 0)) {
        throw new Error('Custom delimiters cannot be an empty string.');
      }
      if (this.delimiters.includes(' ')) {
        throw new Error('Custom delimiters cannot be whitespace only. Set whitespace to true instead.');
      }
    }
  }
  /** Return a string representation of the RecursiveLevel
   *
   * @returns {string} The string representation of the RecursiveLevel.
   */
  toString() {
    return `RecursiveLevel(delimiters=${this.delimiters}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`;
  }
  /** Return the RecursiveLevel as a dictionary-like object
   *
   * The `delimiters` value is returned as stored (no copy is made).
   *
   * @returns {RecursiveLevelData} The dictionary-like object.
   */
  toDict() {
    return {
      delimiters: this.delimiters,
      whitespace: this.whitespace,
      includeDelim: this.includeDelim,
    };
  }
  /** Create RecursiveLevel object from a dictionary
   *
   * @param {RecursiveLevelData} data - The dictionary-like object.
   * @returns {RecursiveLevel} The RecursiveLevel object.
   * @throws {Error} If the data fails validation.
   */
  static fromDict(data) {
    return new RecursiveLevel(data);
  }
  /** Create RecursiveLevel object from a recipe
   *
   * Not implemented yet: the generator body throws immediately, so the
   * returned promise rejects with `Error('Not implemented')`.
   *
   * @param {string} name - The name of the recipe.
   * @param {string} [lang='en'] - The language of the recipe.
   * @returns {Promise<RecursiveLevel>} The RecursiveLevel object.
   */
  static fromRecipe(name_1) {
    return __awaiter(this, arguments, void 0, function* (name, lang = 'en') {
      // TODO: Implement Hubbie integration
      throw new Error('Not implemented');
    });
  }
}
exports.RecursiveLevel = RecursiveLevel;
/** Class to represent recursive chunking rules
*
* @class RecursiveRules
* @property {RecursiveLevel[]} [levels] - The recursive levels.
*/
class RecursiveRules {
  /**
   * Builds the ordered list of recursion levels.
   *
   * When `data.levels` is undefined, five default levels are created in this
   * order: paragraphs, sentences, pauses, words (whitespace), and a final
   * level with no delimiters. Otherwise each supplied entry is passed to the
   * RecursiveLevel constructor.
   *
   * @param {RecursiveRulesData} [data={}] - Optional rule data.
   */
  constructor(data = {}) {
    const { levels: suppliedLevels } = data;
    if (suppliedLevels !== undefined) {
      this.levels = suppliedLevels.map(spec => new RecursiveLevel(spec));
      return;
    }
    // Default levels, from coarsest to finest split.
    const paragraphDelims = ['\n\n', '\r\n', '\n', '\r'];
    const sentenceDelims = ['. ', '! ', '? '];
    const pauseDelims = [
      '{', '}', '"', '[', ']', '<', '>', '(', ')', ':', ';', ',',
      '—', '|', '~', '-', '...', '`', "'",
    ];
    this.levels = [
      new RecursiveLevel({ delimiters: paragraphDelims }),
      new RecursiveLevel({ delimiters: sentenceDelims }),
      new RecursiveLevel({ delimiters: pauseDelims }),
      new RecursiveLevel({ whitespace: true }),
      new RecursiveLevel(),
    ];
  }
  /** Describe the rules as a string
   *
   * @returns {string} The string representation of the RecursiveRules.
   */
  toString() {
    const rendered = String(this.levels);
    return `RecursiveRules(levels=${rendered})`;
  }
  /** Number of levels held by these rules
   *
   * @returns {number} The number of levels.
   */
  get length() {
    const { length: count } = this.levels;
    return count;
  }
  /** Look up a level by position
   *
   * @param {number} index - The index of the level.
   * @returns {RecursiveLevel | undefined} The level, or undefined when nothing is stored at that index.
   */
  getLevel(index) {
    const found = this.levels[index];
    return found;
  }
  /** Iterate over the levels in order
   *
   * @returns {Iterator<RecursiveLevel>} The iterator over the levels.
   */
  [Symbol.iterator]() {
    return this.levels.values();
  }
  /** Build a RecursiveRules object from a dictionary
   *
   * @param {RecursiveRulesData} data - The dictionary-like object.
   * @returns {RecursiveRules} The RecursiveRules object.
   */
  static fromDict(data) {
    const rules = new RecursiveRules(data);
    return rules;
  }
  /** Convert the RecursiveRules to a dictionary-like object
   *
   * @returns {RecursiveRulesData} The dictionary-like object, with each level converted via its toDict().
   */
  toDict() {
    const serialized = [];
    for (const entry of this.levels) {
      serialized.push(entry.toDict());
    }
    return { levels: serialized };
  }
  /** Build a RecursiveRules object from a recipe
   *
   * Not implemented yet: the generator body throws `Error('Not implemented')`.
   *
   * @param {string} name - The name of the recipe.
   * @param {string} lang - The language of the recipe.
   * @param {string} path - The path to the recipe.
   * @returns {Promise<RecursiveRules>} The RecursiveRules object.
   */
  static fromRecipe() {
    return __awaiter(this, arguments, void 0, function* (name = 'default', lang = 'en', path) {
      // TODO: Implement Hubbie integration
      throw new Error('Not implemented');
    });
  }
}
exports.RecursiveRules = RecursiveRules;
/** Class to represent recursive chunks
*
* @class RecursiveChunk
* @property {number} [level] - The level of recursion for the chunk.
*/
class RecursiveChunk extends base_1.Chunk {
  /**
   * Creates a chunk by handing `data` to the base Chunk constructor and then
   * recording the recursion level carried on `data.level`.
   *
   * @param {RecursiveChunkData} data - The chunk data, including an optional `level`.
   */
  constructor(data) {
    super(data);
    const { level } = data;
    this.level = level;
  }
  /** Describe the RecursiveChunk as a string
   *
   * @returns {string} The string representation of the RecursiveChunk.
   */
  toString() {
    const span = `startIndex=${this.startIndex}, endIndex=${this.endIndex}`;
    return `RecursiveChunk(text=${this.text}, ${span}, tokenCount=${this.tokenCount}, level=${this.level})`;
  }
  /** Convert the RecursiveChunk to a dictionary-like object
   *
   * The result is a fresh object holding the base class's toDict() entries plus `level`.
   *
   * @returns {RecursiveChunkData} The dictionary-like object.
   */
  toDict() {
    return Object.assign({}, super.toDict(), { level: this.level });
  }
  /** Build a RecursiveChunk object from a dictionary
   *
   * @param {RecursiveChunkData} data - The dictionary-like object.
   * @returns {RecursiveChunk} The RecursiveChunk object.
   */
  static fromDict(data) {
    const chunk = new RecursiveChunk(data);
    return chunk;
  }
}
exports.RecursiveChunk = RecursiveChunk;
//# sourceMappingURL=recursive.js.map