UNPKG

chonkie

Version:

🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.

218 lines • 8.32 kB
"use strict";
// Compiler-emitted helper (TypeScript down-levelled async/await): runs a
// generator to completion and exposes the outcome as a promise. A value thrown
// inside the generator becomes a rejection of the returned promise.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecursiveChunk = exports.RecursiveRules = exports.RecursiveLevel = void 0;
const base_1 = require("./base");

/** Class to represent recursive chunking rules at a specific level
 *
 * @class RecursiveLevel
 * @property {string | string[]} [delimiters] - The delimiters to use for chunking.
 * @property {boolean} [whitespace] - Whether to use whitespace as a delimiter.
 * @property {IncludeDelim} [includeDelim] - Whether to include the delimiter in the previous or next chunk.
 */
class RecursiveLevel {
    /**
     * Constructs a new RecursiveLevel object.
     *
     * `whitespace` defaults to `false` and `includeDelim` defaults to `'prev'`
     * when the corresponding field is `null` or `undefined`. The instance is
     * validated immediately, so an invalid configuration never escapes the
     * constructor.
     *
     * @param {RecursiveLevelData} data - The data to construct the RecursiveLevel from.
     * @throws {Error} If the combination of fields is invalid (see `validate`).
     */
    constructor(data = {}) {
        const whitespace = data.whitespace;
        const includeDelim = data.includeDelim;
        this.delimiters = data.delimiters;
        // Explicit null/undefined checks (not `||`) so that a caller-supplied
        // `false` is preserved rather than replaced by the default.
        this.whitespace = whitespace !== null && whitespace !== undefined ? whitespace : false;
        this.includeDelim = includeDelim !== null && includeDelim !== undefined ? includeDelim : 'prev';
        this.validate();
    }

    /**
     * Validates the RecursiveLevel object.
     *
     * Rules enforced:
     *  - custom delimiters and `whitespace: true` are mutually exclusive;
     *  - a string delimiter must be non-empty;
     *  - every entry of an array of delimiters must be a non-empty string;
     *  - the single-space delimiter `' '` is rejected in both the string and
     *    the array form, because `whitespace: true` is the supported way to
     *    split on spaces.
     *
     * Only the exact string `' '` is rejected: other whitespace delimiters
     * such as `'\n\n'` are legitimate (the default paragraph level uses them).
     *
     * @private
     * @throws {Error} If any of the rules above is violated.
     */
    validate() {
        const delimiters = this.delimiters;
        if (delimiters === undefined) {
            return;
        }
        if (this.whitespace) {
            throw new Error('Cannot use whitespace as a delimiter and also specify custom delimiters.');
        }
        if (typeof delimiters === 'string') {
            if (delimiters.length === 0) {
                throw new Error('Custom delimiters cannot be an empty string.');
            }
            // Same rule the array form enforces; previously a lone ' ' string
            // slipped through unchecked.
            if (delimiters === ' ') {
                throw new Error('Custom delimiters cannot be whitespace only. Set whitespace to true instead.');
            }
            return;
        }
        if (Array.isArray(delimiters)) {
            if (delimiters.some(delim => typeof delim !== 'string' || delim.length === 0)) {
                throw new Error('Custom delimiters cannot be an empty string.');
            }
            if (delimiters.includes(' ')) {
                throw new Error('Custom delimiters cannot be whitespace only. Set whitespace to true instead.');
            }
        }
    }

    /** Return a string representation of the RecursiveLevel
     *
     * @returns {string} The string representation of the RecursiveLevel.
     */
    toString() {
        return `RecursiveLevel(delimiters=${this.delimiters}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`;
    }

    /** Return the RecursiveLevel as a dictionary-like object
     *
     * Note: `delimiters` is returned by reference, not copied.
     *
     * @returns {RecursiveLevelData} The dictionary-like object.
     */
    toDict() {
        return {
            delimiters: this.delimiters,
            whitespace: this.whitespace,
            includeDelim: this.includeDelim,
        };
    }

    /** Create RecursiveLevel object from a dictionary
     *
     * @param {RecursiveLevelData} data - The dictionary-like object.
     * @returns {RecursiveLevel} The RecursiveLevel object.
     */
    static fromDict(data) {
        return new RecursiveLevel(data);
    }

    /** Create RecursiveLevel object from a recipe
     *
     * Not implemented yet: the returned promise always rejects.
     *
     * @param {string} name - The name of the recipe.
     * @param {string} lang - The language of the recipe (defaults to 'en').
     * @returns {Promise<RecursiveLevel>} A promise that currently always rejects with Error('Not implemented').
     */
    static fromRecipe(name_1) {
        return __awaiter(this, arguments, void 0, function* (name, lang = 'en') {
            // TODO: Implement Hubbie integration
            throw new Error('Not implemented');
        });
    }
}
exports.RecursiveLevel = RecursiveLevel;

/** Class to represent recursive chunking rules
 *
 * @class RecursiveRules
 * @property {RecursiveLevel[]} [levels] - The recursive levels.
 */
class RecursiveRules {
    /**
     * Constructs a new RecursiveRules object.
     *
     * When `data.levels` is `undefined`, five default levels are created in
     * this order: paragraphs, sentences, pauses (punctuation), word
     * (whitespace), and token (no delimiters, no whitespace). Otherwise each
     * supplied entry is passed through the RecursiveLevel constructor, so
     * every level is validated.
     *
     * @param {RecursiveRulesData} data - The data to construct the RecursiveRules from.
     * @throws {Error} If any supplied level fails RecursiveLevel validation.
     */
    constructor(data = {}) {
        if (data.levels === undefined) {
            // Default levels
            const paragraphs = new RecursiveLevel({ delimiters: ['\n\n', '\r\n', '\n', '\r'] });
            const sentences = new RecursiveLevel({ delimiters: ['. ', '! ', '? '] });
            const pauses = new RecursiveLevel({
                delimiters: [
                    '{', '}', '"', '[', ']', '<', '>', '(', ')', ':', ';', ',',
                    '—', '|', '~', '-', '...', '`', "'",
                ],
            });
            const word = new RecursiveLevel({ whitespace: true });
            const token = new RecursiveLevel();
            this.levels = [paragraphs, sentences, pauses, word, token];
        }
        else {
            this.levels = data.levels.map(level => new RecursiveLevel(level));
        }
    }

    /** Return a string representation of the RecursiveRules
     *
     * @returns {string} The string representation of the RecursiveRules.
     */
    toString() {
        return `RecursiveRules(levels=${this.levels})`;
    }

    /** Return the number of levels
     *
     * @returns {number} The number of levels.
     */
    get length() {
        return this.levels.length;
    }

    /** Get a level by index
     *
     * @param {number} index - The index of the level.
     * @returns {RecursiveLevel | undefined} The level, or undefined when the index is out of range.
     */
    getLevel(index) {
        return this.levels[index];
    }

    /** Return an iterator over the levels
     *
     * @returns {Iterator<RecursiveLevel>} The iterator over the levels.
     */
    [Symbol.iterator]() {
        return this.levels[Symbol.iterator]();
    }

    /** Create a RecursiveRules object from a dictionary
     *
     * @param {RecursiveRulesData} data - The dictionary-like object.
     * @returns {RecursiveRules} The RecursiveRules object.
     */
    static fromDict(data) {
        return new RecursiveRules(data);
    }

    /** Return the RecursiveRules as a dictionary-like object
     *
     * @returns {RecursiveRulesData} The dictionary-like object.
     */
    toDict() {
        return {
            levels: this.levels.map(level => level.toDict()),
        };
    }

    /** Create a RecursiveRules object from a recipe
     *
     * Not implemented yet: the returned promise always rejects.
     *
     * @param {string} name - The name of the recipe (defaults to 'default').
     * @param {string} lang - The language of the recipe (defaults to 'en').
     * @param {string} path - The path to the recipe.
     * @returns {Promise<RecursiveRules>} A promise that currently always rejects with Error('Not implemented').
     */
    static fromRecipe() {
        return __awaiter(this, arguments, void 0, function* (name = 'default', lang = 'en', path) {
            // TODO: Implement Hubbie integration
            throw new Error('Not implemented');
        });
    }
}
exports.RecursiveRules = RecursiveRules;

/** Class to represent recursive chunks
 *
 * Extends the `Chunk` class imported from "./base".
 *
 * @class RecursiveChunk
 * @property {number} [level] - The level of recursion for the chunk.
 */
class RecursiveChunk extends base_1.Chunk {
    /**
     * Constructs a new RecursiveChunk object.
     *
     * @param {RecursiveChunkData} data - Forwarded unchanged to the base Chunk
     *   constructor; `data.level` is additionally stored on this instance.
     */
    constructor(data) {
        super(data);
        this.level = data.level;
    }

    /** Return a string representation of the RecursiveChunk
     *
     * NOTE(review): text, startIndex, endIndex and tokenCount are presumably
     * set by the base Chunk constructor; confirm against "./base".
     *
     * @returns {string} The string representation of the RecursiveChunk.
     */
    toString() {
        return `RecursiveChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, level=${this.level})`;
    }

    /** Return the RecursiveChunk as a dictionary-like object
     *
     * Builds a fresh object from the base class's `toDict()` result plus
     * `level`, so the object returned by the base class is not mutated.
     *
     * @returns {RecursiveChunkData} The dictionary-like object.
     */
    toDict() {
        const baseDict = super.toDict();
        return Object.assign({}, baseDict, { level: this.level });
    }

    /** Create a RecursiveChunk object from a dictionary
     *
     * @param {RecursiveChunkData} data - The dictionary-like object.
     * @returns {RecursiveChunk} The RecursiveChunk object.
     */
    static fromDict(data) {
        return new RecursiveChunk(data);
    }
}
exports.RecursiveChunk = RecursiveChunk;
//# sourceMappingURL=recursive.js.map