UNPKG

chonkie

Version:

🦛 CHONK your texts in TS with Chonkie! ✨ The no-nonsense, lightweight, and efficient chunking library.

130 lines • 6.34 kB
"use strict";
/** Semantic chunker client for Chonkie API. */

// ---------------------------------------------------------------------------
// Compiler-emitted interop helpers. Each one reuses an already-installed
// helper found on `this` (if any) before falling back to the local definition.
// ---------------------------------------------------------------------------

// Exposes property `k` of module `m` on object `o` under the name `k2`
// (defaults to `k`). Where property descriptors are available the binding is
// a getter, so later changes to `m[k]` stay visible through `o[k2]`.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));

// Installs `v` as the `default` export of the namespace object `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function (o, v) {
    o["default"] = v;
});

// Wraps a CommonJS module in a namespace object: modules flagged
// `__esModule` are returned untouched; otherwise every own key except
// "default" is re-bound and the module itself becomes `default`.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function (o) {
        // Resolved lazily on first call, then replaced by the chosen implementation.
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();

// Drives a generator as an asynchronous function: every `yield`ed value is
// adopted as a promise of type `P` (default `Promise`) and its settlement is
// fed back into the generator; anything thrown rejects the returned promise.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};

Object.defineProperty(exports, "__esModule", { value: true });
exports.SemanticChunker = void 0;
const base_1 = require("./base");
const semantic_1 = require("../types/semantic");
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));

/**
 * Client that sends text or a file to the `/v1/chunk/semantic` endpoint and
 * converts the response into `SemanticChunk` objects.
 */
class SemanticChunker extends base_1.CloudClient {
    /**
     * @param apiKey API key forwarded to the `CloudClient` base constructor.
     * @param config Optional overrides for the chunking parameters.
     *
     * Defaults applied here:
     *   embeddingModel "minishlab/potion-base-8M", threshold "auto",
     *   chunkSize 512, similarityWindow 1, minSentences 1, minChunkSize 2,
     *   minCharactersPerSentence 12, thresholdStep 0.01,
     *   delim [".", "!", "?", "\n"], includeDelim "prev".
     *
     * `threshold` and `includeDelim` fall back only when null/undefined;
     * every other option falls back on ANY falsy value (so 0 or "" selects
     * the default).
     */
    constructor(apiKey, config = {}) {
        var _a, _b;
        super({ apiKey });
        this.config = {
            embeddingModel: config.embeddingModel || "minishlab/potion-base-8M",
            threshold: (_a = config.threshold) !== null && _a !== void 0 ? _a : "auto",
            chunkSize: config.chunkSize || 512,
            similarityWindow: config.similarityWindow || 1,
            minSentences: config.minSentences || 1,
            minChunkSize: config.minChunkSize || 2,
            minCharactersPerSentence: config.minCharactersPerSentence || 12,
            thresholdStep: config.thresholdStep || 0.01,
            delim: config.delim || [".", "!", "?", "\n"],
            includeDelim: (_b = config.includeDelim) !== null && _b !== void 0 ? _b : "prev",
        };
    }

    /**
     * Chunks a single input.
     *
     * @param input Object with either `filepath` (file to upload) or `text`.
     *   `filepath` wins when both are set. Both are tested for truthiness, so
     *   an empty string counts as "not provided".
     * @returns Promise resolving to an array built with
     *   `SemanticChunk.fromDict`, one entry per chunk in the response.
     * @throws Error "Either text or filepath must be provided" when neither
     *   field is usable; file-read and request failures reject the promise.
     */
    chunk(input) {
        return __awaiter(this, void 0, void 0, function* () {
            const formData = new FormData();
            if (input.filepath) {
                // Non-blocking read: chunkBatch() runs many chunk() calls
                // concurrently, and a synchronous read would stall the event
                // loop once per file.
                const fileContent = yield fs.promises.readFile(input.filepath);
                const fileName = path.basename(input.filepath) || 'file.txt';
                formData.append("file", new Blob([fileContent]), fileName);
            }
            else if (input.text) {
                // JSON encode the text
                formData.append("text", JSON.stringify(input.text));
                // Append empty file to ensure multipart form
                formData.append("file", new Blob(), "text_input.txt");
            }
            else {
                throw new Error("Either text or filepath must be provided");
            }
            // Add all config options to the form data
            formData.append("embedding_model", this.config.embeddingModel);
            formData.append("threshold", this.config.threshold.toString());
            formData.append("chunk_size", this.config.chunkSize.toString());
            formData.append("similarity_window", this.config.similarityWindow.toString());
            formData.append("min_sentences", this.config.minSentences.toString());
            formData.append("min_chunk_size", this.config.minChunkSize.toString());
            formData.append("min_characters_per_sentence", this.config.minCharactersPerSentence.toString());
            formData.append("threshold_step", this.config.thresholdStep.toString());
            formData.append("delim", JSON.stringify(this.config.delim));
            formData.append("include_delim", this.config.includeDelim || "prev");
            formData.append("return_type", "chunks");
            // `request` is not defined in this file; presumably inherited
            // from CloudClient — confirm against ./base.
            const data = yield this.request("/v1/chunk/semantic", {
                method: "POST",
                body: formData,
            });
            // Convert from snake_case to camelCase
            const camelCaseData = data.map((chunk) => {
                // `embedding` is already treated as optional below; treat a
                // missing `sentences` array the same way instead of throwing
                // a TypeError on `.map`.
                const rawSentences = Array.isArray(chunk.sentences) ? chunk.sentences : [];
                return {
                    text: chunk.text,
                    startIndex: chunk.start_index,
                    endIndex: chunk.end_index,
                    tokenCount: chunk.token_count,
                    embedding: chunk.embedding || undefined,
                    sentences: rawSentences.map((sentence) => {
                        return {
                            text: sentence.text,
                            startIndex: sentence.start_index,
                            endIndex: sentence.end_index,
                            tokenCount: sentence.token_count,
                            embedding: sentence.embedding || undefined,
                        };
                    }),
                };
            });
            return camelCaseData.map((chunk) => semantic_1.SemanticChunk.fromDict(chunk));
        });
    }

    /**
     * Chunks several inputs concurrently by calling `chunk` on each one.
     *
     * @param inputs Array of inputs, each accepted by `chunk`.
     * @returns Promise resolving to one result array per input, in input
     *   order; rejects as soon as any single `chunk` call rejects
     *   (`Promise.all` semantics).
     */
    chunkBatch(inputs) {
        return __awaiter(this, void 0, void 0, function* () {
            return Promise.all(inputs.map(input => this.chunk(input)));
        });
    }
}
exports.SemanticChunker = SemanticChunker;
//# sourceMappingURL=semantic.js.map