UNPKG

@genkit-ai/ai

Version:

Genkit AI framework generative AI APIs.

152 lines 3.96 kB
import { z } from "@genkit-ai/core";

/**
 * Base part shape. Both `text` and `media` are declared as optional `never`
 * so that each concrete part schema below overrides exactly one of them.
 */
const EmptyPartSchema = z.object({
  text: z.never().optional(),
  media: z.never().optional()
});

/** Schema for a part that carries text. */
const TextPartSchema = EmptyPartSchema.extend({
  /** The text of the document. */
  text: z.string()
});

/** Schema for a part that carries a media reference. */
const MediaPartSchema = EmptyPartSchema.extend({
  media: z.object({
    /** The media content type. Inferred from data uri if not provided. */
    contentType: z.string().optional(),
    /** A `data:` or `https:` uri containing the media content. */
    url: z.string()
  })
});

/** A part is either a text part or a media part. */
const PartSchema = z.union([TextPartSchema, MediaPartSchema]);

/** Schema for the plain-data form of a document: parts plus optional metadata. */
const DocumentDataSchema = z.object({
  content: z.array(PartSchema),
  metadata: z.record(z.string(), z.any()).optional()
});

/**
 * Deep-copies a value by round-tripping it through JSON.
 * `undefined` is returned as-is because `JSON.stringify(undefined)` does not
 * produce parseable JSON.
 * @param value The value to copy.
 * @returns A JSON-round-tripped copy of `value`, or `undefined`.
 */
function deepCopy(value) {
  if (value === void 0) {
    return value;
  }
  return JSON.parse(JSON.stringify(value));
}

/**
 * A document made of an array of text/media parts and optional metadata.
 * The constructor stores deep copies of the data it is given.
 */
class Document {
  content;
  metadata;

  /**
   * @param data Object with a `content` array and an optional `metadata`
   *   object; both are deep-copied.
   */
  constructor(data) {
    this.content = deepCopy(data.content);
    this.metadata = deepCopy(data.metadata);
  }

  /**
   * Construct a Document from a single text item.
   * @param text The text of the single part.
   * @param metadata Optional metadata for the document.
   * @returns A new Document with one text part.
   */
  static fromText(text, metadata) {
    return new Document({
      content: [{ text }],
      metadata
    });
  }

  /**
   * Construct a Document from a single media item.
   * @param url The media url.
   * @param contentType Optional content type of the media.
   * @param metadata Optional metadata for the document.
   * @returns A new Document with one media part.
   */
  static fromMedia(url, contentType, metadata) {
    return new Document({
      content: [
        {
          media: {
            contentType,
            url
          }
        }
      ],
      metadata
    });
  }

  /**
   * Construct a Document from content. A `dataType` of `"text"` produces a
   * text document; any other value is used as the content type of a media
   * document whose url is `data`.
   * @param data The text, or the media url.
   * @param dataType `"text"`, or the media content type.
   * @param metadata Optional metadata for the document.
   * @returns A new Document.
   */
  static fromData(data, dataType, metadata) {
    if (dataType === "text") {
      return this.fromText(data, metadata);
    }
    return this.fromMedia(data, dataType, metadata);
  }

  /**
   * Concatenates all `text` parts present in the document with no delimiter.
   * @returns A string of all concatenated text parts.
   */
  get text() {
    return this.content.map((part) => part.text || "").join("");
  }

  /**
   * Media array getter.
   * @returns the array of media parts.
   */
  get media() {
    return this.content.filter((part) => part.media && !part.text).map((part) => part.media);
  }

  /**
   * Gets the document's data as a single string: the concatenated text when
   * there is any, otherwise the url of the first media part, otherwise `""`.
   * @returns The text, the first media url, or an empty string.
   */
  get data() {
    const text = this.text;
    if (text) {
      return text;
    }
    // `media` is always an array (truthy even when empty), so the first
    // element itself must be tested before it is dereferenced.
    const [firstMedia] = this.media;
    if (firstMedia) {
      return firstMedia.url;
    }
    return "";
  }

  /**
   * Gets the contentType of the data that is returned by data()
   * @returns `"text"` when the document has text, otherwise the content type
   *   of the first media part if it has one, otherwise `undefined`.
   */
  get dataType() {
    if (this.text) {
      return "text";
    }
    // Guard on the element, not the array: an empty array is truthy.
    const [firstMedia] = this.media;
    if (firstMedia && firstMedia.contentType) {
      return firstMedia.contentType;
    }
    return void 0;
  }

  /**
   * @returns A plain object holding deep copies of `content` and `metadata`.
   */
  toJSON() {
    return {
      content: deepCopy(this.content),
      metadata: deepCopy(this.metadata)
    };
  }

  /**
   * Embedders may return multiple embeddings for a single document.
   * But storage still requires a 1:1 relationship. So we create an
   * array of Documents from a single document - one per embedding.
   * @param embeddings The embeddings to create the documents from.
   * @returns an array of documents based on this document and the embeddings.
   */
  getEmbeddingDocuments(embeddings) {
    const documents = [];
    for (const embedding of embeddings) {
      const jsonDoc = this.toJSON();
      if (embedding.metadata) {
        if (!jsonDoc.metadata) {
          jsonDoc.metadata = {};
        }
        // The embedding's metadata is what distinguishes the per-embedding copies.
        jsonDoc.metadata.embedMetadata = embedding.metadata;
      }
      documents.push(new Document(jsonDoc));
    }
    // Only warns (via console.warn) on duplicates; the result is not used here.
    checkUniqueDocuments(documents);
    return documents;
  }
}

/**
 * Checks that no two documents serialize to the same JSON string.
 * Logs a warning and stops at the first duplicate found.
 * @param documents The documents to compare.
 * @returns `true` if every document is unique, `false` otherwise.
 */
function checkUniqueDocuments(documents) {
  const seen = /* @__PURE__ */ new Set();
  for (const doc of documents) {
    const serialized = JSON.stringify(doc);
    if (seen.has(serialized)) {
      console.warn(
        "Warning: embedding documents are not unique. Are you missing embed metadata?"
      );
      return false;
    }
    seen.add(serialized);
  }
  return true;
}

export {
  Document,
  DocumentDataSchema,
  MediaPartSchema,
  PartSchema,
  TextPartSchema,
  checkUniqueDocuments
};
//# sourceMappingURL=document.mjs.map