UNPKG

@genkit-ai/ai

Version:

Genkit AI framework generative AI APIs.

1 lines 8.61 kB
{"version":3,"sources":["../src/document.ts"],"sourcesContent":["/**\n * Copyright 2024 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport { z } from '@genkit-ai/core';\nimport { Embedding } from './embedder';\n\nconst EmptyPartSchema = z.object({\n text: z.never().optional(),\n media: z.never().optional(),\n});\n\nexport const TextPartSchema = EmptyPartSchema.extend({\n /** The text of the document. */\n text: z.string(),\n});\nexport type TextPart = z.infer<typeof TextPartSchema>;\n\nexport const MediaPartSchema = EmptyPartSchema.extend({\n media: z.object({\n /** The media content type. Inferred from data uri if not provided. */\n contentType: z.string().optional(),\n /** A `data:` or `https:` uri containing the media content. */\n url: z.string(),\n }),\n});\nexport type MediaPart = z.infer<typeof MediaPartSchema>;\n\nexport const PartSchema = z.union([TextPartSchema, MediaPartSchema]);\nexport type Part = z.infer<typeof PartSchema>;\n\n// We need both metadata and embedMetadata because they can\n// contain the same fields (e.g. video start/stop) with different values.\nexport const DocumentDataSchema = z.object({\n content: z.array(PartSchema),\n metadata: z.record(z.string(), z.any()).optional(),\n});\nexport type DocumentData = z.infer<typeof DocumentDataSchema>;\n\nfunction deepCopy<T>(value: T): T {\n if (value === undefined) {\n return value;\n }\n return JSON.parse(JSON.stringify(value)) as T;\n}\n\n/**\n * Document represents document content along with its metadata that can be embedded, indexed or\n * retrieved. Each document can contain multiple parts (for example text and an image)\n */\nexport class Document implements DocumentData {\n content: Part[];\n metadata?: Record<string, any>;\n\n constructor(data: DocumentData) {\n this.content = deepCopy(data.content);\n this.metadata = deepCopy(data.metadata);\n }\n\n static fromText(text: string, metadata?: Record<string, any>) {\n return new Document({\n content: [{ text }],\n metadata,\n });\n }\n\n // Construct a Document from a single media item\n static fromMedia(\n url: string,\n contentType?: string,\n metadata?: Record<string, unknown>\n ) {\n return new Document({\n content: [\n {\n media: {\n contentType,\n url,\n },\n },\n ],\n metadata,\n });\n }\n\n // Construct a Document from content\n static fromData(\n data: string,\n dataType?: string,\n metadata?: Record<string, unknown>\n ) {\n if (dataType === 'text') {\n return this.fromText(data, metadata);\n }\n return this.fromMedia(data, dataType, metadata);\n }\n\n /**\n * Concatenates all `text` parts present in the document with no delimiter.\n * @returns A string of all concatenated text parts.\n */\n get text(): string {\n return this.content.map((part) => part.text || '').join('');\n }\n\n /**\n * Media array getter.\n * @returns the array of media parts.\n */\n get media(): { url: string; contentType?: string }[] {\n return this.content\n .filter((part) => part.media && !part.text)\n .map((part) => part.media!);\n }\n\n /**\n * Gets the first item in the document. Either text or media url.\n */\n get data(): string {\n //\n if (this.text) {\n return this.text;\n }\n if (this.media) {\n return this.media[0].url;\n }\n return '';\n }\n\n /**\n * Gets the contentType of the data that is returned by data()\n */\n get dataType(): string | undefined {\n if (this.text) {\n return 'text';\n }\n if (this.media && this.media[0].contentType) {\n return this.media[0].contentType;\n }\n return undefined;\n }\n\n toJSON(): DocumentData {\n return {\n content: deepCopy(this.content),\n metadata: deepCopy(this.metadata),\n } as DocumentData;\n }\n\n /**\n * Embedders may return multiple embeddings for a single document.\n * But storage still requires a 1:1 relationship. So we create an\n * array of Documents from a single document - one per embedding.\n * @param embeddings The embeddings to create the documents from.\n * @returns an array of documents based on this document and the embeddings.\n */\n getEmbeddingDocuments(embeddings: Embedding[]): Document[] {\n let documents: Document[] = [];\n for (const embedding of embeddings) {\n let jsonDoc = this.toJSON();\n if (embedding.metadata) {\n if (!jsonDoc.metadata) {\n jsonDoc.metadata = {};\n }\n jsonDoc.metadata.embedMetadata = embedding.metadata;\n }\n documents.push(new Document(jsonDoc));\n }\n checkUniqueDocuments(documents);\n return documents;\n }\n}\n\n// Unique documents are important because we key\n// our vector storage on the Md5 hash of the JSON.stringify(document)\n// So if we have multiple duplicate documents with\n// different embeddings, we will either skip or overwrite\n// those entries and lose embedding information.\n// Export and boolean return value for testing only.\nexport function checkUniqueDocuments(documents: Document[]): boolean {\n const seen = new Set();\n for (const doc of documents) {\n const serialized = JSON.stringify(doc);\n if (seen.has(serialized)) {\n console.warn(\n 'Warning: embedding documents are not unique. Are you missing embed metadata?'\n );\n return false;\n }\n seen.add(serialized);\n }\n return true;\n}\n"],"mappings":"AAgBA,SAAS,SAAS;AAGlB,MAAM,kBAAkB,EAAE,OAAO;AAAA,EAC/B,MAAM,EAAE,MAAM,EAAE,SAAS;AAAA,EACzB,OAAO,EAAE,MAAM,EAAE,SAAS;AAC5B,CAAC;AAEM,MAAM,iBAAiB,gBAAgB,OAAO;AAAA;AAAA,EAEnD,MAAM,EAAE,OAAO;AACjB,CAAC;AAGM,MAAM,kBAAkB,gBAAgB,OAAO;AAAA,EACpD,OAAO,EAAE,OAAO;AAAA;AAAA,IAEd,aAAa,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,IAEjC,KAAK,EAAE,OAAO;AAAA,EAChB,CAAC;AACH,CAAC;AAGM,MAAM,aAAa,EAAE,MAAM,CAAC,gBAAgB,eAAe,CAAC;AAK5D,MAAM,qBAAqB,EAAE,OAAO;AAAA,EACzC,SAAS,EAAE,MAAM,UAAU;AAAA,EAC3B,UAAU,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,IAAI,CAAC,EAAE,SAAS;AACnD,CAAC;AAGD,SAAS,SAAY,OAAa;AAChC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AACA,SAAO,KAAK,MAAM,KAAK,UAAU,KAAK,CAAC;AACzC;AAMO,MAAM,SAAiC;AAAA,EAC5C;AAAA,EACA;AAAA,EAEA,YAAY,MAAoB;AAC9B,SAAK,UAAU,SAAS,KAAK,OAAO;AACpC,SAAK,WAAW,SAAS,KAAK,QAAQ;AAAA,EACxC;AAAA,EAEA,OAAO,SAAS,MAAc,UAAgC;AAC5D,WAAO,IAAI,SAAS;AAAA,MAClB,SAAS,CAAC,EAAE,KAAK,CAAC;AAAA,MAClB;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA,EAGA,OAAO,UACL,KACA,aACA,UACA;AACA,WAAO,IAAI,SAAS;AAAA,MAClB,SAAS;AAAA,QACP;AAAA,UACE,OAAO;AAAA,YACL;AAAA,YACA;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAAA;AAAA,EAGA,OAAO,SACL,MACA,UACA,UACA;AACA,QAAI,aAAa,QAAQ;AACvB,aAAO,KAAK,SAAS,MAAM,QAAQ;AAAA,IACrC;AACA,WAAO,KAAK,UAAU,MAAM,UAAU,QAAQ;AAAA,EAChD;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,IAAI,OAAe;AACjB,WAAO,KAAK,QAAQ,IAAI,CAAC,SAAS,KAAK,QAAQ,EAAE,EAAE,KAAK,EAAE;AAAA,EAC5D;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,IAAI,QAAiD;AACnD,WAAO,KAAK,QACT,OAAO,CAAC,SAAS,KAAK,SAAS,CAAC,KAAK,IAAI,EACzC,IAAI,CAAC,SAAS,KAAK,KAAM;AAAA,EAC9B;AAAA;AAAA;AAAA;AAAA,EAKA,IAAI,OAAe;AAEjB,QAAI,KAAK,MAAM;AACb,aAAO,KAAK;AAAA,IACd;AACA,QAAI,KAAK,OAAO;AACd,aAAO,KAAK,MAAM,CAAC,EAAE;AAAA,IACvB;AACA,WAAO;AAAA,EACT;AAAA;AAAA;AAAA;AAAA,EAKA,IAAI,WAA+B;AACjC,QAAI,KAAK,MAAM;AACb,aAAO;AAAA,IACT;AACA,QAAI,KAAK,SAAS,KAAK,MAAM,CAAC,EAAE,aAAa;AAC3C,aAAO,KAAK,MAAM,CAAC,EAAE;AAAA,IACvB;AACA,WAAO;AAAA,EACT;AAAA,EAEA,SAAuB;AACrB,WAAO;AAAA,MACL,SAAS,SAAS,KAAK,OAAO;AAAA,MAC9B,UAAU,SAAS,KAAK,QAAQ;AAAA,IAClC;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,sBAAsB,YAAqC;AACzD,QAAI,YAAwB,CAAC;AAC7B,eAAW,aAAa,YAAY;AAClC,UAAI,UAAU,KAAK,OAAO;AAC1B,UAAI,UAAU,UAAU;AACtB,YAAI,CAAC,QAAQ,UAAU;AACrB,kBAAQ,WAAW,CAAC;AAAA,QACtB;AACA,gBAAQ,SAAS,gBAAgB,UAAU;AAAA,MAC7C;AACA,gBAAU,KAAK,IAAI,SAAS,OAAO,CAAC;AAAA,IACtC;AACA,yBAAqB,SAAS;AAC9B,WAAO;AAAA,EACT;AACF;AAQO,SAAS,qBAAqB,WAAgC;AACnE,QAAM,OAAO,oBAAI,IAAI;AACrB,aAAW,OAAO,WAAW;AAC3B,UAAM,aAAa,KAAK,UAAU,GAAG;AACrC,QAAI,KAAK,IAAI,UAAU,GAAG;AACxB,cAAQ;AAAA,QACN;AAAA,MACF;AACA,aAAO;AAAA,IACT;AACA,SAAK,IAAI,UAAU;AAAA,EACrB;AACA,SAAO;AACT;","names":[]}