UNPKG

@7-docs/cli

Version:

Command-line tool for 7-docs that ingests content and stores vector embeddings

95 lines (94 loc) 4.14 kB
import { OpenAI } from '@7-docs/edge'; import { CHUNK_SIZE, OPENAI_EMBEDDING_MODEL } from '@7-docs/shared'; import { Algolia } from '../client/algolia.js'; import { Pinecone } from '../client/pinecone.js'; import { Supabase } from '../client/supabase.js'; import { OPENAI_API_KEY } from '../env.js'; import { fetchDocuments, sources } from '../fetcher/index.js'; import { parseDocument } from '../parser/index.js'; import { generateId } from '../util/crypto.js'; import ora from '../util/ora.js'; import { getInitUsage, addTokens } from '../util/usage.js'; const targets = { Pinecone, Supabase, Algolia }; const dummyEmbedding = { embeddings: [], usage: getInitUsage() }; const isValidSource = (source) => Boolean(source && source in sources); const isValidTarget = (target) => Boolean(target && target in targets); export const ingest = async (options) => { const { source, sourceIdentifiers, ignore, repo, db, namespace, isDryRun, isSkipEmbeddings } = options; if (!isValidSource(source)) throw new Error(`Invalid --source: ${source}`); if (!isValidTarget(db)) throw new Error(`Invalid --db: ${db}`); if (source === 'github' && !repo) throw new Error('No --repo provided'); const client = new OpenAI(OPENAI_API_KEY); const spinner = ora(`Fetching files`).start(); const files = await fetchDocuments(source, sourceIdentifiers, { repo, ignore }); spinner.succeed(); if (files.length > 0) { const spinner = ora('Creating and upserting vector embeddings').start(); const DB = new targets[db](); const counters = { files: files.length, vectors: 0, usage: getInitUsage() }; try { for (const file of files) { const { content, url, filePath } = file; if (!content) continue; spinner.text = `Creating and upserting vector embedding for: ${filePath}`; const { title, sections } = await parseDocument(filePath, content, CHUNK_SIZE); if (isDryRun) continue; const requests = sections.map(section => { return isSkipEmbeddings ? 
dummyEmbedding : client.createEmbeddings({ input: section.content, model: OPENAI_EMBEDDING_MODEL }); }); const responses = await Promise.all(requests); const embeddings = responses.flatMap(response => response.embeddings); const vectors = sections.map((section, index) => { const values = embeddings[index] || []; const id = generateId(filePath + '\n' + section.content.trim()); const metadata = { title, url, filePath, content: section.content, header: section.header, tags: section.tags }; return { id, values, metadata }; }); const insertedVectorCount = await DB.upsertVectors({ namespace, vectors }); counters.vectors += insertedVectorCount; const usages = responses.map(response => response.usage); counters.usage = addTokens(counters.usage, usages); } spinner.succeed('Creating and upserting vectors'); } catch (error) { if (error instanceof Error) spinner.fail(error.message); else throw error; } finally { const messages = [ `Fetched ${counters.files} file(s) from ${source}`, `used ${counters.usage.total_tokens} OpenAI tokens`, `upserted ${counters.vectors} vectors to ${db}` ]; ora(messages.join(', ')).info(); } } else { throw new Error(`Unable to find files to ingest (source: ${source}, patterns: ${sourceIdentifiers.join(',')})`); } };