import { createMiddleware } from "../middleware.js";
import { z } from "zod/v3";
//#region src/agents/middleware/promptCaching.ts
const DEFAULT_ENABLE_CACHING = true;
const DEFAULT_TTL = "5m";
const DEFAULT_MIN_MESSAGES_TO_CACHE = 3;
const DEFAULT_UNSUPPORTED_MODEL_BEHAVIOR = "warn";
const contextSchema = z.object({
  enableCaching: z.boolean().optional(),
  ttl: z.enum(["5m", "1h"]).optional(),
  minMessagesToCache: z.number().optional(),
  unsupportedModelBehavior: z.enum(["ignore", "warn", "raise"]).optional()
});
var PromptCachingMiddlewareError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "PromptCachingMiddlewareError";
  }
};
/**
* Creates a prompt caching middleware for Anthropic models to optimize API usage.
*
 * This middleware automatically adds cache control metadata to the last message when using Anthropic models,
* enabling their prompt caching feature. This can significantly reduce costs for applications with repetitive
* prompts, long system messages, or extensive conversation histories.
*
* ## How It Works
*
* The middleware intercepts model requests and adds cache control metadata that tells Anthropic's
* API to cache processed prompt prefixes. On subsequent requests with matching prefixes, the
* cached representations are reused, skipping redundant token processing.
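 *
 * Concretely, the last content block of the final message ends up carrying a
 * `cache_control` marker shaped roughly like this (a sketch of what the code below
 * produces, shown with the default TTL):
 *
 * ```typescript
 * {
 *   type: "text",
 *   text: "...the latest message text...",
 *   cache_control: { type: "ephemeral", ttl: "5m" }
 * }
 * ```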
*
* ## Benefits
*
* - **Cost Reduction**: Avoid reprocessing the same tokens repeatedly (up to 90% savings on cached portions)
 * - **Lower Latency**: Cached prompt prefixes skip reprocessing, so responses start sooner
* - **Better Scalability**: Reduced computational load enables handling more requests
* - **Consistent Performance**: Stable response times for repetitive queries
*
* @param middlewareOptions - Configuration options for the caching behavior
* @param middlewareOptions.enableCaching - Whether to enable prompt caching (default: `true`)
* @param middlewareOptions.ttl - Cache time-to-live: `"5m"` for 5 minutes or `"1h"` for 1 hour (default: `"5m"`)
* @param middlewareOptions.minMessagesToCache - Minimum number of messages required before caching is applied (default: `3`)
 * @param middlewareOptions.unsupportedModelBehavior - How to handle non-Anthropic models: `"ignore"`, `"warn"`, or `"raise"` (default: `"warn"`)
*
* @returns A middleware instance that can be passed to `createAgent`
*
 * @throws {PromptCachingMiddlewareError} If used with a non-Anthropic model and `unsupportedModelBehavior` is set to `"raise"`
*
* @example
* Basic usage with default settings
* ```typescript
* import { createAgent } from "langchain";
* import { anthropicPromptCachingMiddleware } from "langchain";
*
* const agent = createAgent({
* model: "anthropic:claude-3-5-sonnet",
* middleware: [
* anthropicPromptCachingMiddleware()
* ]
* });
* ```
*
* @example
* Custom configuration for longer conversations
* ```typescript
* const cachingMiddleware = anthropicPromptCachingMiddleware({
* ttl: "1h", // Cache for 1 hour instead of default 5 minutes
* minMessagesToCache: 5 // Only cache after 5 messages
* });
*
* const agent = createAgent({
* model: "anthropic:claude-3-5-sonnet",
* systemPrompt: "You are a helpful assistant with deep knowledge of...", // Long system prompt
* middleware: [cachingMiddleware]
* });
* ```
*
* @example
* Conditional caching based on runtime context
 * ```typescript
 * import { createAgent, anthropicPromptCachingMiddleware } from "langchain";
 * import { HumanMessage } from "@langchain/core/messages";
 *
 * const agent = createAgent({
* model: "anthropic:claude-3-5-sonnet",
* middleware: [
* anthropicPromptCachingMiddleware({
* enableCaching: true,
* ttl: "5m"
* })
* ]
* });
*
* // Disable caching for specific requests
* await agent.invoke(
* { messages: [new HumanMessage("Process this without caching")] },
* {
* configurable: {
* middleware_context: { enableCaching: false }
* }
* }
* );
* ```
*
* @example
* Optimal setup for customer support chatbot
* ```typescript
* const supportAgent = createAgent({
* model: "anthropic:claude-3-5-sonnet",
* systemPrompt: `You are a customer support agent for ACME Corp.
*
* Company policies:
* - Always be polite and professional
* - Refer to knowledge base for product information
* - Escalate billing issues to human agents
* ... (extensive policies and guidelines)
* `,
* tools: [searchKnowledgeBase, createTicket, checkOrderStatus],
* middleware: [
* anthropicPromptCachingMiddleware({
* ttl: "1h", // Long TTL for stable system prompt
* minMessagesToCache: 1 // Cache immediately due to large system prompt
* })
* ]
* });
* ```
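 *
 * @example
 * Failing fast on non-Anthropic models (a sketch using the `unsupportedModelBehavior`
 * option; the default `"warn"` only logs a warning and skips caching)
 * ```typescript
 * const strictCaching = anthropicPromptCachingMiddleware({
 *   unsupportedModelBehavior: "raise" // throw instead of warning when the model isn't Anthropic
 * });
 *
 * const agent = createAgent({
 *   model: "anthropic:claude-3-5-sonnet",
 *   middleware: [strictCaching]
 * });
 * ```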
*
* @remarks
 * - **Anthropic Only**: This middleware only works with Anthropic models. With other providers it warns and skips caching by default; set `unsupportedModelBehavior` to `"raise"` to throw instead
 * - **Automatic Application**: Caching is applied automatically once the message count (including the system prompt, if present) reaches `minMessagesToCache`
* - **Cache Scope**: Caches are isolated per API key and cannot be shared across different keys
* - **TTL Options**: Only supports "5m" (5 minutes) and "1h" (1 hour) as TTL values per Anthropic's API
* - **Best Use Cases**: Long system prompts, multi-turn conversations, repetitive queries, RAG applications
 * - **Cost Impact**: Cache reads are billed at roughly 10% of the base input token price, while cache writes carry a surcharge of about 25% over the base price (see the rough estimate below)
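 *
 * As a rough illustration of the cost model (an estimate based on the relative rates
 * above and the 5-minute TTL write surcharge, not exact billing):
 * ```typescript
 * // Relative cost of a cached prefix of `tokens` tokens over `n` requests, in units of
 * // the base input token price (assumes one cache write followed by n - 1 cache reads).
 * const withCaching = (tokens: number, n: number) => tokens * (1.25 + 0.1 * (n - 1));
 * const withoutCaching = (tokens: number, n: number) => tokens * n;
 * // e.g. a 2,000-token prefix reused across 10 requests:
 * //   withCaching(2000, 10)    === 4300  token-equivalents
 * //   withoutCaching(2000, 10) === 20000 token-equivalents
 * ```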
*
* @see {@link createAgent} for agent creation
* @see {@link https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching} Anthropic's prompt caching documentation
* @public
*/
function anthropicPromptCachingMiddleware(middlewareOptions) {
  return createMiddleware({
    name: "PromptCachingMiddleware",
    contextSchema,
    wrapModelCall: (request, handler) => {
      /**
       * Prefer runtime context values over middleware options, then fall back to defaults.
       */
      const enableCaching = request.runtime.context.enableCaching ?? middlewareOptions?.enableCaching ?? DEFAULT_ENABLE_CACHING;
      const ttl = request.runtime.context.ttl ?? middlewareOptions?.ttl ?? DEFAULT_TTL;
      const minMessagesToCache = request.runtime.context.minMessagesToCache ?? middlewareOptions?.minMessagesToCache ?? DEFAULT_MIN_MESSAGES_TO_CACHE;
      const unsupportedModelBehavior = request.runtime.context.unsupportedModelBehavior ?? middlewareOptions?.unsupportedModelBehavior ?? DEFAULT_UNSUPPORTED_MODEL_BEHAVIOR;
      if (!enableCaching || !request.model) return handler(request);
      // Anthropic models are either a ChatAnthropic instance or a ConfigurableModel
      // whose default config points at the "anthropic" provider.
      const isAnthropicModel = request.model.getName() === "ChatAnthropic" || (request.model.getName() === "ConfigurableModel" && request.model._defaultConfig?.modelProvider === "anthropic");
      if (!isAnthropicModel) {
        const modelName = request.model.getName();
        const modelInfo = request.model.getName() === "ConfigurableModel" ? `${modelName} (${request.model._defaultConfig?.modelProvider})` : modelName;
        const baseMessage = `Unsupported model '${modelInfo}'. Prompt caching requires an Anthropic model`;
        if (unsupportedModelBehavior === "raise") throw new PromptCachingMiddlewareError(`${baseMessage} (e.g., 'anthropic:claude-4-0-sonnet').`);
        else if (unsupportedModelBehavior === "warn") console.warn(`PromptCachingMiddleware: Skipping caching for ${modelName}. Consider switching to an Anthropic model for caching benefits.`);
        return handler(request);
      }
      const messagesCount = request.state.messages.length + (request.systemPrompt ? 1 : 0);
      if (messagesCount < minMessagesToCache) return handler(request);
      /**
       * Add cache_control to the last message
       */
      const lastMessage = request.messages.at(-1);
      if (!lastMessage) return handler(request);
      // Rebuild the message with its original concrete class (HumanMessage, AIMessage, ...)
      // so downstream consumers still see the same message type.
      const NewMessageConstructor = Object.getPrototypeOf(lastMessage).constructor;
      if (Array.isArray(lastMessage.content)) {
        // Content is already a list of blocks: attach cache_control to the final block.
        const newMessage = new NewMessageConstructor({
          ...lastMessage,
          content: [...lastMessage.content.slice(0, -1), {
            ...lastMessage.content.at(-1),
            cache_control: { type: "ephemeral", ttl }
          }]
        });
        return handler({
          ...request,
          messages: [...request.messages.slice(0, -1), newMessage]
        });
      } else if (typeof lastMessage.content === "string") {
        // Plain string content: wrap it in a single text block carrying cache_control.
        const newMessage = new NewMessageConstructor({
          ...lastMessage,
          content: [{
            type: "text",
            text: lastMessage.content,
            cache_control: { type: "ephemeral", ttl }
          }]
        });
        return handler({
          ...request,
          messages: [...request.messages.slice(0, -1), newMessage]
        });
      }
      throw new PromptCachingMiddlewareError("Last message content is not a string or array");
    }
  });
}
//#endregion
export { anthropicPromptCachingMiddleware };
//# sourceMappingURL=promptCaching.js.map