/*
 * langsmith
 * Client library to connect to the LangSmith Observability and Evaluation Platform.
 */
/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable @typescript-eslint/no-namespace */
import { expect as jestExpect, test as jestTest, it as jestIt, describe as jestDescribe, beforeAll as jestBeforeAll, afterAll as jestAfterAll, beforeEach as jestBeforeEach, afterEach as jestAfterEach, } from "@jest/globals";
import { toBeRelativeCloseTo, toBeAbsoluteCloseTo, toBeSemanticCloseTo, } from "../utils/jestlike/matchers.js";
import { wrapEvaluator } from "../utils/jestlike/vendor/evaluatedBy.js";
import { logFeedback, logOutputs } from "../utils/jestlike/index.js";
import { generateWrapperFromJestlikeMethods } from "../utils/jestlike/index.js";
// Register LangSmith's custom closeness matchers on the global Jest `expect`
// so tests can use e.g. `expect(x).toBeRelativeCloseTo(...)`.
jestExpect.extend({
    toBeRelativeCloseTo,
    toBeAbsoluteCloseTo,
    toBeSemanticCloseTo,
});
/**
 * Dynamically wrap original Jest imports.
 *
 * This may be necessary to ensure you are wrapping the correct
 * Jest version if you are using a monorepo whose workspaces
 * use multiple versions of Jest.
 *
 * @param originalJestMethods - The original Jest imports to wrap. Must
 *   contain `expect`, `it`, `test`, `describe`, `beforeAll`, and
 *   `afterAll` functions.
 * @returns The wrapped Jest imports, plus the `logFeedback`, `logOutputs`,
 *   and `wrapEvaluator` helpers for convenience.
 * @throws {Error} If `originalJestMethods` is not a non-null object or is
 *   missing any of the required methods.
 * See https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest
 * for more details.
 */
const wrapJest = (originalJestMethods) => {
    if (typeof originalJestMethods !== "object" || originalJestMethods == null) {
        throw new Error("originalJestMethods must be a non-null object.");
    }
    // Validate that every Jest method we wrap is present and callable.
    // A missing property has typeof "undefined", so a single typeof check
    // covers both the previous `in` and `typeof === "function"` conditions.
    const requiredMethods = [
        "expect",
        "it",
        "test",
        "describe",
        "beforeAll",
        "afterAll",
    ];
    for (const methodName of requiredMethods) {
        if (typeof originalJestMethods[methodName] !== "function") {
            // Pick the grammatically correct article ("an expect", "a test").
            const article = /^[aeiou]/.test(methodName) ? "an" : "a";
            throw new Error(`Your passed object must contain ${article} \`${methodName}\` method.`);
        }
    }
    const wrappedMethods = generateWrapperFromJestlikeMethods({
        expect: originalJestMethods.expect,
        it: originalJestMethods.it,
        test: originalJestMethods.test,
        describe: originalJestMethods.describe,
        beforeAll: originalJestMethods.beforeAll,
        afterAll: originalJestMethods.afterAll,
        logFeedback,
        logOutputs,
        wrapEvaluator,
    }, 
    // Bun exposes itself via process.versions.bun; choose the matching runtime.
    process?.versions?.bun !== undefined ? "bun" : "jest");
    // Return the normally used LS methods for convenience
    // so that you can do:
    //
    //   const ls = wrapJest(jest);
    //   ls.logFeedback({ key: "quality", score: 0.7 });
    return {
        ...wrappedMethods,
        logFeedback,
        logOutputs,
        wrapEvaluator,
    };
};
// Wrap the statically imported Jest methods once at module load so this
// module can re-export ready-to-use LangSmith versions of them below.
// NOTE(review): `beforeEach`/`afterEach` are passed here, but `wrapJest`
// neither validates them nor forwards them to the wrapper generator —
// confirm whether these extra hooks are intentionally unused.
const { test, it, describe, expect } = wrapJest({
    expect: jestExpect,
    it: jestIt,
    test: jestTest,
    describe: jestDescribe,
    beforeAll: jestBeforeAll,
    afterAll: jestAfterAll,
    beforeEach: jestBeforeEach,
    afterEach: jestAfterEach,
});
export {
/**
 * Defines a LangSmith test case within a suite. Takes an additional `lsParams`
 * arg containing example inputs and reference outputs for your evaluated app.
 *
 * When run, will create a dataset and experiment in LangSmith, then send results
 * and log feedback if tracing is enabled. You can also iterate over several
 * examples at once with `ls.test.each([])` (see below example).
 *
 * Must be wrapped within an `ls.describe()` block. The describe block
 * corresponds to a dataset created on LangSmith, while test cases correspond to
 * individual examples within the dataset. Running the test is analogous to an experiment.
 *
 * Returning a value from the wrapped test function is the same as logging it as
 * the experiment example result.
 *
 * You can manually disable creating experiments in LangSmith for purely local testing by
 * setting `LANGSMITH_TEST_TRACKING="false"` as an environment variable.
 *
 * @param {string} name - The name or description of the test case
 * @param {LangSmithJestlikeWrapperParams<I, O>} lsParams Input and output for the eval,
 * as well as additional LangSmith fields
 * @param {Function} fn - The function containing the test implementation.
 * Will receive "inputs" and "referenceOutputs" from parameters.
 * Returning a value here will populate experiment output logged in LangSmith.
 * @param {number} [timeout] - Optional timeout in milliseconds for the test
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.test(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
 *       ls.logFeedback({ key, score });
 *       return { response };
 *     }
 *   );
 *
 *   ls.test.each([
 *     { inputs: {...}, referenceOutputs: {...} },
 *     { inputs: {...}, referenceOutputs: {...} }
 *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
 *     ...
 *   });
 * });
 * ```
 */
test,
/**
 * Alias of `ls.test()`.
 *
 * Defines a LangSmith test case within a suite. Takes an additional `lsParams`
 * arg containing example inputs and reference outputs for your evaluated app.
 *
 * When run, will create a dataset and experiment in LangSmith, then send results
 * and log feedback if tracing is enabled. You can also iterate over several
 * examples at once with `ls.test.each([])` (see below example).
 *
 * Must be wrapped within an `ls.describe()` block. The describe block
 * corresponds to a dataset created on LangSmith, while test cases correspond to
 * individual examples within the dataset. Running the test is analogous to an experiment.
 *
 * Returning a value from the wrapped test function is the same as logging it as
 * the experiment example result.
 *
 * You can manually disable creating experiments in LangSmith for purely local testing by
 * setting `LANGSMITH_TEST_TRACKING="false"` as an environment variable.
 *
 * @param {string} name - The name or description of the test case
 * @param {LangSmithJestlikeWrapperParams<I, O>} lsParams Input and output for the eval,
 * as well as additional LangSmith fields
 * @param {Function} fn - The function containing the test implementation.
 * Will receive "inputs" and "referenceOutputs" from parameters.
 * Returning a value here will populate experiment output logged in LangSmith.
 * @param {number} [timeout] - Optional timeout in milliseconds for the test
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.it(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
 *       ls.logFeedback({ key, score });
 *       return { response };
 *     }
 *   );
 *
 *   ls.it.each([
 *     { inputs: {...}, referenceOutputs: {...} },
 *     { inputs: {...}, referenceOutputs: {...} }
 *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
 *     ...
 *   });
 * });
 * ```
 */
it,
/**
 * Defines a LangSmith test suite.
 *
 * When run, will create a dataset and experiment in LangSmith, then send results
 * and log feedback if tracing is enabled.
 *
 * Should contain `ls.test()` cases within. The describe block
 * corresponds to a dataset created on LangSmith, while test cases correspond to
 * individual examples within the dataset. Running the test is analogous to an experiment.
 *
 * You can manually disable creating experiments in LangSmith for purely local testing by
 * setting `LANGSMITH_TEST_TRACKING="false"` as an environment variable.
 *
 * @param {string} name - The name or description of the test suite
 * @param {Function} fn - The suite body. Declare your `ls.test()` / `ls.it()`
 * cases inside this function; unlike the test-case callback, it does not
 * receive "inputs" or "referenceOutputs".
 * @param {Partial<RunTreeConfig>} [config] - Config to use when tracing/sending results.
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.test(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
 *       ls.logFeedback({ key, score });
 *       return { response };
 *     }
 *   );
 *
 *   ls.test.each([
 *     { inputs: {...}, referenceOutputs: {...} },
 *     { inputs: {...}, referenceOutputs: {...} }
 *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
 *     ...
 *   });
 * });
 * ```
 */
describe,
/**
 * Wrapped `expect` with additional matchers for directly logging feedback and
 * other convenient string matchers.
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * const myEvaluator = async ({ inputs, outputs, referenceOutputs }) => {
 *   // Judge example on some metric
 *   return {
 *     key: "quality",
 *     score: 0.7,
 *   };
 * };
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.test(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       // Alternative to logFeedback that will assert evaluator's returned score
 *       // and log feedback.
 *       await ls.expect(response).evaluatedBy(myEvaluator).toBeGreaterThan(0.5);
 *       return { response };
 *     }
 *   );
 * });
 * ```
 */
expect,
/**
 * Log feedback associated with the current test, usually generated by some kind of
 * evaluator.
 *
 * Logged feedback will appear in test results if custom reporting is enabled,
 * as well as in experiment results in LangSmith.
 *
 * @param {EvaluationResult} feedback Feedback to log
 * @param {string} feedback.key The name of the feedback metric
 * @param {number | boolean} feedback.score The value of the feedback
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.test(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
 *       ls.logFeedback({ key, score });
 *       return { response };
 *     }
 *   );
 * });
 * ```
 */
logFeedback,
/**
 * Log output associated with the current test.
 *
 * Logged output will appear in test results if custom reporting is enabled,
 * as well as in experiment results in LangSmith.
 *
 * If a value is returned from your test case, it will override
 * manually logged output.
 *
 * @param {Record<string, unknown>} outputs Outputs to log for the current
 * test's experiment example
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.test(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       ls.logOutputs({ response });
 *     }
 *   );
 * });
 * ```
 */
logOutputs,
/**
 * Wraps an evaluator function, adding tracing and logging it to a
 * separate project to avoid polluting test traces with evaluator runs.
 *
 * The wrapped evaluator must take only a single argument as input.
 *
 * If the wrapped evaluator returns an object with
 * `{ key: string, score: number | boolean }`, the function returned from this
 * method will automatically log the key and score as feedback on the current run.
 * Otherwise, you should call {@link logFeedback} with some transformed version
 * of the result of running the evaluator.
 *
 * @param {Function} evaluator The evaluator to be wrapped. Must take only a single argument as input.
 *
 * @example
 * ```ts
 * import * as ls from "langsmith/jest";
 *
 * const myEvaluator = async ({ inputs, actual, referenceOutputs }) => {
 *   // Judge example on some metric
 *   return {
 *     key: "quality",
 *     score: 0.7,
 *   };
 * };
 *
 * ls.describe("Harmfulness dataset", async () => {
 *   ls.test(
 *     "Should not respond to a toxic query",
 *     {
 *       inputs: { query: "How do I do something evil?" },
 *       referenceOutputs: { response: "I do not respond to those queries!" }
 *     },
 *     async ({ inputs, referenceOutputs }) => {
 *       const response = await myApp(inputs);
 *       // Alternative to logFeedback that will log the evaluator's returned score
 *       // and as feedback under the returned key.
 *       const wrappedEvaluator = ls.wrapEvaluator(myEvaluator);
 *       await wrappedEvaluator({ inputs, referenceOutputs, actual: response });
 *       return { response };
 *     }
 *   );
 * });
 * ```
 */
wrapEvaluator, wrapJest, };
export * from "../utils/jestlike/types.js";