neuronpedia

Version:

Neuronpedia (Official)

1 lines • 14.1 kB

Source Map (JSON)

{"version":3,"sources":["../src/autointerp/score.ts","../src/types/autointerp/score/embedding.ts","../src/types/autointerp/score/fuzz_detection.ts","../src/const.ts","../src/autointerp/explain.ts","../src/types/autointerp/explain/default.ts","../src/types/webapp/api/sae_eval.ts"],"sourcesContent":["import axios from \"axios\";\nimport { postScoreEmbedding } from \"../types/autointerp/score/embedding\";\nimport { postScoreFuzzDetection } from \"../types/autointerp/score/fuzz_detection\";\nimport type { NPActivation } from \"../types/common/activation\";\nimport { AUTOINTERP_SERVER } from \"../const\";\n\nexport async function getEleutherScoreFuzzDetection(\n type: \"fuzz\" | \"detection\",\n activations: NPActivation[],\n explanation: string,\n openRouterKey: string,\n openRouterModel: string,\n inferenceServerSecret: string\n) {\n axios.defaults.baseURL = AUTOINTERP_SERVER;\n const result = await postScoreFuzzDetection({\n type: type,\n activations: activations,\n explanation: explanation,\n openrouter_key: openRouterKey,\n model: openRouterModel,\n secret: inferenceServerSecret,\n });\n if (result.status !== 200 || !result.data) {\n throw new Error(\n \"Failed to generate score: \" + JSON.stringify(result.statusText)\n );\n }\n return result.data;\n}\n\nexport async function getEleutherScoreEmbedding(\n activations: NPActivation[],\n explanation: string,\n inferenceServerSecret: string\n) {\n axios.defaults.baseURL = AUTOINTERP_SERVER;\n const result = await postScoreEmbedding({\n activations: activations,\n explanation: explanation,\n secret: inferenceServerSecret,\n });\n if (result.status !== 200 || !result.data) {\n throw new Error(\n \"Failed to generate score: \" + JSON.stringify(result.statusText)\n );\n }\n return result.data;\n}\n","/**\n * Generated by orval v7.4.1 🍺\n * Do not edit manually.\n * Auto-Interp Server API\n * API for generating and scoring explanations of neuron/feature behavior\n * OpenAPI spec version: 1.0.0\n */\nimport axios from 'axios'\nimport type {\n AxiosRequestConfig,\n AxiosResponse\n} from 'axios'\n/**\n * An activation record containing tokens and their corresponding activation values\n */\nexport interface NPActivation {\n /** List of tokens for this text */\n tokens: string[];\n /** Activation values corresponding to each token */\n values: number[];\n}\n\nexport interface NPScoreEmbeddingResponse {\n /** The score from 0 to 1 */\n score: number;\n /** Detailed breakdown of the embedding outputs */\n breakdown: NPScoreEmbeddingOutput[];\n}\n\n/**\n * Request model for scoring explanations using embedding similarity\n */\nexport interface NPScoreEmbeddingRequest {\n /** List of activation records to analyze */\n activations: NPActivation[];\n /** The explanation to evaluate */\n explanation: string;\n /** Authentication secret for the API */\n secret: string;\n}\n\n/**\n * Quantile or neighbor distance\n */\nexport type NPScoreEmbeddingOutputDistance = number | number;\n\n/**\n * The \"scorer.__call__\" result's score breakdown. With exception of fixing similarity to change to number instead of array of number, type is copied from https://github.com/EleutherAI/sae-auto-interp/blob/3659ff3bfefbe2628d37484e5bcc0087a5b10a27/sae_auto_interp/scorers/embedding/embedding.py#L20\n */\nexport interface NPScoreEmbeddingOutput {\n /** The text that was used to evaluate the similarity */\n text: string;\n /** Quantile or neighbor distance */\n distance: NPScoreEmbeddingOutputDistance;\n /** What is the similarity of the example to the explanation */\n similarity: number;\n}\n\n\n\n\n\n /**\n * @summary Score an explanation using embedding similarity, using the dunzhang/stella_en_400M_v5 model.\n */\nexport const postScoreEmbedding = <TData = AxiosResponse<NPScoreEmbeddingResponse>>(\n nPScoreEmbeddingRequest: NPScoreEmbeddingRequest, options?: AxiosRequestConfig\n ): Promise<TData> => {\n return axios.post(\n `/score/embedding`,\n nPScoreEmbeddingRequest,options\n );\n }\n\nexport type PostScoreEmbeddingResult = AxiosResponse<NPScoreEmbeddingResponse>\n","/**\n * Generated by orval v7.4.1 🍺\n * Do not edit manually.\n * Auto-Interp Server API\n * API for generating and scoring explanations of neuron/feature behavior\n * OpenAPI spec version: 1.0.0\n */\nimport axios from 'axios'\nimport type {\n AxiosRequestConfig,\n AxiosResponse\n} from 'axios'\n/**\n * An activation record containing tokens and their corresponding activation values\n */\nexport interface NPActivation {\n /** List of tokens for this text */\n tokens: string[];\n /** Activation values corresponding to each token */\n values: number[];\n}\n\n/**\n * Request model for scoring explanations using fuzzing or detection methods\n */\nexport interface NPScoreFuzzDetectionRequest {\n /** List of activation records to analyze */\n activations: NPActivation[];\n /** The explanation to evaluate */\n explanation: string;\n /** API key for OpenRouter service */\n openrouter_key: string;\n /** Model identifier to use for scoring */\n model: string;\n /** Type of scoring to perform - either fuzzing or detection */\n type: NPScoreFuzzDetectionType;\n /** Authentication secret for the API */\n secret: string;\n}\n\n/**\n * Quantile or neighbor distance\n */\nexport type NPScoreClassifierOutputDistance = number | number;\n\n/**\n * The \"scorer.__call__\" result's score breakdown. Type copied from https://github.com/EleutherAI/sae-auto-interp/blob/3659ff3bfefbe2628d37484e5bcc0087a5b10a27/sae_auto_interp/scorers/classifier/sample.py#L19\n */\nexport interface NPScoreClassifierOutput {\n /** List of strings */\n str_tokens?: string[];\n /** List of floats */\n activations?: number[];\n /** Quantile or neighbor distance */\n distance?: NPScoreClassifierOutputDistance;\n /** Whether the example is activating or not */\n ground_truth?: boolean;\n /** Whether the model predicted the example activating or not */\n prediction?: boolean;\n /** Whether the sample is highlighted */\n highlighted?: boolean;\n /** The probability of the example activating */\n probability?: number;\n /** Whether the prediction is correct */\n correct?: boolean;\n}\n\nexport interface NPScoreFuzzDetectionResponse {\n /** The score from 0 to 1 */\n score: number;\n /** Detailed breakdown of the classification outputs */\n breakdown: NPScoreClassifierOutput[];\n}\n\n/**\n * Type of scoring method, either fuzz or detection\n */\nexport type NPScoreFuzzDetectionType = typeof NPScoreFuzzDetectionType[keyof typeof NPScoreFuzzDetectionType];\n\n\n// eslint-disable-next-line @typescript-eslint/no-redeclare\nexport const NPScoreFuzzDetectionType = {\n fuzz: 'fuzz',\n detection: 'detection',\n} as const;\n\n\n\n\n\n /**\n * @summary Score an explanation using fuzzing or detection methods\n */\nexport const postScoreFuzzDetection = <TData = AxiosResponse<NPScoreFuzzDetectionResponse>>(\n nPScoreFuzzDetectionRequest: NPScoreFuzzDetectionRequest, options?: AxiosRequestConfig\n ): Promise<TData> => {\n return axios.post(\n `/score/fuzz_detection`,\n nPScoreFuzzDetectionRequest,options\n );\n }\n\nexport type PostScoreFuzzDetectionResult = AxiosResponse<NPScoreFuzzDetectionResponse>\n","// these should be defined in the environment of the app importing this package\nconst USE_LOCAL_AUTOINTERP = process.env.USE_LOCAL_AUTOINTERP === \"true\";\nconst AUTOINTERP_SERVER = USE_LOCAL_AUTOINTERP\n ? \"http://127.0.0.1:5003\"\n : process.env.AUTOINTERP_SERVER;\n\nexport { AUTOINTERP_SERVER };\n","import axios from \"axios\";\nimport { postExplainDefault } from \"../types/autointerp/explain/default\";\nimport type { NPActivation } from \"../types/common/activation\";\nimport { AUTOINTERP_SERVER } from \"../const\";\n\nexport async function getEleutherExplanationDefault(\n activations: NPActivation[],\n openRouterKey: string,\n openRouterModel: string,\n inferenceServerSecret: string\n) {\n axios.defaults.baseURL = AUTOINTERP_SERVER;\n const result = await postExplainDefault({\n activations: activations,\n openrouter_key: openRouterKey,\n model: openRouterModel,\n secret: inferenceServerSecret,\n });\n\n if (result.status !== 200 || !result.data) {\n throw new Error(\n \"Failed to generate explanation: \" + JSON.stringify(result.statusText)\n );\n }\n return result.data;\n}\n","/**\n * Generated by orval v7.4.1 🍺\n * Do not edit manually.\n * Auto-Interp Server API - Explanation Generation\n * Generating an explanation via OpenRouter using the default method\n * OpenAPI spec version: 1.0.0\n */\nimport axios from 'axios'\nimport type {\n AxiosRequestConfig,\n AxiosResponse\n} from 'axios'\n/**\n * An activation record containing tokens and their corresponding activation values\n */\nexport interface NPActivation {\n /** List of tokens for this text */\n tokens: string[];\n /** Activation values corresponding to each token */\n values: number[];\n}\n\nexport interface NPExplainDefaultResponse {\n /** The generated explanation for the given set of activations */\n explanation: string;\n}\n\n/**\n * Request model for generating explanations of neuron/feature behavior\n */\nexport interface NPExplainDefaultRequest {\n /** List of activation records to analyze */\n activations: NPActivation[];\n /** API key for OpenRouter service */\n openrouter_key: string;\n /** Model identifier to use for explanation generation */\n model: string;\n /** Authentication secret for the API */\n secret: string;\n}\n\n\n\n\n\n /**\n * @summary Generate an explanation for neuron/feature behavior using the default explainer\n */\nexport const postExplainDefault = <TData = AxiosResponse<NPExplainDefaultResponse>>(\n nPExplainDefaultRequest: NPExplainDefaultRequest, options?: AxiosRequestConfig\n ): Promise<TData> => {\n return axios.post(\n `/explain/default`,\n nPExplainDefaultRequest,options\n );\n }\n\nexport type PostExplainDefaultResult = AxiosResponse<NPExplainDefaultResponse>\n","/**\n * Generated by orval v7.4.1 🍺\n * Do not edit manually.\n * Fetch SAE eval results\n * OpenAPI spec version: 1.0.0\n */\nimport axios from 'axios'\nimport type {\n AxiosRequestConfig,\n AxiosResponse\n} from 'axios'\nexport interface NPSaeEvalListResponse {\n /** List of SAE eval types and their results */\n evalTypes?: NPSaeEvalType[];\n}\n\nexport interface NPSaeEvalListRequest { [key: string]: unknown }\n\n/**\n * An SAE eval result containing the outputs (metrics, config, etc) of the eval\n */\nexport interface NPSaeEval {\n /** ID of the SAE Eval */\n id: string;\n /** SAE Eval Type - should equal the \"name\" property of NPSaeEvalType */\n typeName: string;\n /** The model ID of the SAE being evaluated */\n modelId: string;\n /** The source ID of the SAE being evaluated */\n sourceId: string;\n /** The output data, matching the outputSchema of the NPSaeEvalType, with the exception of eval_result_details, which will go under detailedMetrics */\n output: string;\n /** The eval_result_details of the output data. Kept in a separate field/column due to large size. */\n detailedMetrics?: string;\n}\n\n/**\n * A type of SAE Eval (eg absorption, sparse probing, etc). Can contain an array of the actual eval results.\n */\nexport interface NPSaeEvalType {\n /** ID of the Eval Type */\n name: string;\n /** A UI-friendly display name of the eval type */\n displayName: string;\n /** A brief description of the eval */\n description: string;\n /** JSON output schema used to parse the eval output results. From the SAEBench repo - for example https://github.com/adamkarvonen/SAEBench/blob/main/sae_bench/evals/autointerp/eval_output_schema_autointerp.json */\n outputSchema: string;\n /** URL to the eval */\n url?: string;\n /** The actual eval results of this type */\n evals?: NPSaeEval[];\n}\n\n\n\n\n\n /**\n * @summary List all SAE Evals\n */\nexport const postApiSaeEval = <TData = AxiosResponse<NPSaeEvalListResponse>>(\n nPSaeEvalListRequest: NPSaeEvalListRequest, options?: AxiosRequestConfig\n ): Promise<TData> => {\n return axios.post(\n `/api/sae-eval`,\n nPSaeEvalListRequest,options\n );\n }\n\nexport type PostApiSaeEvalResult = AxiosResponse<NPSaeEvalListResponse>\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;AAAA,OAAOA,YAAW;;;ACOlB,OAAO,WAAW;AA0DX,IAAM,qBAAqB,CAC9B,yBAAkD,YAChC;AAClB,SAAO,MAAM;AAAA,IACX;AAAA,IACA;AAAA,IAAwB;AAAA,EAC1B;AACF;;;ACjEF,OAAOC,YAAW;AA0EX,IAAM,2BAA2B;AAAA,EACtC,MAAM;AAAA,EACN,WAAW;AACb;AASO,IAAM,yBAAyB,CAClC,6BAA0D,YACxC;AAClB,SAAOA,OAAM;AAAA,IACX;AAAA,IACA;AAAA,IAA4B;AAAA,EAC9B;AACF;;;ACnGF,IAAM,uBAAuB,QAAQ,IAAI,yBAAyB;AAClE,IAAM,oBAAoB,uBACtB,0BACA,QAAQ,IAAI;;;AHEhB,SAAsB,8BACpB,MACA,aACA,aACA,eACA,iBACA,uBACA;AAAA;AACA,IAAAC,OAAM,SAAS,UAAU;AACzB,UAAM,SAAS,MAAM,uBAAuB;AAAA,MAC1C;AAAA,MACA;AAAA,MACA;AAAA,MACA,gBAAgB;AAAA,MAChB,OAAO;AAAA,MACP,QAAQ;AAAA,IACV,CAAC;AACD,QAAI,OAAO,WAAW,OAAO,CAAC,OAAO,MAAM;AACzC,YAAM,IAAI;AAAA,QACR,+BAA+B,KAAK,UAAU,OAAO,UAAU;AAAA,MACjE;AAAA,IACF;AACA,WAAO,OAAO;AAAA,EAChB;AAAA;AAEA,SAAsB,0BACpB,aACA,aACA,uBACA;AAAA;AACA,IAAAA,OAAM,SAAS,UAAU;AACzB,UAAM,SAAS,MAAM,mBAAmB;AAAA,MACtC;AAAA,MACA;AAAA,MACA,QAAQ;AAAA,IACV,CAAC;AACD,QAAI,OAAO,WAAW,OAAO,CAAC,OAAO,MAAM;AACzC,YAAM,IAAI;AAAA,QACR,+BAA+B,KAAK,UAAU,OAAO,UAAU;AAAA,MACjE;AAAA,IACF;AACA,WAAO,OAAO;AAAA,EAChB;AAAA;;;AIhDA,OAAOC,YAAW;;;ACOlB,OAAOC,YAAW;AAyCX,IAAM,qBAAqB,CAC9B,yBAAkD,YAChC;AAClB,SAAOA,OAAM;AAAA,IACX;AAAA,IACA;AAAA,IAAwB;AAAA,EAC1B;AACF;;;ADlDF,SAAsB,8BACpB,aACA,eACA,iBACA,uBACA;AAAA;AACA,IAAAC,OAAM,SAAS,UAAU;AACzB,UAAM,SAAS,MAAM,mBAAmB;AAAA,MACtC;AAAA,MACA,gBAAgB;AAAA,MAChB,OAAO;AAAA,MACP,QAAQ;AAAA,IACV,CAAC;AAED,QAAI,OAAO,WAAW,OAAO,CAAC,OAAO,MAAM;AACzC,YAAM,IAAI;AAAA,QACR,qCAAqC,KAAK,UAAU,OAAO,UAAU;AAAA,MACvE;AAAA,IACF;AACA,WAAO,OAAO;AAAA,EAChB;AAAA;;;AEnBA,OAAOC,YAAW;AAuDX,IAAM,iBAAiB,CAC1B,sBAA4C,YAC1B;AAClB,SAAOA,OAAM;AAAA,IACX;AAAA,IACA;AAAA,IAAqB;AAAA,EACvB;AACF;","names":["axios","axios","axios","axios","axios","axios","axios"]}