UNPKG

@langchain/community

Version:
1 lines 9.41 kB
{"version":3,"file":"apify_dataset.cjs","names":["BaseDocumentLoader","AsyncCaller","ApifyClient"],"sources":["../../../src/document_loaders/web/apify_dataset.ts"],"sourcesContent":["/* oxlint-disable typescript/no-explicit-any */\n\nimport {\n ActorCallOptions,\n ApifyClient,\n ApifyClientOptions,\n TaskCallOptions,\n} from \"apify-client\";\n\nimport { Document } from \"@langchain/core/documents\";\nimport {\n AsyncCaller,\n AsyncCallerParams,\n} from \"@langchain/core/utils/async_caller\";\nimport { getEnvironmentVariable } from \"@langchain/core/utils/env\";\nimport {\n BaseDocumentLoader,\n DocumentLoader,\n} from \"@langchain/core/document_loaders/base\";\n\n/**\n * A type that represents a function that takes a single object (an Apify\n * dataset item) and converts it to an instance of the Document class.\n *\n * Change function signature to only be asynchronous for simplicity in v0.1.0\n * https://github.com/langchain-ai/langchainjs/pull/3262\n */\nexport type ApifyDatasetMappingFunction<Metadata extends Record<string, any>> =\n (\n item: Record<string | number, unknown>\n ) =>\n | Document<Metadata>\n | Array<Document<Metadata>>\n | Promise<Document<Metadata> | Array<Document<Metadata>>>;\n\nexport interface ApifyDatasetLoaderConfig<\n Metadata extends Record<string, any>,\n> extends AsyncCallerParams {\n datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;\n clientOptions?: ApifyClientOptions;\n}\n\n/**\n * A class that extends the BaseDocumentLoader and implements the\n * DocumentLoader interface. It represents a document loader that loads\n * documents from an Apify dataset.\n * @example\n * ```typescript\n * const loader = new ApifyDatasetLoader(\"your-dataset-id\", {\n * datasetMappingFunction: (item) =>\n * new Document({\n * pageContent: item.text || \"\",\n * metadata: { source: item.url },\n * }),\n * clientOptions: {\n * token: \"your-apify-token\",\n * },\n * });\n *\n * const docs = await loader.load();\n *\n * const chain = new RetrievalQAChain();\n * const res = await chain.invoke({ query: \"What is LangChain?\" });\n *\n * console.log(res.text);\n * console.log(res.sourceDocuments.map((d) => d.metadata.source));\n * ```\n */\nexport class ApifyDatasetLoader<Metadata extends Record<string, any>>\n extends BaseDocumentLoader\n implements DocumentLoader\n{\n protected apifyClient: ApifyClient;\n\n protected datasetId: string;\n\n protected datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;\n\n protected caller: AsyncCaller;\n\n constructor(datasetId: string, config: ApifyDatasetLoaderConfig<Metadata>) {\n super();\n const { clientOptions, datasetMappingFunction, ...asyncCallerParams } =\n config;\n this.apifyClient = ApifyDatasetLoader._getApifyClient(clientOptions);\n this.datasetId = datasetId;\n this.datasetMappingFunction = datasetMappingFunction;\n this.caller = new AsyncCaller(asyncCallerParams);\n }\n\n /**\n * Creates an instance of the ApifyClient class with the provided clientOptions.\n * Adds a User-Agent header to the request config for langchainjs attribution.\n * @param clientOptions\n * @private\n */\n private static _getApifyClient(\n clientOptions?: ApifyClientOptions\n ): ApifyClient {\n const token = ApifyDatasetLoader._getApifyApiToken(clientOptions);\n const updatedClientOptions = {\n ...clientOptions,\n token,\n requestInterceptors: [\n ...(clientOptions?.requestInterceptors ?? []),\n ApifyDatasetLoader._addUserAgent,\n ],\n };\n return new ApifyClient({ ...updatedClientOptions, token });\n }\n\n private static _getApifyApiToken(config?: { token?: string }) {\n return config?.token ?? getEnvironmentVariable(\"APIFY_API_TOKEN\");\n }\n\n /**\n * Adds a User-Agent header to the request config.\n * @param config\n * @private\n */\n private static _addUserAgent(config: any): any {\n const updatedConfig = { ...config };\n updatedConfig.headers ??= {};\n updatedConfig.headers[\"User-Agent\"] = `${\n updatedConfig.headers[\"User-Agent\"] ?? \"\"\n }; Origin/langchainjs`;\n return updatedConfig;\n }\n\n /**\n * Retrieves the dataset items from the Apify platform and applies the\n * datasetMappingFunction to each item to create an array of Document\n * instances.\n * @returns An array of Document instances.\n */\n async load(): Promise<Document<Metadata>[]> {\n const dataset = await this.apifyClient\n .dataset(this.datasetId)\n .listItems({ clean: true });\n\n const documentList = await Promise.all(\n dataset.items.map((item) =>\n this.caller.call(async () => this.datasetMappingFunction(item))\n )\n );\n\n return documentList.flat();\n }\n\n /**\n * Create an ApifyDatasetLoader by calling an Actor on the Apify platform and waiting for its results to be ready.\n * @param actorId The ID or name of the Actor on the Apify platform.\n * @param input The input object of the Actor that you're trying to run.\n * @param config Options specifying settings for the Actor run.\n * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.\n * @returns An instance of `ApifyDatasetLoader` with the results from the Actor run.\n */\n static async fromActorCall<Metadata extends Record<string, any>>(\n actorId: string,\n input: Record<string | number, unknown>,\n config: {\n callOptions?: ActorCallOptions;\n clientOptions?: ApifyClientOptions;\n datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;\n }\n ): Promise<ApifyDatasetLoader<Metadata>> {\n const apifyApiToken = ApifyDatasetLoader._getApifyApiToken(\n config.clientOptions\n );\n const apifyClient = ApifyDatasetLoader._getApifyClient(\n config.clientOptions\n );\n const actorCall = await apifyClient\n .actor(actorId)\n .call(input, config.callOptions ?? {});\n\n return new ApifyDatasetLoader(actorCall.defaultDatasetId, {\n datasetMappingFunction: config.datasetMappingFunction,\n clientOptions: { ...config.clientOptions, token: apifyApiToken },\n });\n }\n\n /**\n * Create an ApifyDatasetLoader by calling a saved Actor task on the Apify platform and waiting for its results to be ready.\n * @param taskId The ID or name of the task on the Apify platform.\n * @param input The input object of the task that you're trying to run. Overrides the task's saved input.\n * @param config Options specifying settings for the task run.\n * @param config.callOptions Options specifying settings for the task run.\n * @param config.clientOptions Options specifying settings for the Apify client.\n * @param config.datasetMappingFunction A function that takes a single object (an Apify dataset item) and converts it to an instance of the Document class.\n * @returns An instance of `ApifyDatasetLoader` with the results from the task's run.\n */\n static async fromActorTaskCall<Metadata extends Record<string, any>>(\n taskId: string,\n input: Record<string | number, unknown>,\n config: {\n callOptions?: TaskCallOptions;\n clientOptions?: ApifyClientOptions;\n datasetMappingFunction: ApifyDatasetMappingFunction<Metadata>;\n }\n ): Promise<ApifyDatasetLoader<Metadata>> {\n const apifyApiToken = ApifyDatasetLoader._getApifyApiToken(\n config.clientOptions\n );\n const apifyClient = ApifyDatasetLoader._getApifyClient(\n config.clientOptions\n );\n const taskCall = await apifyClient\n .task(taskId)\n .call(input, config.callOptions ?? {});\n\n return new ApifyDatasetLoader(taskCall.defaultDatasetId, {\n datasetMappingFunction: config.datasetMappingFunction,\n clientOptions: { ...config.clientOptions, token: apifyApiToken },\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAoEA,IAAa,qBAAb,MAAa,2BACHA,sCAAAA,mBAEV;CACE;CAEA;CAEA;CAEA;CAEA,YAAY,WAAmB,QAA4C;AACzE,SAAO;EACP,MAAM,EAAE,eAAe,wBAAwB,GAAG,sBAChD;AACF,OAAK,cAAc,mBAAmB,gBAAgB,cAAc;AACpE,OAAK,YAAY;AACjB,OAAK,yBAAyB;AAC9B,OAAK,SAAS,IAAIC,mCAAAA,YAAY,kBAAkB;;;;;;;;CASlD,OAAe,gBACb,eACa;EACb,MAAM,QAAQ,mBAAmB,kBAAkB,cAAc;AASjE,SAAO,IAAIC,aAAAA,YAAY;GAPrB,GAAG;GACH;GACA,qBAAqB,CACnB,GAAI,eAAe,uBAAuB,EAAE,EAC5C,mBAAmB,cACpB;GAE+C;GAAO,CAAC;;CAG5D,OAAe,kBAAkB,QAA6B;AAC5D,SAAO,QAAQ,UAAA,GAAA,0BAAA,wBAAgC,kBAAkB;;;;;;;CAQnE,OAAe,cAAc,QAAkB;EAC7C,MAAM,gBAAgB,EAAE,GAAG,QAAQ;AACnC,gBAAc,YAAY,EAAE;AAC5B,gBAAc,QAAQ,gBAAgB,GACpC,cAAc,QAAQ,iBAAiB,GACxC;AACD,SAAO;;;;;;;;CAST,MAAM,OAAsC;EAC1C,MAAM,UAAU,MAAM,KAAK,YACxB,QAAQ,KAAK,UAAU,CACvB,UAAU,EAAE,OAAO,MAAM,CAAC;AAQ7B,UANqB,MAAM,QAAQ,IACjC,QAAQ,MAAM,KAAK,SACjB,KAAK,OAAO,KAAK,YAAY,KAAK,uBAAuB,KAAK,CAAC,CAChE,CACF,EAEmB,MAAM;;;;;;;;;;CAW5B,aAAa,cACX,SACA,OACA,QAKuC;EACvC,MAAM,gBAAgB,mBAAmB,kBACvC,OAAO,cACR;AAQD,SAAO,IAAI,oBAJO,MAHE,mBAAmB,gBACrC,OAAO,cACR,CAEE,MAAM,QAAQ,CACd,KAAK,OAAO,OAAO,eAAe,EAAE,CAAC,EAEA,kBAAkB;GACxD,wBAAwB,OAAO;GAC/B,eAAe;IAAE,GAAG,OAAO;IAAe,OAAO;IAAe;GACjE,CAAC;;;;;;;;;;;;CAaJ,aAAa,kBACX,QACA,OACA,QAKuC;EACvC,MAAM,gBAAgB,mBAAmB,kBACvC,OAAO,cACR;AAQD,SAAO,IAAI,oBAJM,MAHG,mBAAmB,gBACrC,OAAO,cACR,CAEE,KAAK,OAAO,CACZ,KAAK,OAAO,OAAO,eAAe,EAAE,CAAC,EAED,kBAAkB;GACvD,wBAAwB,OAAO;GAC/B,eAAe;IAAE,GAAG,OAAO;IAAe,OAAO;IAAe;GACjE,CAAC"}