@promptbook/azure-openai
Version:
Promptbook: Run AI apps in plain human language across multiple models and platforms
51 lines (50 loc) • 2.39 kB
TypeScript
import type { KnowledgePiecePreparedJson } from '../../pipeline/PipelineJson/KnowledgePieceJson';
import type { string_markdown } from '../../types/typeAliases';
import type { Converter } from '../_common/Converter';
import type { Scraper } from '../_common/Scraper';
import type { ScraperSourceHandler } from '../_common/Scraper';
import type { ExecutionTools } from '../../execution/ExecutionTools';
import type { PrepareAndScrapeOptions } from '../../prepare/PrepareAndScrapeOptions';
import type { ScraperAndConverterMetadata } from '../_common/register/ScraperAndConverterMetadata';
import type { ScraperIntermediateSource } from '../_common/ScraperIntermediateSource';
/**
* Scraper for websites
*
* @see `documentationUrl` for more details
* @public exported from `@promptbook/website-crawler`
*/
export declare class WebsiteScraper implements Converter, Scraper {
/**
 * Execution tools injected at construction — only `fs` and `llm` are required (see constructor)
 */
private readonly tools;
/**
 * Options controlling preparation and scraping (see `PrepareAndScrapeOptions`)
 */
private readonly options;
/**
 * Metadata of the scraper which includes title, mime types, etc.
 */
get metadata(): ScraperAndConverterMetadata;
/**
 * Markdown scraper is used internally
 */
private readonly markdownScraper;
/**
 * Showdown converter is used internally
 */
private readonly showdownConverter;
/**
 * Creates the scraper
 *
 * @param tools execution tools providing filesystem (`fs`) and LLM (`llm`) access
 * @param options preparation and scraping options
 */
constructor(tools: Pick<ExecutionTools, 'fs' | 'llm'>, options: PrepareAndScrapeOptions);
/**
 * Convert the website to `.md` file and returns intermediate source
 *
 * Note: `$` is used to indicate that this function is not a pure function - it leaves files on the disk and you are responsible for cleaning them by calling `destroy` method of returned object
 *
 * @param source handle to the website source to convert
 * @returns intermediate source on disk together with the converted markdown content
 */
$convert(source: ScraperSourceHandler): Promise<ScraperIntermediateSource & {
markdown: string_markdown;
}>;
/**
 * Scrapes the website and returns the knowledge pieces or `null` if it can't scrape it
 *
 * @param source handle to the website source to scrape
 * @returns knowledge pieces without `sources`/`preparationIds` (filled in later by the caller), or `null` when this scraper cannot handle the source
 */
scrape(source: ScraperSourceHandler): Promise<ReadonlyArray<Omit<KnowledgePiecePreparedJson, 'sources' | 'preparationIds'>> | null>;
}
/**
* TODO: [👣] Scraped website in .md can act as cache item - there is no need to run conversion each time
* TODO: [🪂] Do it in parallel 11:11
* Note: No need to aggregate usage here, it is done by intercepting the llmTools
* Note: [🟢] Code in this file should never be released in packages that could be imported into browser environment
*/