@agixbt/elizascraper

Version:

Awesome scraper for eliza: scrapes docs, tweets, and tokens.

36 lines (35 loc) • 1.41 kB

TypeScript

/**
 * scraper.ts
 *
 * Exports (as listed by the original header): fetchTweetsByTimestamp, fetchPage,
 * extractLinks, scrapeBerachainDocs.
 *
 * NOTE(review): none of those four names are declared in this file; the only
 * declarations visible here are SubSection, DocSection and siteScraper. The
 * header may be stale — confirm against the implementation.
 *
 * fetchTweetsByTimestamp — exposed as an external API and also used in the cron
 * jobs, whose results are converted to embeddings and stored in the database.
 *  - Fetches recent tweets by recent timestamp and by tags / authors.
 *  - Handles: (we can take a list from De) basically we want folks who tweet
 *    reliable info and know what they're talking about wrt berachain.
 *  - searchTerms: "berachain launch", "token wen?" (basically we'll have context
 *    passed in, in case a user asks something and the vectorDB does not have a
 *    contextual enough answer).
 *  - Start / End date: we can have a cron job that runs every 2 hours and
 *    fetches tweets from the last 2 hours (although the first time it gets all
 *    the best tweets with the most engagement from the past ~24 hours).
 *  - Engagement (minimumReplies, minimumRetweets): to filter out spam.
 *  - Returns Tweet[].
 */

/** A titled piece of content nested inside a documentation section. */
export interface SubSection {
  /** Title of the subsection. */
  title: string;
  /** Text content of the subsection. */
  content: string;
}

/**
 * A documentation section together with its subsections.
 *
 * NOTE(review): this interface names its link field `url`, whereas the section
 * objects in siteScraper's return type use `source_url`, and no declaration
 * visible in this file references DocSection — confirm which shape is intended.
 */
export interface DocSection {
  /** Topic of the section. */
  topic: string;
  /** URL associated with the section. */
  url: string;
  /** Overview text for the section. */
  overview: string;
  /** Subsections belonging to this section. */
  subsections: SubSection[];
}

/**
 * Declaration of the site scraper entry point.
 *
 * @param baseUrl - presumably the root URL of the site to scrape; confirm
 *   against the implementation, which is not part of this declaration file.
 * @returns A promise resolving to an object carrying a title, a `last_updated`
 *   string, a `total_sections` number, and the list of sections.
 */
export declare function siteScraper(baseUrl: string): Promise<{
  title: string;
  last_updated: string;
  total_sections: number;
  sections: {
    topic: string;
    source_url: string;
    overview: string;
    subsections: SubSection[];
  }[];
}>;