@agixbt/elizascraper
Version:
Awesome Scraper for eliza, scrape docs, tweets, and tokens
36 lines (35 loc) • 1.41 kB
TypeScript
/**
* scraper.ts
*
* exports - fetchTweetsByTimestamp, fetchPage, extractLinks, scrapeBerachainDocs
*
* fetchTweetsByTimestamp - exposed as an external API and also used in the cron jobs, where the fetched tweets are converted to embeddings and stored in the database.
* - fetches recent tweets by recent timestamp and tags / authors
* - handles (we can take a list from De) basically we want folks who tweet reliable info and know what they're talking about wrt berachain
* - searchTerms - berachain launch, token wen? (basically we'll have context passed in, in case of a user asking something, and the vectorDB not having a contextual enough answer)
* - Start / End date - we can have a cron job that runs every 2 hours and fetches tweets from the last 2 hours (although the first time it gets all the best tweets with the most engagement from the past ~24 hours)
* - Engagement(minimumReplies, minimumRetweets) - to filter out spam
* returns Tweet[]
*
*/
/**
* A titled block of text content.
*
* Used as the element type of the `subsections` arrays declared in this file.
*/
export interface SubSection {
/** Title of the subsection. */
title: string;
/** Text content of the subsection (format not specified by this declaration). */
content: string;
}
/**
* Shape of one documentation section: a topic, its URL, an overview, and a list of subsections.
*
* NOTE(review): the return type of `siteScraper` declares its section objects inline
* with a `source_url` field instead of `url`, so it does not reference this interface.
* Confirm whether the two shapes are meant to differ.
*/
export interface DocSection {
/** Topic name of the section. */
topic: string;
/** URL associated with the section (presumably the page it came from; confirm against the implementation). */
url: string;
/** Overview text for the section. */
overview: string;
/** Subsections belonging to this section. */
subsections: SubSection[];
}
/**
* Type declaration for `siteScraper`; the implementation is not part of this file.
*
* Presumably scrapes the site rooted at `baseUrl` and returns its content grouped
* into sections (TODO confirm against the implementation).
*
* @param baseUrl - URL string passed to the scraper.
* @returns A promise resolving to an object with a `title`, a `last_updated` string,
* a `total_sections` count, and a `sections` array.
*/
export declare function siteScraper(baseUrl: string): Promise<{
title: string;
// String timestamp; the exact format is not specified by this declaration.
last_updated: string;
// Presumably equal to `sections.length` (verify in the implementation).
total_sections: number;
// Inline section shape: uses `source_url`, where the DocSection interface uses `url`.
sections: {
topic: string;
source_url: string;
overview: string;
subsections: SubSection[];
}[];
}>;