UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

306 lines (264 loc) 8.08 kB
import Parser from "rss-parser";
import { BaseDocumentLoader } from "./base";
import { Document, DocumentMetadata } from "../types";

/** Options accepted by {@link RSSLoader} and its subclasses. */
export interface RSSLoaderConfig {
  /**
   * Maximum number of feed items converted into documents (default `50`).
   * A falsy value (`0`, or an explicit `undefined`) disables the limit.
   */
  maxItems?: number;
  /**
   * When `true` (default) the full content fields are preferred over the
   * snippet; when `false` only the snippet/description is used.
   */
  includeContent?: boolean;
  /** Options forwarded to the underlying `rss-parser` instance. */
  requestOptions?: {
    timeout?: number;
    headers?: Record<string, string>;
  };
}

/** Subset of feed-item fields this module reads. */
export interface RSSItem {
  title?: string;
  link?: string;
  content?: string;
  contentSnippet?: string;
  author?: string;
  pubDate?: string;
  categories?: string[];
  guid?: string;
}

/** Subset of feed-level fields this module reads. */
export interface RSSFeed {
  title?: string;
  description?: string;
  link?: string;
  language?: string;
  lastBuildDate?: string;
  items: RSSItem[];
}

/**
 * Item shape as seen internally: the public {@link RSSItem} fields plus the
 * two custom fields the parser is configured to extract.
 */
type ParsedItem = RSSItem & {
  contentEncoded?: string;
  description?: string;
};

/** Feed shape as seen internally; `items` is treated as possibly absent. */
type ParsedFeed = Omit<RSSFeed, "items"> & { items?: ParsedItem[] };

/** Defaults applied when the caller does not override `requestOptions`. */
const DEFAULT_REQUEST_OPTIONS = {
  timeout: 10000,
  headers: {
    "User-Agent": "Mozilla/5.0 (compatible; RAG-System/1.0)",
  },
};

/** Entities decoded by {@link RSSLoader} when cleaning HTML content. */
const HTML_ENTITIES: Record<string, string> = {
  "&nbsp;": " ",
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&#39;": "'",
};

const HTML_ENTITY_PATTERN = /&(?:nbsp|amp|lt|gt|quot|#39);/g;

/**
 * Returns the first candidate that is a non-empty string, or `""` when there
 * is none. Non-string values are skipped so that a later `.trim()` cannot
 * throw on them.
 */
function firstNonEmptyString(...candidates: unknown[]): string {
  for (const candidate of candidates) {
    if (typeof candidate === "string" && candidate.length > 0) {
      return candidate;
    }
  }
  return "";
}

/**
 * Document loader that turns RSS feed items into `Document` objects.
 *
 * Each item with non-blank content becomes one document whose text is the
 * item title followed by the tag-stripped content.
 */
export class RSSLoader extends BaseDocumentLoader {
  supportedExtensions: string[] = ["rss", "xml"];
  private parser: Parser;
  private config: RSSLoaderConfig;

  /**
   * @param config Optional overrides. `requestOptions` is merged with the
   *   defaults key by key, so overriding only `timeout` keeps the default
   *   headers (and vice versa).
   */
  constructor(config: RSSLoaderConfig = {}) {
    super();

    this.config = {
      maxItems: 50,
      includeContent: true,
      ...config,
      // Merge one level deep: a plain top-level spread would replace the
      // whole default object and silently drop the default User-Agent.
      requestOptions: {
        ...DEFAULT_REQUEST_OPTIONS,
        ...config.requestOptions,
      },
    };

    this.parser = new Parser({
      timeout: this.config.requestOptions?.timeout,
      headers: this.config.requestOptions?.headers,
      customFields: {
        item: [
          ["content:encoded", "contentEncoded"],
          ["description", "description"],
        ],
      },
    });
  }

  /**
   * Parses `content` as an RSS feed and returns only the FIRST resulting
   * document; use {@link loadFromString} to obtain every item.
   *
   * @param filePath Used as the source URL recorded on the document.
   * @param content Raw feed bytes, decoded as UTF-8.
   * @throws Error when the feed yields no documents or cannot be parsed.
   */
  async load(filePath: string, content: Buffer): Promise<Document> {
    const documents = await this.loadFromString(
      content.toString("utf-8"),
      filePath
    );

    if (documents.length === 0) {
      throw new Error(`No documents found in RSS feed: ${filePath}`);
    }

    return documents[0];
  }

  /**
   * Fetches and parses the feed at `url`.
   *
   * @throws Error wrapping any fetch, parse or processing failure.
   */
  async loadFromURL(url: string): Promise<Document[]> {
    try {
      const feed = await this.parser.parseURL(url);
      return this.processFeed(feed, url);
    } catch (error) {
      throw new Error(`Failed to load RSS feed from ${url}: ${error}`);
    }
  }

  /**
   * Parses an in-memory RSS document.
   *
   * @param rssContent Feed XML.
   * @param sourceUrl Recorded as the feed URL and used as the document source
   *   for items that have no link.
   * @throws Error wrapping any parse or processing failure.
   */
  async loadFromString(
    rssContent: string,
    sourceUrl: string
  ): Promise<Document[]> {
    try {
      const feed = await this.parser.parseString(rssContent);
      return this.processFeed(feed, sourceUrl);
    } catch (error) {
      throw new Error(`Failed to parse RSS content: ${error}`);
    }
  }

  /** Converts up to `maxItems` feed items into documents, skipping empty ones. */
  private processFeed(feed: ParsedFeed, sourceUrl: string): Document[] {
    const allItems = feed.items ?? [];
    const limit = this.config.maxItems;
    // A falsy limit (0 / undefined) means "no limit".
    const items = limit ? allItems.slice(0, limit) : allItems;

    const documents: Document[] = [];
    for (const item of items) {
      const document = this.createDocumentFromItem(item, feed, sourceUrl);
      if (document) {
        documents.push(document);
      }
    }
    return documents;
  }

  /**
   * Builds a document from a single feed item.
   *
   * @returns The document, or `null` when the item has no non-blank content.
   */
  private createDocumentFromItem(
    item: ParsedItem,
    feed: ParsedFeed,
    sourceUrl: string
  ): Document | null {
    const title = item.title || "Untitled";
    const link = item.link || "";

    // Pick the richest available field; snippet-only when content is disabled.
    const rawContent = this.config.includeContent
      ? firstNonEmptyString(
          item.contentEncoded,
          item.content,
          item.description,
          item.contentSnippet
        )
      : firstNonEmptyString(item.contentSnippet, item.description);

    if (!rawContent.trim()) {
      return null; // Skip items without content
    }

    const cleanContent = this.cleanHtmlContent(rawContent);
    const fullContent = `${title}\n\n${cleanContent}`;

    // Separate copies so mutating `tags` never affects `categories`.
    const categories = Array.isArray(item.categories) ? item.categories : [];

    const metadata: DocumentMetadata = {
      title,
      author: item.author || feed.title || "Unknown",
      createdAt: this.parsePublicationDate(item.pubDate),
      updatedAt: new Date(),
      fileType: "rss",
      // Character count of the assembled text (not a byte count).
      fileSize: fullContent.length,
      language: this.detectLanguage(fullContent),
      tags: [...categories],
      categories: [...categories],
      description: firstNonEmptyString(item.contentSnippet, item.description),
    };

    // RSS-specific metadata is attached as an extra property without
    // resorting to an `any` cast.
    Object.assign(metadata, {
      rssSource: {
        feedUrl: sourceUrl,
        feedTitle: feed.title,
        itemLink: link,
        pubDate: item.pubDate,
        guid: item.guid,
      },
    });

    return {
      id: this.generateDocumentId(link || item.guid || title),
      content: fullContent,
      metadata,
      source: link || sourceUrl,
    };
  }

  /**
   * Parses an item's publication date, falling back to the current time when
   * the value is missing or not a parseable date (so `createdAt` is never an
   * `Invalid Date`).
   */
  private parsePublicationDate(pubDate: string | undefined): Date {
    if (pubDate) {
      const parsed = new Date(pubDate);
      if (!Number.isNaN(parsed.getTime())) {
        return parsed;
      }
    }
    return new Date();
  }

  /**
   * Strips HTML tags, decodes a small set of entities and collapses runs of
   * whitespace to single spaces.
   *
   * Entities are decoded in a single pass so that escaped entities such as
   * `&amp;lt;` become `&lt;` rather than being decoded twice into `<`.
   */
  private cleanHtmlContent(html: string): string {
    return html
      .replace(/<[^>]*>/g, "") // Remove HTML tags
      .replace(HTML_ENTITY_PATTERN, (entity) => HTML_ENTITIES[entity] ?? entity)
      .replace(/\s+/g, " ") // Normalize whitespace
      .trim();
  }

  /** Returns `"ko"` when the text contains any Hangul character, else `"en"`. */
  private detectLanguage(content: string): string {
    const koreanPattern = /[ㄱ-ㅎ가-힣]/;
    return koreanPattern.test(content) ? "ko" : "en";
  }

  /**
   * Builds an ID of the form `rss_<hash>_<timestamp>`.
   *
   * The hash part is a deterministic 32-bit hash of `identifier`; the
   * timestamp part is `Date.now()` at call time, so the full ID differs
   * between calls made in different milliseconds.
   *
   * NOTE(review): the original comment called this ID "stable", which the
   * timestamp suffix contradicts. The format is kept because callers may
   * depend on it — confirm whether the suffix should be dropped.
   */
  protected generateDocumentId(identifier: string): string {
    const timestamp = Date.now();
    const hash = identifier.split("").reduce((a, b) => {
      a = (a << 5) - a + b.charCodeAt(0);
      return a & a; // Clamp to a 32-bit integer
    }, 0);
    return `rss_${Math.abs(hash)}_${timestamp}`;
  }
}

/** Specialized loader for Naver Blog RSS feeds, addressed by blog ID. */
export class NaverBlogRSSLoader extends RSSLoader {
  private blogId: string;

  constructor(blogId: string, config: RSSLoaderConfig = {}) {
    super(config);
    this.blogId = blogId;
  }

  /** Loads every item of this blog's RSS feed. */
  async loadBlog(): Promise<Document[]> {
    return this.loadFromURL(this.getRSSUrl());
  }

  getBlogId(): string {
    return this.blogId;
  }

  /** Returns the RSS endpoint URL for this blog. */
  getRSSUrl(): string {
    return `https://rss.blog.naver.com/${this.blogId}`;
  }

  /**
   * Creates a loader from a blog URL (or a bare blog ID).
   *
   * @throws Error when no blog ID can be extracted from `url`.
   */
  static createFromUrl(
    url: string,
    config: RSSLoaderConfig = {}
  ): NaverBlogRSSLoader {
    const blogId = this.extractBlogId(url);
    if (!blogId) {
      throw new Error(`Cannot extract blog ID from URL: ${url}`);
    }
    return new NaverBlogRSSLoader(blogId, config);
  }

  /**
   * Extracts the blog ID from the supported URL formats.
   *
   * @returns The blog ID, or `null` when `url` matches no known format and is
   *   not a bare ID (a string without `/` or `.`).
   */
  static extractBlogId(url: string): string | null {
    const patterns = [
      // Query-string form, e.g. .../PostView.naver?blogId=<id>&logNo=...
      // Checked first: otherwise the path pattern below would capture the
      // page name ("PostView.naver") instead of the ID.
      /[?&]blogId=([^&#]+)/,
      /rss\.blog\.naver\.com\/([^\/?#]+)/, // RSS URL
      /blog\.naver\.com\/([^\/?#]+)/, // Blog URL
      /([^\/\?]+)\.blog\.me/, // blog.me URL
    ];

    for (const pattern of patterns) {
      const match = url.match(pattern);
      if (match && match[1]) {
        return match[1];
      }
    }

    // If no pattern matches, assume the input itself is the blog ID.
    const cleanUrl = url.replace(/^https?:\/\//, "").replace(/\/$/, "");
    if (cleanUrl && !cleanUrl.includes("/") && !cleanUrl.includes(".")) {
      return cleanUrl;
    }

    return null;
  }
}

/** Registry of named RSS feeds that can be loaded together. */
export class RSSFeedManager {
  private feedSources: Map<string, RSSLoader> = new Map();
  private feedUrls: Map<string, string> = new Map();

  /**
   * Registers (or replaces) a feed.
   *
   * @param name Key under which the feed's results are reported.
   * @param loader Loader used to fetch the feed.
   * @param url Feed URL. Required for plain {@link RSSLoader} instances;
   *   optional for {@link NaverBlogRSSLoader}, which knows its own URL. When
   *   given, it takes precedence over the loader's own URL.
   */
  addFeed(name: string, loader: RSSLoader, url?: string): void {
    this.feedSources.set(name, loader);
    if (url) {
      this.feedUrls.set(name, url);
    } else {
      // Re-registering without a URL must not keep a stale one around.
      this.feedUrls.delete(name);
    }
  }

  /** Unregisters a feed; returns whether it was registered. */
  removeFeed(name: string): boolean {
    this.feedUrls.delete(name);
    return this.feedSources.delete(name);
  }

  /**
   * Loads all registered feeds in parallel.
   *
   * A feed that fails to load is logged with `console.error` and reported
   * with an empty document list; it never rejects the whole call.
   *
   * @returns Documents per feed name, in registration order.
   */
  async loadAllFeeds(): Promise<Map<string, Document[]>> {
    const entries = await Promise.all(
      Array.from(this.feedSources.entries()).map(
        async ([name, loader]): Promise<[string, Document[]]> => {
          try {
            return [name, await this.loadFeed(name, loader)];
          } catch (error) {
            console.error(`Failed to load feed ${name}:`, error);
            return [name, []];
          }
        }
      )
    );

    return new Map(entries);
  }

  getFeedNames(): string[] {
    return Array.from(this.feedSources.keys());
  }

  getFeed(name: string): RSSLoader | undefined {
    return this.feedSources.get(name);
  }

  /** Loads one feed from its registered URL, or from the loader's own URL. */
  private async loadFeed(name: string, loader: RSSLoader): Promise<Document[]> {
    const url = this.feedUrls.get(name);
    if (url) {
      return loader.loadFromURL(url);
    }
    if (loader instanceof NaverBlogRSSLoader) {
      return loader.loadBlog();
    }
    throw new Error(
      `No URL configured for feed "${name}"; pass one to addFeed()`
    );
  }
}