UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

230 lines (227 loc) 7.92 kB
import Parser from '../../_virtual/index3.js';
import { BaseDocumentLoader } from './base.js';

/**
 * Loads RSS/XML feeds and converts each feed item into a plain-text document
 * with metadata.
 *
 * NOTE(review): `Parser` comes from a bundler-generated virtual module; its
 * usage here (`parseURL`, `parseString`, `customFields`) looks like the
 * `rss-parser` package -- confirm against the bundle.
 */
class RSSLoader extends BaseDocumentLoader {
    /**
     * @param {object} [config] - Loader options.
     * @param {number} [config.maxItems=50] - Maximum number of feed items to
     *   convert; a falsy value (e.g. 0) disables the limit.
     * @param {boolean} [config.includeContent=true] - When true, prefer the
     *   full item body over the short snippet.
     * @param {object} [config.requestOptions] - `timeout` (ms) and `headers`
     *   forwarded to the parser. Merged over the defaults, so overriding only
     *   `timeout` keeps the default `User-Agent` header.
     */
    constructor(config = {}) {
        super();
        this.supportedExtensions = ["rss", "xml"];
        // Object spread tolerates null, but property access does not.
        const overrides = config || {};
        this.config = {
            maxItems: 50,
            includeContent: true,
            ...overrides,
            // Merge one level deep: a plain spread of `overrides` would replace
            // the whole requestOptions object and drop the defaults.
            requestOptions: {
                timeout: 10000,
                headers: {
                    "User-Agent": "Mozilla/5.0 (compatible; RAG-System/1.0)",
                },
                ...overrides.requestOptions,
            },
        };
        const { timeout, headers } = this.config.requestOptions;
        this.parser = new Parser({
            timeout,
            headers,
            customFields: {
                item: [
                    ["content:encoded", "contentEncoded"],
                    ["description", "description"],
                ],
            },
        });
    }

    /**
     * Parses RSS file content and returns the first document of the feed.
     *
     * @param {string} filePath - Path or URL recorded as the feed source.
     * @param {{ toString(encoding: string): string }} content - Raw file
     *   content; decoded with `toString("utf-8")`.
     * @returns {Promise<object>} The first document produced from the feed.
     * @throws {Error} If parsing fails or no item yields a document.
     */
    async load(filePath, content) {
        // For RSS files, we load the first document from the feed
        const documents = await this.loadFromString(content.toString("utf-8"), filePath);
        if (documents.length === 0) {
            throw new Error(`No documents found in RSS feed: ${filePath}`);
        }
        return documents[0];
    }

    /**
     * Fetches and parses a feed from a URL.
     *
     * @param {string} url - Feed URL.
     * @returns {Promise<object[]>} Documents built from the feed items.
     * @throws {Error} Wrapping any fetch/parse failure (original in `cause`).
     */
    async loadFromURL(url) {
        try {
            const feed = await this.parser.parseURL(url);
            return this.processFeed(feed, url);
        }
        catch (error) {
            // Keep the original error reachable for debugging; runtimes without
            // Error-cause support simply ignore the second argument.
            throw new Error(`Failed to load RSS feed from ${url}: ${error}`, { cause: error });
        }
    }

    /**
     * Parses a feed from an XML string.
     *
     * @param {string} rssContent - Feed XML.
     * @param {string} [sourceUrl] - Recorded as the feed source on documents.
     * @returns {Promise<object[]>} Documents built from the feed items.
     * @throws {Error} Wrapping any parse failure (original in `cause`).
     */
    async loadFromString(rssContent, sourceUrl) {
        try {
            const feed = await this.parser.parseString(rssContent);
            return this.processFeed(feed, sourceUrl);
        }
        catch (error) {
            throw new Error(`Failed to parse RSS content: ${error}`, { cause: error });
        }
    }

    /**
     * Converts a parsed feed into documents, honoring `config.maxItems` and
     * skipping items that produce no document.
     *
     * @param {object} feed - Parsed feed with an `items` array.
     * @param {string} [sourceUrl] - Feed source recorded on each document.
     * @returns {object[]} Documents for the items that had content.
     */
    processFeed(feed, sourceUrl) {
        // A feed without an items array is treated as empty instead of crashing.
        const allItems = Array.isArray(feed.items) ? feed.items : [];
        const items = this.config.maxItems
            ? allItems.slice(0, this.config.maxItems)
            : allItems;
        const documents = [];
        for (const item of items) {
            const document = this.createDocumentFromItem(item, feed, sourceUrl);
            if (document) {
                documents.push(document);
            }
        }
        return documents;
    }

    /**
     * Builds one document from a feed item.
     *
     * @param {object} item - Parsed feed item.
     * @param {object} feed - Parsed feed the item belongs to.
     * @param {string} [sourceUrl] - Feed source URL.
     * @returns {object|null} `{ id, content, metadata, source }`, or `null`
     *   when the item has no usable text content.
     */
    createDocumentFromItem(item, feed, sourceUrl) {
        const title = item.title || "Untitled";
        const link = item.link || "";
        // Extract content from various possible fields
        let content = "";
        if (this.config.includeContent) {
            content =
                item.contentEncoded ||
                    item.content ||
                    item.description ||
                    item.contentSnippet ||
                    "";
        }
        else {
            content = item.contentSnippet || item.description || "";
        }
        // Skip items without content. Non-string values (e.g. an element the
        // parser returned as an object) cannot be cleaned and are skipped too.
        if (typeof content !== "string" || !content.trim()) {
            return null;
        }
        // Clean HTML tags from content if present
        const cleanContent = this.cleanHtmlContent(content);
        const fullContent = `${title}\n\n${cleanContent}`;
        // An unparseable pubDate would otherwise yield an Invalid Date, which
        // throws on toISOString(); fall back to the current time instead.
        const publishedAt = item.pubDate ? new Date(item.pubDate) : null;
        const createdAt = publishedAt && !Number.isNaN(publishedAt.getTime())
            ? publishedAt
            : new Date();
        const metadata = {
            title,
            author: item.author || feed.title || "Unknown",
            createdAt,
            updatedAt: new Date(),
            fileType: "rss",
            // Length in UTF-16 code units, not bytes.
            fileSize: fullContent.length,
            language: this.detectLanguage(fullContent),
            tags: item.categories || [],
            categories: item.categories || [],
            description: item.contentSnippet || item.description || "",
        };
        // Add RSS-specific metadata
        metadata.rssSource = {
            feedUrl: sourceUrl,
            feedTitle: feed.title,
            itemLink: link,
            pubDate: item.pubDate,
            guid: item.guid,
        };
        return {
            id: this.generateDocumentId(link || item.guid || title),
            content: fullContent,
            metadata,
            source: link || sourceUrl,
        };
    }

    /**
     * Strips HTML tags, decodes a small set of entities and collapses
     * whitespace.
     *
     * @param {string} html - Raw item content.
     * @returns {string} Plain text.
     */
    cleanHtmlContent(html) {
        return html
            .replace(/<[^>]*>/g, "") // Remove HTML tags
            .replace(/&nbsp;/g, " ")
            .replace(/&lt;/g, "<")
            .replace(/&gt;/g, ">")
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            // Decode &amp; last so that an escaped entity such as "&amp;lt;"
            // becomes the literal text "&lt;" instead of being decoded twice.
            .replace(/&amp;/g, "&")
            .replace(/\s+/g, " ") // Normalize whitespace
            .trim();
    }

    /**
     * Simple language detection - Korean vs English.
     *
     * @param {string} content - Text to inspect.
     * @returns {"ko"|"en"} "ko" if any Hangul jamo/syllable is present.
     */
    detectLanguage(content) {
        const koreanPattern = /[ㄱ-ㅎ가-힣]/;
        return koreanPattern.test(content) ? "ko" : "en";
    }

    /**
     * Creates a document ID of the form `rss_<hash>_<timestamp>`.
     *
     * The hash is a 32-bit string hash of the identifier. Because
     * `Date.now()` is appended, the ID is NOT stable across calls.
     * NOTE(review): confirm whether callers rely on that before using the ID
     * for de-duplication.
     *
     * @param {string} identifier - Item link, guid or title.
     * @returns {string} Generated ID.
     */
    generateDocumentId(identifier) {
        const timestamp = Date.now();
        const hash = identifier.split("").reduce((a, b) => {
            a = (a << 5) - a + b.charCodeAt(0);
            return a & a; // Truncate to a 32-bit integer
        }, 0);
        return `rss_${Math.abs(hash)}_${timestamp}`;
    }
}

/**
 * Specialized loader for Naver Blog RSS.
 */
class NaverBlogRSSLoader extends RSSLoader {
    /**
     * @param {string} blogId - Naver blog ID.
     * @param {object} [config] - Same options as `RSSLoader`.
     */
    constructor(blogId, config = {}) {
        super(config);
        this.blogId = blogId;
    }

    /**
     * Loads the blog's RSS feed.
     *
     * @returns {Promise<object[]>} Documents built from the blog feed.
     */
    async loadBlog() {
        // Single source of truth for the feed URL.
        return this.loadFromURL(this.getRSSUrl());
    }

    /** @returns {string} The blog ID this loader was created with. */
    getBlogId() {
        return this.blogId;
    }

    /** @returns {string} The RSS feed URL for the blog. */
    getRSSUrl() {
        return `https://rss.blog.naver.com/${this.blogId}`;
    }

    /**
     * Creates a loader from a blog URL or a bare blog ID.
     *
     * @param {string} url - Blog URL, RSS URL, blog.me URL or bare ID.
     * @param {object} [config] - Same options as `RSSLoader`.
     * @returns {NaverBlogRSSLoader} Loader for the extracted blog ID.
     * @throws {Error} If no blog ID can be extracted.
     */
    static createFromUrl(url, config = {}) {
        // Extract blog ID from various URL formats
        const blogId = this.extractBlogId(url);
        if (!blogId) {
            throw new Error(`Cannot extract blog ID from URL: ${url}`);
        }
        return new NaverBlogRSSLoader(blogId, config);
    }

    /**
     * Extracts the blog ID from a URL.
     *
     * @param {string} url - Blog URL, RSS URL, blog.me URL or bare ID.
     * @returns {string|null} The blog ID, or `null` if none is recognised.
     */
    static extractBlogId(url) {
        // Handle various Naver blog URL formats
        const patterns = [
            /rss\.blog\.naver\.com\/([^\/\?]+)/, // RSS URL
            /blog\.naver\.com\/([^\/\?]+)/, // Blog URL
            /([^\/\?]+)\.blog\.me/, // blog.me URL
        ];
        for (const pattern of patterns) {
            const match = url.match(pattern);
            if (match && match[1]) {
                return match[1];
            }
        }
        // If no pattern matches, assume the URL itself is the blog ID
        const cleanUrl = url.replace(/^https?:\/\//, "").replace(/\/$/, "");
        if (cleanUrl && !cleanUrl.includes("/") && !cleanUrl.includes(".")) {
            return cleanUrl;
        }
        return null;
    }
}

/**
 * Generic RSS feed manager for multiple named sources.
 */
class RSSFeedManager {
    constructor() {
        this.feedSources = new Map();
        // Feed URL per name, used for loaders that do not know their own URL.
        this.feedUrls = new Map();
    }

    /**
     * Registers a loader under a name.
     *
     * @param {string} name - Feed name.
     * @param {RSSLoader} loader - Loader used to fetch the feed.
     * @param {string} [url] - Feed URL passed to `loader.loadFromURL` by
     *   `loadAllFeeds`. Not needed for `NaverBlogRSSLoader`, which derives its
     *   own URL.
     */
    addFeed(name, loader, url) {
        this.feedSources.set(name, loader);
        if (url) {
            this.feedUrls.set(name, url);
        }
        else {
            // Re-registering a name without a URL must not keep a stale one.
            this.feedUrls.delete(name);
        }
    }

    /**
     * Unregisters a feed.
     *
     * @param {string} name - Feed name.
     * @returns {boolean} True if a feed with that name existed.
     */
    removeFeed(name) {
        this.feedUrls.delete(name);
        return this.feedSources.delete(name);
    }

    /**
     * Loads every registered feed in parallel. A feed that fails is logged
     * and mapped to an empty array, so one failure does not affect the others.
     *
     * @returns {Promise<Map<string, object[]>>} Documents per feed name.
     */
    async loadAllFeeds() {
        const results = new Map();
        const promises = Array.from(this.feedSources.entries()).map(async ([name, loader]) => {
            try {
                // Loaders registered without a URL still receive "" so that
                // existing callers see the same behaviour as before.
                const documents = loader instanceof NaverBlogRSSLoader
                    ? await loader.loadBlog()
                    : await loader.loadFromURL(this.feedUrls.get(name) || "");
                results.set(name, documents);
            }
            catch (error) {
                console.error(`Failed to load feed ${name}:`, error);
                results.set(name, []);
            }
        });
        await Promise.all(promises);
        return results;
    }

    /** @returns {string[]} Names of all registered feeds. */
    getFeedNames() {
        return Array.from(this.feedSources.keys());
    }

    /**
     * @param {string} name - Feed name.
     * @returns {RSSLoader|undefined} The registered loader, if any.
     */
    getFeed(name) {
        return this.feedSources.get(name);
    }
}

export { NaverBlogRSSLoader, RSSFeedManager, RSSLoader };
//# sourceMappingURL=rss.js.map