@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
306 lines (264 loc) • 8.08 kB
text/typescript
import Parser from "rss-parser";
import { BaseDocumentLoader } from "./base";
import { Document, DocumentMetadata } from "../types";
/** Options accepted by the `RSSLoader` constructor. Every field is optional. */
export interface RSSLoaderConfig {
  /** Upper bound on the number of feed items converted into documents. */
  maxItems?: number;
  /**
   * When true, the loader prefers an item's full body over its short snippet
   * when choosing the document content.
   */
  includeContent?: boolean;
  /** Settings forwarded to the underlying `rss-parser` instance. */
  requestOptions?: {
    /** Timeout handed to the parser (presumably milliseconds — confirm against rss-parser). */
    timeout?: number;
    /** HTTP headers handed to the parser. */
    headers?: Record<string, string>;
  };
}
/**
 * Shape of a single feed entry. Every field is optional; the field names
 * match item properties that the loader in this file reads when it builds
 * documents.
 */
export interface RSSItem {
  title?: string;
  link?: string;
  /** Item body as delivered by the feed. */
  content?: string;
  /** Short text form of the body. */
  contentSnippet?: string;
  author?: string;
  /** Publication date kept as a string (the loader parses it with `new Date`). */
  pubDate?: string;
  categories?: string[];
  /** Feed-supplied identifier of the entry. */
  guid?: string;
}
/** Feed-level fields together with the entries the feed contains. */
export interface RSSFeed {
  title?: string;
  description?: string;
  link?: string;
  language?: string;
  /** Kept as the raw string supplied by the feed. */
  lastBuildDate?: string;
  /** Entries of the feed; the only required field. */
  items: RSSItem[];
}
/**
 * Document loader that parses RSS feeds with `rss-parser` and converts each
 * feed item that has textual content into a `Document`.
 *
 * Feeds can be supplied as a file buffer ({@link RSSLoader.load}), as a URL
 * ({@link RSSLoader.loadFromURL}) or as an XML string
 * ({@link RSSLoader.loadFromString}).
 */
export class RSSLoader extends BaseDocumentLoader {
  supportedExtensions: string[] = ["rss", "xml"];

  /** Named character references decoded by `cleanHtmlContent`. */
  private static readonly HTML_ENTITIES: Record<string, string> = {
    nbsp: " ",
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    "#39": "'",
  };

  private parser: Parser;
  private config: RSSLoaderConfig;

  /**
   * @param config Optional overrides. Defaults: `maxItems` 50,
   *   `includeContent` true, `requestOptions.timeout` 10000 and a single
   *   `User-Agent` header. `requestOptions` is merged field by field, so
   *   supplying only `timeout` keeps the default headers and vice versa; a
   *   supplied `headers` object replaces the default headers as a whole.
   */
  constructor(config: RSSLoaderConfig = {}) {
    super();

    const requested = config.requestOptions;
    this.config = {
      maxItems: 50,
      includeContent: true,
      ...config,
      // Written after the spread so a partial `requestOptions` from the
      // caller cannot wipe out the defaults it did not mention.
      requestOptions: {
        timeout: requested?.timeout ?? 10000,
        headers: requested?.headers ?? {
          "User-Agent": "Mozilla/5.0 (compatible; RAG-System/1.0)",
        },
      },
    };

    this.parser = new Parser({
      timeout: this.config.requestOptions?.timeout,
      headers: this.config.requestOptions?.headers,
      customFields: {
        item: [
          ["content:encoded", "contentEncoded"],
          ["description", "description"],
        ],
      },
    });
  }

  /**
   * Parses a feed held in memory and returns the document built from its
   * first usable item.
   *
   * @param filePath Path of the feed file; used as the source URL and in error messages.
   * @param content Raw feed bytes, decoded as UTF-8.
   * @returns The first document produced from the feed.
   * @throws Error when parsing fails or when no item yields a document.
   */
  async load(filePath: string, content: Buffer): Promise<Document> {
    const documents = await this.loadFromString(
      content.toString("utf-8"),
      filePath
    );
    const [first] = documents;
    if (!first) {
      throw new Error(`No documents found in RSS feed: ${filePath}`);
    }
    return first;
  }

  /**
   * Fetches and parses the feed at `url`.
   *
   * @param url Address of the feed.
   * @returns One document per feed item that has content, up to `maxItems`.
   * @throws Error wrapping any fetch or parse failure.
   */
  async loadFromURL(url: string): Promise<Document[]> {
    try {
      const feed = await this.parser.parseURL(url);
      return this.processFeed(feed, url);
    } catch (error) {
      throw new Error(`Failed to load RSS feed from ${url}: ${error}`);
    }
  }

  /**
   * Parses feed XML that is already available as a string.
   *
   * @param rssContent Feed XML.
   * @param sourceUrl Value recorded as the feed URL of every produced document.
   * @returns One document per feed item that has content, up to `maxItems`.
   * @throws Error wrapping any parse failure.
   */
  async loadFromString(
    rssContent: string,
    sourceUrl: string
  ): Promise<Document[]> {
    try {
      const feed = await this.parser.parseString(rssContent);
      return this.processFeed(feed, sourceUrl);
    } catch (error) {
      throw new Error(`Failed to parse RSS content: ${error}`);
    }
  }

  /**
   * Converts the items of a parsed feed into documents, skipping items that
   * have no usable content.
   *
   * A feed without an `items` array yields an empty list. `maxItems` only
   * limits the result when it is a positive number; otherwise every item is
   * processed.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- parser output is consumed untyped
  private processFeed(feed: any, sourceUrl: string): Document[] {
    const allItems: unknown[] = Array.isArray(feed?.items) ? feed.items : [];
    const limit = this.config.maxItems;
    const items =
      typeof limit === "number" && limit > 0
        ? allItems.slice(0, limit)
        : allItems;

    const documents: Document[] = [];
    for (const item of items) {
      const document = this.createDocumentFromItem(item, feed, sourceUrl);
      if (document) {
        documents.push(document);
      }
    }
    return documents;
  }

  /**
   * Builds a document from one feed item.
   *
   * @returns The document, or `null` when the item has no non-blank text in
   *   any of the content fields that are consulted.
   */
  private createDocumentFromItem(
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- parser output is consumed untyped
    item: any,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- parser output is consumed untyped
    feed: any,
    sourceUrl: string
  ): Document | null {
    const title = item.title || "Untitled";
    const link = item.link || "";

    // Full-content mode prefers the richest field; snippet mode prefers the
    // shortest. Non-string and blank values are skipped instead of crashing.
    const content = RSSLoader.firstText(
      this.config.includeContent
        ? [
            item.contentEncoded,
            item.content,
            item.description,
            item.contentSnippet,
          ]
        : [item.contentSnippet, item.description]
    );
    if (!content) {
      return null; // Skip items without content
    }

    const cleanContent = this.cleanHtmlContent(content);
    const fullContent = `${title}\n\n${cleanContent}`;

    // An unparseable pubDate falls back to "now" rather than storing an
    // Invalid Date in the metadata.
    const published = item.pubDate ? new Date(item.pubDate) : undefined;
    const createdAt =
      published && !Number.isNaN(published.getTime()) ? published : new Date();

    const itemCategories: unknown[] = Array.isArray(item.categories)
      ? item.categories
      : [];

    const metadata: DocumentMetadata = {
      title,
      author: item.author || feed.title || "Unknown",
      createdAt,
      updatedAt: new Date(),
      fileType: "rss",
      // Length of the text in UTF-16 code units, not a byte count.
      fileSize: fullContent.length,
      language: this.detectLanguage(fullContent),
      // Separate copies so the two metadata fields do not alias one array.
      tags: [...itemCategories],
      categories: [...itemCategories],
      description: RSSLoader.firstText([item.contentSnippet, item.description]),
    };

    // Add RSS-specific metadata
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- extension field not declared on DocumentMetadata
    (metadata as any).rssSource = {
      feedUrl: sourceUrl,
      feedTitle: feed.title,
      itemLink: link,
      pubDate: item.pubDate,
      guid: item.guid,
    };

    return {
      id: this.generateDocumentId(link || item.guid || title),
      content: fullContent,
      metadata,
      source: link || sourceUrl,
    };
  }

  /** Returns the first candidate that is a non-blank string, or `""`. */
  private static firstText(candidates: unknown[]): string {
    for (const candidate of candidates) {
      if (typeof candidate === "string" && candidate.trim() !== "") {
        return candidate;
      }
    }
    return "";
  }

  /**
   * Strips HTML tags, decodes a small set of character references and
   * collapses runs of whitespace into single spaces.
   *
   * Decoding happens in a single pass, so an escaped reference such as
   * `&amp;lt;` becomes the text `&lt;` instead of being decoded twice.
   */
  private cleanHtmlContent(html: string): string {
    return html
      .replace(/<[^>]*>/g, "") // Remove HTML tags
      .replace(
        /&(nbsp|amp|lt|gt|quot|apos|#39);/g,
        (match: string, name: string) => RSSLoader.HTML_ENTITIES[name] ?? match
      )
      .replace(/\s+/g, " ") // Normalize whitespace
      .trim();
  }

  /** Returns `"ko"` when the text contains any Hangul character, else `"en"`. */
  private detectLanguage(content: string): string {
    const koreanPattern = /[ㄱ-ㅎ가-힣]/;
    return koreanPattern.test(content) ? "ko" : "en";
  }

  /**
   * Builds an ID of the form `rss_<hash>_<timestamp>`.
   *
   * The hash part is a deterministic 32-bit hash of `identifier`; the
   * timestamp part is the current time, so two calls with the same identifier
   * at different times produce different IDs.
   */
  protected generateDocumentId(identifier: string): string {
    const timestamp = Date.now();
    let hash = 0;
    for (let i = 0; i < identifier.length; i++) {
      // hash * 31 + code unit, truncated to a signed 32-bit integer.
      hash = ((hash << 5) - hash + identifier.charCodeAt(i)) | 0;
    }
    return `rss_${Math.abs(hash)}_${timestamp}`;
  }
}
/**
 * Specialized loader for Naver Blog RSS.
 *
 * Bound to one blog ID; the feed URL is derived from that ID.
 */
export class NaverBlogRSSLoader extends RSSLoader {
  private blogId: string;

  /**
   * @param blogId Naver blog identifier used to build the feed URL.
   * @param config Loader options passed through to `RSSLoader`.
   */
  constructor(blogId: string, config: RSSLoaderConfig = {}) {
    super(config);
    this.blogId = blogId;
  }

  /** Loads the blog's feed from the URL returned by {@link getRSSUrl}. */
  async loadBlog(): Promise<Document[]> {
    return this.loadFromURL(this.getRSSUrl());
  }

  /** Returns the blog ID this loader was created with. */
  getBlogId(): string {
    return this.blogId;
  }

  /** Returns the feed URL for this loader's blog ID. */
  getRSSUrl(): string {
    return `https://rss.blog.naver.com/${this.blogId}`;
  }

  /**
   * Creates a loader from a blog URL (or a bare blog ID).
   *
   * @param url Any input accepted by {@link NaverBlogRSSLoader.extractBlogId}.
   * @param config Loader options passed through to the constructor.
   * @throws Error when no blog ID can be extracted from `url`.
   */
  static createFromUrl(
    url: string,
    config: RSSLoaderConfig = {}
  ): NaverBlogRSSLoader {
    const blogId = this.extractBlogId(url);
    if (!blogId) {
      throw new Error(`Cannot extract blog ID from URL: ${url}`);
    }
    return new NaverBlogRSSLoader(blogId, config);
  }

  /**
   * Extracts the blog ID from the supported URL shapes.
   *
   * Checked in this order:
   * 1. `blog.naver.com/<page>?...blogId=<id>` (ID carried as a query parameter)
   * 2. `rss.blog.naver.com/<id>`
   * 3. `blog.naver.com/<id>`
   * 4. `<id>.blog.me`
   * 5. a bare ID: no `/` and no `.` once the scheme and a trailing slash are removed
   *
   * @returns The blog ID, or `null` when nothing matches.
   */
  static extractBlogId(url: string): string | null {
    const input = url.trim();

    // Query-style URLs must be checked first: otherwise the path pattern
    // below would return the page name (e.g. "PostView.naver") as the ID.
    const fromQuery = input.match(
      /blog\.naver\.com\/[^?#]*\?(?:[^#]*&)?blogId=([^&#]+)/
    );
    if (fromQuery?.[1]) {
      return fromQuery[1];
    }

    // `#` is excluded so a fragment never becomes part of the ID.
    const patterns = [
      /rss\.blog\.naver\.com\/([^\/?#]+)/, // RSS URL
      /blog\.naver\.com\/([^\/?#]+)/, // Blog URL
      /([^\/?#]+)\.blog\.me/, // blog.me URL
    ];
    for (const pattern of patterns) {
      const match = input.match(pattern);
      if (match?.[1]) {
        return match[1];
      }
    }

    // If no pattern matches, assume the input itself is the blog ID.
    const bare = input.replace(/^https?:\/\//, "").replace(/\/$/, "");
    if (bare && !bare.includes("/") && !bare.includes(".")) {
      return bare;
    }
    return null;
  }
}
/**
 * Generic RSS feed manager for multiple sources.
 *
 * Keeps a registry of named loaders and loads all of them concurrently.
 */
export class RSSFeedManager {
  private feedSources: Map<string, RSSLoader> = new Map();
  /** Feed URL per feed name, for feeds registered with an explicit URL. */
  private feedUrls: Map<string, string> = new Map();

  /**
   * Registers (or replaces) a feed.
   *
   * @param name Key under which the feed is stored and reported.
   * @param loader Loader used to fetch the feed.
   * @param url Feed URL to load. Optional for `NaverBlogRSSLoader`, which
   *   knows its own URL; any other loader registered without a URL cannot be
   *   loaded and yields an empty result from {@link loadAllFeeds}.
   */
  addFeed(name: string, loader: RSSLoader, url?: string): void {
    this.feedSources.set(name, loader);
    if (url) {
      this.feedUrls.set(name, url);
    } else {
      // Re-registering without a URL must not keep a stale one.
      this.feedUrls.delete(name);
    }
  }

  /**
   * Removes a feed.
   *
   * @returns `true` when a feed with that name was registered.
   */
  removeFeed(name: string): boolean {
    this.feedUrls.delete(name);
    return this.feedSources.delete(name);
  }

  /**
   * Loads every registered feed concurrently.
   *
   * A feed that fails to load is logged with `console.error` and reported
   * with an empty document list; it never rejects the whole call.
   *
   * @returns Documents per feed name, in registration order.
   */
  async loadAllFeeds(): Promise<Map<string, Document[]>> {
    const entries = await Promise.all(
      Array.from(this.feedSources.entries()).map(
        async ([name, loader]): Promise<[string, Document[]]> => {
          try {
            return [name, await this.loadFeed(name, loader)];
          } catch (error) {
            console.error(`Failed to load feed ${name}:`, error);
            return [name, []];
          }
        }
      )
    );
    // Building the map from the ordered entries keeps registration order
    // regardless of which feed finished first.
    return new Map(entries);
  }

  /** Returns the names of all registered feeds. */
  getFeedNames(): string[] {
    return Array.from(this.feedSources.keys());
  }

  /** Returns the loader registered under `name`, if any. */
  getFeed(name: string): RSSLoader | undefined {
    return this.feedSources.get(name);
  }

  /**
   * Loads one feed: an explicitly registered URL wins, then a Naver loader's
   * own URL; anything else has no source to load from.
   */
  private async loadFeed(name: string, loader: RSSLoader): Promise<Document[]> {
    const url = this.feedUrls.get(name);
    if (url) {
      return loader.loadFromURL(url);
    }
    if (loader instanceof NaverBlogRSSLoader) {
      return loader.loadBlog();
    }
    throw new Error(`No URL configured for feed "${name}"`);
  }
}