embeddings-js
Version:
A NodeJS RAG framework to easily work with LLMs and custom datasets
47 lines (46 loc) • 1.61 kB
JavaScript
import md5 from 'md5';
import Sitemapper from 'sitemapper';
import createDebugMessages from 'debug';
import { BaseLoader } from '../interfaces/base-loader.js';
import { WebLoader } from './web-loader.js';
export class SitemapLoader extends BaseLoader {
constructor({ url }) {
super(`SitemapLoader_${md5(url)}`);
Object.defineProperty(this, "debug", {
enumerable: true,
configurable: true,
writable: true,
value: createDebugMessages('embedjs:loader:SitemapLoader')
});
Object.defineProperty(this, "url", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.url = url;
}
async *getChunks() {
try {
// @ts-ignore
const { sites } = await new Sitemapper({ url: this.url, timeout: 15000 }).fetch();
this.debug(`Sitemap '${this.url}' returned ${sites.length} URLs`);
for (const url of sites) {
const webLoader = new WebLoader({ url });
for await (const chunk of webLoader.getChunks()) {
yield {
...chunk,
metadata: {
...chunk.metadata,
type: 'SitemapLoader',
originalSource: this.url,
},
};
}
}
}
catch (e) {
this.debug('Could not get sites from sitemap url', this.url, e);
}
}
}