@casoon/auditmysite
Version:
Professional website analysis suite with robust accessibility testing, Core Web Vitals performance monitoring, SEO analysis, and content optimization insights. Features isolated browser contexts, retry mechanisms, and comprehensive API endpoints for profe
117 lines • 4.93 kB
JavaScript
;
// Interop helper for default imports: reuse an already-installed helper from
// `this` when one exists, otherwise define it here.
var __importDefault = (this && this.__importDefault) || function (imported) {
    // Modules flagged with `__esModule` are handed back untouched.
    if (imported && imported.__esModule) {
        return imported;
    }
    // Anything else is wrapped so the value is reachable via `.default`.
    return { "default": imported };
};
// Flag the exports object with `__esModule` so interop helpers such as
// `__importDefault` above return this module as-is instead of wrapping it.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export name up front; the real value is assigned after the
// class definition further down in this file.
exports.SitemapParser = void 0;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const fast_xml_parser_1 = require("fast-xml-parser");
/**
 * Loads sitemap XML (from an http(s) URL or a local file), extracts the URL
 * entries it lists, and offers helpers to filter those entries and to rebase
 * them onto another origin.
 *
 * URL entries are plain objects of the shape
 * `{ loc, lastmod, changefreq, priority }`; only `loc` is guaranteed to be
 * present for entries recovered through the raw-text fallback.
 */
class SitemapParser {
    constructor() {
        this.parser = new fast_xml_parser_1.XMLParser({
            ignoreAttributes: false,
            attributeNamePrefix: "@_",
        });
    }

    /**
     * Parses a sitemap or sitemap index and returns every URL entry found.
     *
     * Sitemap indexes are followed recursively; at most the first 10
     * sub-sitemaps of each index are processed. A sub-sitemap that fails to
     * load is logged and skipped so the remaining ones are still processed.
     *
     * @param {string} sitemapUrl - http(s) URL or local file path of the sitemap.
     * @param {Set<string>} [ancestors] - Internal: sitemap locations currently
     *   being processed higher up in the recursion. Callers normally omit it.
     * @returns {Promise<Array<object>>} The collected URL entries.
     * @throws {Error} When the top-level sitemap cannot be fetched or read.
     */
    async parseSitemap(sitemapUrl, ancestors = new Set()) {
        const MAX_SUB_SITEMAPS = 10;
        let xml;
        // Load the XML from a URL or from a file. Require a real http(s)
        // scheme so that local paths which merely start with "http"
        // (e.g. "httpdocs/sitemap.xml") are still read from disk.
        if (/^https?:\/\//i.test(sitemapUrl)) {
            const response = await fetch(sitemapUrl);
            if (!response.ok) {
                throw new Error(`Failed to fetch sitemap: ${response.statusText}`);
            }
            xml = await response.text();
        }
        else {
            xml = fs_1.default.readFileSync(path_1.default.resolve(sitemapUrl), "utf-8");
        }
        const parsed = this.parser.parse(xml);
        const urls = [];
        // Case 1: sitemap index (WordPress/multi-sitemap structure)
        if (parsed.sitemapindex && parsed.sitemapindex.sitemap) {
            const sitemaps = Array.isArray(parsed.sitemapindex.sitemap)
                ? parsed.sitemapindex.sitemap
                : [parsed.sitemapindex.sitemap];
            console.log(`📋 Found sitemap index with ${sitemaps.length} sub-sitemaps`);
            // Everything on the current recursion path, including this
            // sitemap. Following a link back into this chain would never
            // terminate, so such links are skipped (this also covers an index
            // that lists itself).
            const chain = new Set(ancestors);
            chain.add(sitemapUrl);
            // Fetch URLs from each sub-sitemap (limited for performance)
            const sitemapsToProcess = sitemaps.slice(0, MAX_SUB_SITEMAPS);
            for (const sitemap of sitemapsToProcess) {
                try {
                    const subSitemapUrl = sitemap.loc;
                    if (subSitemapUrl && !chain.has(subSitemapUrl)) {
                        console.log(` 📄 Processing sub-sitemap: ${subSitemapUrl}`);
                        const subUrls = await this.parseSitemap(subSitemapUrl, chain);
                        // Append one by one: spreading a very large array into
                        // push() can exceed the engine's argument limit.
                        for (const entry of subUrls) {
                            urls.push(entry);
                        }
                    }
                }
                catch (error) {
                    console.warn(` ⚠️ Failed to process sub-sitemap ${sitemap.loc}: ${error}`);
                    // Continue with other sitemaps even if one fails
                }
            }
            if (sitemaps.length > MAX_SUB_SITEMAPS) {
                console.log(` 📊 Limited processing to first ${MAX_SUB_SITEMAPS} of ${sitemaps.length} sub-sitemaps for performance`);
            }
            return urls;
        }
        // Case 2: standard sitemap.xml structure. A single <url> element is
        // parsed as an object rather than a one-element array.
        if (parsed.urlset && parsed.urlset.url) {
            const entries = Array.isArray(parsed.urlset.url)
                ? parsed.urlset.url
                : [parsed.urlset.url];
            for (const u of entries) {
                urls.push({
                    loc: u.loc,
                    lastmod: u.lastmod,
                    changefreq: u.changefreq,
                    priority: u.priority,
                });
            }
        }
        // Case 3: the URLs ended up in the raw "#text" field (as seen with Astro)
        if (urls.length === 0 && parsed.urlset && typeof parsed.urlset["#text"] === "string") {
            const urlMatches = parsed.urlset["#text"].match(/<loc>(.*?)<\/loc>/g);
            if (urlMatches) {
                for (const match of urlMatches) {
                    urls.push({ loc: match.replace(/<\/?loc>/g, "") });
                }
            }
        }
        return urls;
    }

    /**
     * Filters URL entries by substring patterns matched against `loc`.
     *
     * @param {Array<object>} urls - URL entries to filter.
     * @param {object} [options]
     * @param {string[]} [options.filterPatterns] - Entries whose `loc`
     *   contains any of these substrings are removed.
     * @param {string[]} [options.includePatterns] - When given, only entries
     *   whose `loc` contains at least one of these substrings are kept.
     * @returns {Array<object>} The entries that passed both filters.
     */
    filterUrls(urls, options = {}) {
        // Entries without a string `loc` match no pattern instead of throwing.
        const locOf = (url) => (typeof url.loc === "string" ? url.loc : "");
        let filtered = urls;
        // Apply exclusion patterns
        if (options.filterPatterns) {
            filtered = filtered.filter((url) => !options.filterPatterns.some((pattern) => locOf(url).includes(pattern)));
        }
        // Apply inclusion patterns
        if (options.includePatterns) {
            filtered = filtered.filter((url) => options.includePatterns.some((pattern) => locOf(url).includes(pattern)));
        }
        return filtered;
    }

    /**
     * Returns copies of the given entries with `loc` rebased onto `baseUrl`.
     *
     * @param {Array<object>} urls - URL entries with an absolute `loc`.
     * @param {string} baseUrl - Replacement origin, e.g. "http://localhost:3000".
     * @returns {Array<object>} New entry objects; the input is not modified.
     */
    convertToLocalUrls(urls, baseUrl) {
        return urls.map((url) => ({
            ...url,
            loc: this.convertUrlToLocal(url.loc, baseUrl),
        }));
    }

    /**
     * Replaces the origin (scheme + host + port) of `url` with `baseUrl`.
     *
     * @param {string} url - Absolute URL.
     * @param {string} baseUrl - Replacement origin, inserted verbatim.
     * @returns {string} The rebased URL.
     * @throws {TypeError} When `url` is not a valid absolute URL.
     */
    convertUrlToLocal(url, baseUrl) {
        // Extract the origin from the URL
        const parsedUrl = new URL(url);
        const origin = parsedUrl.origin;
        // URLs with an opaque origin (serialized as "null") have nothing to
        // swap out and are returned unchanged.
        if (origin === "null") {
            return url;
        }
        // Common case: swap the leading origin by slicing. Unlike
        // String.prototype.replace this never interprets "$" sequences in
        // baseUrl and never touches a later occurrence of the origin.
        if (url.startsWith(origin)) {
            return baseUrl + url.slice(origin.length);
        }
        // The raw string differs from the normalized origin (uppercase host,
        // explicit default port, credentials, ...): rebuild from components.
        return baseUrl + parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
    }
}
// Publish the class on the CommonJS exports object.
exports.SitemapParser = SitemapParser;
//# sourceMappingURL=sitemap-parser.js.map