UNPKG

apache-autoindex-parse

Version:

Parse Apache's autoindex HTML files

github.com/luxass/apache-autoindex-parse

luxass/apache-autoindex-parse

165 lines (163 loc) • 5.56 kB

JavaScript

//#region src/lib.ts
/**
 * Removes one trailing slash from a path. The root path "/" is returned unchanged.
 *
 * @param {string} path - The path to normalize
 * @returns {string} The path without its trailing slash
 */
function trimTrailingSlash(path) {
  if (path === "/") return path;
  return path.endsWith("/") ? path.slice(0, -1) : path;
}

/**
 * Removes one leading slash from a path, if present.
 *
 * @param {string} path - The path to normalize
 * @returns {string} The path without its leading slash
 */
function trimLeadingSlash(path) {
  return path.startsWith("/") ? path.slice(1) : path;
}
//#endregion

//#region src/index.ts
/**
 * Builds a single listing entry. Trailing slashes are stripped from both the
 * display name and the href so directories and files share the same shape.
 *
 * @param {boolean} isDirectory - Whether the entry is a directory
 * @param {string} name - The (already trimmed) link text
 * @param {string} href - The raw href of the link
 * @param {number | undefined} lastModified - Millisecond timestamp, if known
 * @returns {Entry} The entry object
 */
function createEntry(isDirectory, name, href, lastModified) {
  return {
    type: isDirectory ? "directory" : "file",
    name: trimTrailingSlash(name),
    path: trimTrailingSlash(href),
    lastModified,
  };
}

/**
 * Converts date text into a millisecond timestamp.
 *
 * NOTE(review): the listing dates are not ISO 8601 and carry no time zone, so
 * the result depends on the engine's lenient `Date` parsing (presumably local
 * time) - confirm this is acceptable for callers.
 *
 * @param {string} text - The date text to parse
 * @returns {number | undefined} The timestamp, or undefined if the text is not a parsable date
 */
function parseTimestamp(text) {
  const time = new Date(text).getTime();
  return Number.isNaN(time) ? undefined : time;
}

/**
 * Parses HTML content of an auto-indexed directory listing into a structured format.
 *
 * @param {string} html - The HTML content of the auto-indexed directory page to parse
 * @param {AutoIndexFormat?} format - Optional format specification of the auto-index page (will be inferred if not provided)
 * @returns {Entry[]} An array of entries representing the parsed directory structure, or empty array if parsing fails
 *
 * @example
 * ```ts
 * import { parse } from 'apache-autoindex-parse';
 *
 * const html = await fetch('http://example.com/files/').then(res => res.text());
 * const result = parse(html);
 * console.log(result); // Array of file and directory entries
 * ```
 */
function parse(html, format) {
  if (!html) return [];
  // Any falsy format (undefined, null, "") triggers inference.
  const resolvedFormat = format || inferFormat(html);
  switch (resolvedFormat) {
    case "F0":
      return parseF0(html);
    case "F1":
      return parseF1(html);
    case "F2":
      return parseF2(html);
    default:
      // Unknown formats yield no entries rather than throwing.
      return [];
  }
}

/**
 * Infers the AutoIndexFormat from HTML content.
 *
 * This function examines the links on the page to determine the format
 * of an Apache AutoIndex page. It looks for URL parameters that indicate
 * the format (e.g., "F=2" in "?C=N;O=D;F=2"). If no such parameter is found,
 * it falls back to the markup: a `<pre>` element means "F1", a `<table>`
 * element means "F2", anything else "F0".
 *
 * @param {string} html - The HTML content to analyze
 * @returns {AutoIndexFormat} The inferred format as an AutoIndexFormat string (e.g., "F0", "F1", "F2", etc.)
 */
function inferFormat(html) {
  // Query arguments may be separated by ";" (as in "?C=N;O=D;F=2") as well as
  // "&"; accepting ";" also covers the HTML-escaped "&amp;" separator.
  const formatMatch = html.match(/href="[^"]*[?&;]F=(\d)[^"]*"/);
  if (formatMatch) return `F${formatMatch[1]}`;
  // Tag detection is case-insensitive, consistent with the parsers below.
  if (/<pre[^>]*>/i.test(html)) return "F1";
  if (/<table[^>]*>/i.test(html)) return "F2";
  return "F0";
}

/**
 * Parses an "F0" listing: a list of `<li><a href="...">name</a></li>` items.
 * No modification dates are extracted, so `lastModified` is always undefined.
 *
 * @param {string} html - The HTML content to parse
 * @returns {Entry[]} The parsed entries
 */
function parseF0(html) {
  const entries = [];
  const linkRegex = /<li[^>]*>\s*<a\s+href="([^"]*)"[^>]*>([^<]+)<\/a>\s*<\/li>/gi;
  for (const [, href, name] of html.matchAll(linkRegex)) {
    const cleanName = name.trim();
    if (!href || cleanName === "Parent Directory") continue;
    entries.push(createEntry(href.endsWith("/"), cleanName, href, undefined));
  }
  return entries;
}

/**
 * Parses an "F1" listing: links inside the first `<pre>` block, each followed
 * on the same line by plain text that may contain a modification date.
 *
 * @param {string} html - The HTML content to parse
 * @returns {Entry[]} The parsed entries
 */
function parseF1(html) {
  const entries = [];
  const preMatch = html.match(/<pre[^>]*>([\s\S]*?)<\/pre>/i);
  if (!preMatch || !preMatch[1]) return entries;
  // Third group: the text after the link up to the next tag or line break.
  const linkRegex = /<a\s+href="([^"]*)"[^>]*>([^<]*)<\/a>([^<\n]*)/gi;
  for (const [, href, name, afterText] of preMatch[1].matchAll(linkRegex)) {
    // Skip empty links, the parent link and column-sort links ("?C=N;O=D").
    if (!href || !name || href.startsWith("?")) continue;
    const cleanName = name.trim();
    if (cleanName === "Parent Directory") continue;
    // Accepts "12-Jan-2024 10:30" and "2024-01-12 10:30" style dates.
    const dateMatch = afterText.match(/(\d{2}-\w{3}-\d{4}\s+\d{2}:\d{2}|\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/);
    const lastModified = dateMatch ? parseTimestamp(dateMatch[0]) : undefined;
    entries.push(createEntry(href.endsWith("/"), cleanName, href, lastModified));
  }
  return entries;
}

/**
 * Parses an "F2" listing: table rows whose first three cells are the icon,
 * the link and the modification date.
 *
 * @param {string} html - The HTML content to parse
 * @returns {Entry[]} The parsed entries
 */
function parseF2(html) {
  const entries = [];
  const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
  for (const [, rowContent] of html.matchAll(rowRegex)) {
    if (!rowContent) continue;
    // Header rows and separator rows carry no entries.
    if (/<th[^>]*>/i.test(rowContent) || /<hr[^>]*>/i.test(rowContent)) continue;
    const cells = Array.from(rowContent.matchAll(/<td[^>]*>([\s\S]*?)<\/td>/gi), (cell) => cell[1]);
    if (cells.length < 3) continue;
    const [iconCell, linkCell, dateCell] = cells;
    if (/alt="\[PARENTDIR\]"/.test(iconCell)) continue;
    const linkMatch = linkCell.match(/<a\s+href="([^"]*)"[^>]*>([^<]*)<\/a>/i);
    if (!linkMatch || !linkMatch[1] || !linkMatch[2]) continue;
    const [, href, name] = linkMatch;
    const cleanName = name.trim();
    if (href === "/" || !cleanName || cleanName === "Parent Directory") continue;
    // A directory is marked by its icon's alt text or by a trailing slash.
    const isDirectory = /alt="\[DIR\]"/.test(iconCell) || href.endsWith("/");
    // Strip any markup from the date cell; trim() also removes non-breaking spaces.
    const dateText = dateCell.replace(/<[^>]*>/g, "").trim();
    const lastModified = dateText ? parseTimestamp(dateText) : undefined;
    entries.push(createEntry(isDirectory, cleanName, href, lastModified));
  }
  return entries;
}
//#endregion

export { trimTrailingSlash as i, parse as n, trimLeadingSlash as r, inferFormat as t };