UNPKG

apache-autoindex-parse

Version:

parse apache's autoindex html files

github.com/luxass/apache-autoindex-parse

luxass/apache-autoindex-parse

213 lines (209 loc) • 6.65 kB

JavaScript

'use strict';

// src/index.ts

/** Matches absolute http(s) URLs; such hrefs are used verbatim as entry paths. */
const ABSOLUTE_URL_RE = /^https?:\/\//;

/** Matches a `YYYY-MM-DD HH:MM` timestamp anywhere inside a string. */
const TIMESTAMP_RE = /\d{4}-\d{2}-\d{2} \d{2}:\d{2}/;

/** Matches a string that is exactly one `YYYY-MM-DD HH:MM` timestamp. */
const EXACT_TIMESTAMP_RE = /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}$/;

/**
 * Builds the path of a listing entry from the directory path and the entry's href.
 *
 * Absolute http(s) URLs and root-absolute hrefs (leading "/") are returned
 * unchanged. Relative hrefs are appended to `rootPath` with exactly one
 * separating slash, so a root of "/" yields "/name" rather than "//name"
 * (which `new URL()` would interpret as a scheme-relative URL whose host is
 * "name").
 *
 * @param {string} rootPath - Path of the directory being listed.
 * @param {string} href - Raw href attribute of the entry's link.
 * @returns {string} The entry path.
 */
function resolveEntryPath(rootPath, href) {
  if (ABSOLUTE_URL_RE.test(href) || href.startsWith("/")) {
    return href;
  }
  return `${rootPath.replace(/\/+$/, "")}/${href}`;
}

/**
 * Converts a `YYYY-MM-DD HH:MM` stamp to epoch milliseconds, reading it as UTC.
 *
 * @param {string} stamp - Timestamp in `YYYY-MM-DD HH:MM` form.
 * @returns {number | undefined} Epoch milliseconds, or undefined if the date is invalid.
 */
function parseUtcTimestamp(stamp) {
  const time = new Date(`${stamp.replace(" ", "T")}:00Z`).getTime();
  return Number.isNaN(time) ? undefined : time;
}

/**
 * Parses an autoindex HTML page into a directory node.
 *
 * The directory path is taken from the text after "Index of " in the page's
 * <title>, falling back to "/" when that marker is missing.
 *
 * @param {string} html - HTML of the listing page.
 * @param {string} [format] - "F0", "F1" or "F2"; inferred from the page when omitted.
 *   Any other value produces an empty `children` array.
 * @returns {{ type: "directory", path: string, children: object[] }} The parsed directory.
 */
function parse(html, format) {
  const titleMatch = html.match(/<title[^>]*>(.*?)<\/title>/i);
  const titleText = titleMatch?.[1] || "";
  const rootPath = titleText.split("Index of ")[1] ?? "/";

  let children = [];
  switch (format || inferFormat(html)) {
    case "F0":
      children = parseF0(html, rootPath);
      break;
    case "F1":
      children = parseF1(html, rootPath);
      break;
    case "F2":
      children = parseF2(html, rootPath);
      break;
    default:
      break;
  }

  return { type: "directory", path: rootPath, children };
}

/**
 * Infers the listing format from the first query-string link (href starting
 * with "?") that carries an `F=<digit>` argument.
 *
 * @param {string} html - HTML of the listing page.
 * @returns {string} `F<digit>` from the first matching link, or "F0" if there is none.
 */
function inferFormat(html) {
  const hrefRegex = /<a[^>]+href=["']([^"']+)["'][^>]*>/gi;
  for (const [, href] of html.matchAll(hrefRegex)) {
    if (!href.startsWith("?")) {
      continue;
    }
    const formatMatch = href.match(/F=(\d)/);
    if (formatMatch) {
      return `F${formatMatch[1]}`;
    }
  }
  return "F0";
}

/**
 * Parses an F0 listing: the <li><a> items of the first <ul> on the page.
 *
 * Entries whose href ends with "/" are directories. Items labelled
 * "Parent Directory" are skipped. This format carries no dates, so
 * `lastModified` is always undefined.
 *
 * @param {string} html - HTML of the listing page.
 * @param {string} rootPath - Path of the directory being listed.
 * @returns {object[]} File and directory entries in document order.
 */
function parseF0(html, rootPath) {
  const entries = [];
  const ulMatch = html.match(/<ul[^>]*>([\s\S]*?)<\/ul>/i);
  if (!ulMatch || !ulMatch[1]) {
    return entries;
  }

  const liRegex = /<li[^>]*>[\s\S]*?<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>[\s\S]*?<\/li>/gi;
  for (const match of ulMatch[1].matchAll(liRegex)) {
    const href = match[1] || "";
    const name = (match[2] || "").trim();
    if (name === "Parent Directory") {
      continue;
    }

    const path = resolveEntryPath(rootPath, href);
    if (href.endsWith("/")) {
      entries.push({
        type: "directory",
        // Drop the trailing slash from the label, but only when it is really
        // there; otherwise the last character of the name would be lost.
        name: name.endsWith("/") ? name.slice(0, -1) : name,
        path,
        lastModified: undefined,
        children: [],
      });
    } else {
      entries.push({ type: "file", name, path, lastModified: undefined });
    }
  }
  return entries;
}

/**
 * Parses an F1 listing: rows inside <pre> blocks, each introduced by an icon
 * <img> whose alt text is "[DIR]" or "[TXT]".
 *
 * Rows with any other icon are not matched and therefore not returned. The
 * first `YYYY-MM-DD HH:MM` stamp following the link is read as UTC.
 *
 * @param {string} html - HTML of the listing page.
 * @param {string} rootPath - Path of the directory being listed.
 * @returns {object[]} File and directory entries in document order.
 */
function parseF1(html, rootPath) {
  const entries = [];
  const preRegex = /<pre[^>]*>([\s\S]*?)<\/pre>/gi;

  for (const preMatch of html.matchAll(preRegex)) {
    const preContent = preMatch[1];
    if (!preContent) {
      continue;
    }

    // One match per row: icon, link, then everything up to the next icon.
    const entryRegex = /<img[^>]+alt=["'](\[(?:DIR|TXT)\])["'][^>]*>[\s\S]*?<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>([\s\S]*?)(?=<img|$)/gi;
    for (const entryMatch of preContent.matchAll(entryRegex)) {
      const imgAlt = entryMatch[1] || "";
      const href = entryMatch[2] || "";
      const name = (entryMatch[3] || "").trim();
      const rowText = entryMatch[4] || "";

      if (href === "/Public/") {
        continue;
      }
      // The regex is case-insensitive; only the exact upper-case tags count.
      if (imgAlt !== "[DIR]" && imgAlt !== "[TXT]") {
        continue;
      }

      const dateMatch = rowText.match(TIMESTAMP_RE);
      const lastModified = dateMatch ? parseUtcTimestamp(dateMatch[0]) : undefined;
      const path = resolveEntryPath(rootPath, href);

      if (imgAlt === "[DIR]") {
        entries.push({ type: "directory", name, path, lastModified, children: [] });
      } else {
        entries.push({ type: "file", name, path, lastModified });
      }
    }
  }
  return entries;
}

/**
 * Converts the text of an F2 date cell to epoch milliseconds.
 *
 * A cell that is exactly `YYYY-MM-DD HH:MM` is read as UTC so the result does
 * not depend on the local timezone of the machine running the parser. Any
 * other text is handed to `new Date()` as a best-effort fallback.
 *
 * @param {string | undefined} dateText - Tag-stripped, trimmed cell text.
 * @returns {number | undefined} Epoch milliseconds, or undefined when unparseable.
 */
function parseF2Date(dateText) {
  if (!dateText) {
    return undefined;
  }
  if (EXACT_TIMESTAMP_RE.test(dateText)) {
    return parseUtcTimestamp(dateText);
  }
  const time = new Date(dateText).getTime();
  return Number.isNaN(time) ? undefined : time;
}

/**
 * Parses an F2 listing: the <tr> rows of the first <table> on the page.
 *
 * Rows containing "<th" or "<hr", rows with fewer than three <td> cells, rows
 * without a bracketed icon alt text, and parent-directory rows are skipped.
 * Cell 0 holds the icon, cell 1 the link and cell 2 the date. An entry is a
 * directory when its icon is "[DIR]" or its href ends with "/".
 *
 * @param {string} html - HTML of the listing page.
 * @param {string} rootPath - Path of the directory being listed.
 * @returns {object[]} File and directory entries in document order.
 */
function parseF2(html, rootPath) {
  const entries = [];
  const tableMatch = html.match(/<table[^>]*>([\s\S]*?)<\/table>/i);
  if (!tableMatch || !tableMatch[1]) {
    return entries;
  }

  const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
  for (const rowMatch of tableMatch[1].matchAll(rowRegex)) {
    const rowContent = rowMatch[1];
    if (!rowContent) {
      continue;
    }
    if (rowContent.includes("<th") || rowContent.includes("<hr")) {
      continue;
    }

    const cellRegex = /<td[^>]*>([\s\S]*?)<\/td>/gi;
    const cells = Array.from(rowContent.matchAll(cellRegex), (cell) => cell[1] || "");
    if (cells.length < 3) {
      continue;
    }

    const imgAltMatch = cells[0].match(/<img[^>]+alt=["'](\[[^\]]+\])["'][^>]*>/i);
    if (!imgAltMatch) {
      continue;
    }
    const imgAlt = imgAltMatch[1];
    if (imgAlt === "[PARENTDIR]") {
      continue;
    }

    const linkMatch = cells[1].match(/<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>/i);
    if (!linkMatch) {
      continue;
    }
    const href = linkMatch[1];
    const name = (linkMatch[2] || "").trim();
    if (name === "Parent Directory") {
      continue;
    }

    const lastModified = parseF2Date(cells[2].replace(/<[^>]+>/g, "").trim());
    const path = resolveEntryPath(rootPath, href);

    if (imgAlt === "[DIR]" || href.endsWith("/")) {
      entries.push({ type: "directory", name, path, lastModified, children: [] });
    } else {
      entries.push({ type: "file", name, path, lastModified });
    }
  }
  return entries;
}

// src/traverse.ts

/**
 * Fetches a directory listing and recursively fills in every sub-directory.
 *
 * Child URLs are resolved from each entry's `path` against `rootUrl`, and all
 * sub-directories of one level are fetched in parallel.
 *
 * This is best-effort: any failure (network error, non-2xx response, abort)
 * is swallowed and reported as `null`. A sub-directory that fails keeps the
 * empty `children` array produced by the parser.
 *
 * @param {string} rootUrl - URL of the listing page to start from.
 * @param {object} [options]
 * @param {string} [options.format] - Forwarded to `parse` for every page.
 * @param {Record<string, string>} [options.extraHeaders] - Merged over the default User-Agent header.
 * @param {AbortSignal} [options.abortSignal] - Passed to every `fetch` call.
 * @returns {Promise<object | null>} The populated root directory, or null on failure.
 */
async function traverse(rootUrl, options) {
  try {
    const res = await fetch(rootUrl, {
      headers: {
        "User-Agent": "github.com/apache-autoindex-parse",
        ...options?.extraHeaders,
      },
      signal: options?.abortSignal,
    });
    if (!res.ok) {
      throw new Error(`failed to fetch directory listing from ${rootUrl}: ${res.status} ${res.statusText}`);
    }

    const root = parse(await res.text(), options?.format);

    await Promise.all(
      root.children
        .filter((entry) => entry.type === "directory")
        .map(async (entry) => {
          const childUrl = new URL(entry.path, rootUrl).href;
          const child = await traverse(childUrl, options);
          if (child) {
            entry.children = child.children;
          }
        }),
    );

    return root;
  } catch {
    // Deliberate best-effort contract: callers receive null instead of an exception.
    return null;
  }
}

exports.traverse = traverse;