apache-autoindex-parse
Version:
Parse Apache's autoindex HTML files.
213 lines (209 loc) • 6.65 kB
JavaScript
;
// src/index.ts
/**
 * Turn an Apache autoindex HTML page into a directory node.
 *
 * The directory path is whatever follows "Index of " in the page's
 * <title>; when the title is absent or lacks that marker the path is "/".
 *
 * @param {string} html   Raw HTML of the listing page.
 * @param {string} [format] Listing layout ("F0", "F1" or "F2"). When falsy
 *                          it is inferred from the page with inferFormat.
 * @returns {{type: "directory", path: string, children: Array<object>}}
 *          The root node; `children` is empty for an unrecognised format.
 */
function parse(html, format) {
  const titleHit = html.match(/<title[^>]*>(.*?)<\/title>/i);
  const title = titleHit?.[1] || "";
  const rootPath = title.split("Index of ")[1] ?? "/";

  const layout = format || inferFormat(html);

  let children;
  switch (layout) {
    case "F0":
      children = parseF0(html, rootPath);
      break;
    case "F1":
      children = parseF1(html, rootPath);
      break;
    case "F2":
      children = parseF2(html, rootPath);
      break;
    default:
      // Unknown layout: report the directory with no entries.
      children = [];
  }

  return { type: "directory", path: rootPath, children };
}
/**
 * Guess which autoindex layout a page uses.
 *
 * First choice: the `F=<digit>` parameter carried by the column-sort links
 * (anchors whose href starts with "?"). Those links only include `F=` when
 * the layout was requested through the query string, so when none is found
 * the layout is inferred from the markup instead: a `<ul>` means the plain
 * list (F0), a `<table>` the HTML-table layout (F2) and a `<pre>` the
 * fancy preformatted layout (F1).
 *
 * @param {string} html Raw HTML of the listing page.
 * @returns {string} "F<digit>" taken from a sort link, otherwise "F0", "F1"
 *                   or "F2" based on the markup; "F0" when nothing matches.
 */
function inferFormat(html) {
  const hrefRegex = /<a[^>]+href=["']([^"']+)["'][^>]*>/gi;
  for (const match of html.matchAll(hrefRegex)) {
    const href = match[1];
    if (href === void 0 || !href.startsWith("?")) continue;
    const formatMatch = href.match(/F=(\d)/);
    if (formatMatch && formatMatch[1]) {
      return `F${formatMatch[1]}`;
    }
  }
  // No explicit F= parameter. `<ul>` is checked first so every page that
  // previously resolved to a working F0 parse still does.
  if (/<ul[\s>]/i.test(html)) return "F0";
  if (/<table[\s>]/i.test(html)) return "F2";
  if (/<pre[\s>]/i.test(html)) return "F1";
  return "F0";
}
/**
 * Parse a plain-list (F=0) autoindex page. Entries are the `<li><a>` items
 * of the first `<ul>` in the document; the "Parent Directory" item is
 * skipped. An href ending in "/" marks a directory.
 *
 * @param {string} html     Raw HTML of the listing page.
 * @param {string} rootPath Path of the listed directory, used as the prefix
 *                          for relative hrefs.
 * @returns {Array<object>} File entries `{type, name, path, lastModified}`
 *          and directory entries (same plus `children: []`).
 *          `lastModified` is always undefined because this layout carries
 *          no dates.
 */
function parseF0(html, rootPath) {
  const entries = [];
  const ulMatch = html.match(/<ul[^>]*>([\s\S]*?)<\/ul>/i);
  if (!ulMatch || !ulMatch[1]) return entries;

  // Strip trailing slashes so joining never yields "//name": a path that
  // starts with "//" is read as scheme-relative (i.e. a host name) when it
  // is later resolved with `new URL(path, base)`.
  const base = String(rootPath ?? "").replace(/\/+$/, "");
  const liRegex = /<li[^>]*>[\s\S]*?<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>[\s\S]*?<\/li>/gi;

  for (const match of ulMatch[1].matchAll(liRegex)) {
    const href = match[1] || "";
    const label = (match[2] || "").trim();
    if (label === "Parent Directory") {
      continue;
    }
    // Absolute URLs and root-relative hrefs are already fully resolved.
    const isResolved = /^https?:\/\//.test(href) || href.startsWith("/");
    const path = isResolved ? href : `${base}/${href}`;
    if (href.endsWith("/")) {
      entries.push({
        type: "directory",
        // Only drop a trailing "/" that is really there; never eat a
        // character of the actual name.
        name: label.endsWith("/") ? label.slice(0, -1) : label,
        path,
        lastModified: void 0,
        children: []
      });
    } else {
      entries.push({
        type: "file",
        name: label,
        path,
        lastModified: void 0
      });
    }
  }
  return entries;
}
/**
 * Parse a fancy preformatted (F=1) autoindex page. Every `<pre>` block is
 * scanned for rows of the form `<img alt="[..]"> <a href>name</a> date ...`.
 *
 * Any bracketed icon alt text is accepted (not only `[DIR]` / `[TXT]`), so
 * files shown with other icons are not dropped. The header icon, the
 * column-sort links and the parent-directory row are skipped.
 *
 * @param {string} html     Raw HTML of the listing page.
 * @param {string} rootPath Path of the listed directory, used as the prefix
 *                          for relative hrefs.
 * @returns {Array<object>} File entries `{type, name, path, lastModified}`
 *          and directory entries (same plus `children: []`).
 *          `lastModified` is epoch milliseconds, reading the row's
 *          `YYYY-MM-DD HH:MM` stamp as UTC, or undefined when absent.
 */
function parseF1(html, rootPath) {
  const entries = [];
  // Strip trailing slashes so joining never yields "//name": a path that
  // starts with "//" is read as scheme-relative (i.e. a host name) when it
  // is later resolved with `new URL(path, base)`.
  const base = String(rootPath ?? "").replace(/\/+$/, "");
  const preRegex = /<pre[^>]*>([\s\S]*?)<\/pre>/gi;
  // Groups: 1 = icon alt, 2 = href, 3 = link text, 4 = rest of the row up
  // to the next icon (or the end of the block).
  const entryRegex = /<img[^>]+alt=["'](\[[^\]]*\])["'][^>]*>[\s\S]*?<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>([\s\S]*?)(?=<img|$)/gi;

  for (const preMatch of html.matchAll(preRegex)) {
    const preContent = preMatch[1];
    if (!preContent) continue;

    for (const entryMatch of preContent.matchAll(entryRegex)) {
      const imgAlt = (entryMatch[1] || "").toUpperCase();
      const href = entryMatch[2] || "";
      const text = (entryMatch[3] || "").trim();
      const rowText = entryMatch[4] || "";

      // Header icon and column-sort links are not entries.
      if (imgAlt === "[ICO]" || href.startsWith("?")) continue;
      // Parent-directory row, recognised the same way the other parsers do.
      if (imgAlt === "[PARENTDIR]" || text === "Parent Directory") continue;
      // Legacy special case kept for backward compatibility.
      if (href === "/Public/") continue;

      let lastModified;
      const dateMatch = rowText.match(/\d{4}-\d{2}-\d{2} \d{2}:\d{2}/);
      if (dateMatch && dateMatch[0]) {
        const date = new Date(`${dateMatch[0].replace(" ", "T")}:00Z`);
        if (!Number.isNaN(date.getTime())) {
          lastModified = date.getTime();
        }
      }

      // Absolute URLs and root-relative hrefs are already fully resolved.
      const isResolved = /^https?:\/\//.test(href) || href.startsWith("/");
      const path = isResolved ? href : `${base}/${href}`;
      const isDirectory = imgAlt === "[DIR]" || href.endsWith("/");

      if (isDirectory) {
        entries.push({
          type: "directory",
          name: text,
          path,
          lastModified,
          children: []
        });
      } else {
        entries.push({
          type: "file",
          name: text,
          path,
          lastModified
        });
      }
    }
  }
  return entries;
}
/**
 * Parse an HTML-table (F=2) autoindex page. Entries are the `<tr>` rows of
 * the first `<table>` that have at least three `<td>` cells: icon, link and
 * last-modified date. Header (`<th>`) rows, separator (`<hr>`) rows and the
 * parent-directory row are skipped.
 *
 * @param {string} html     Raw HTML of the listing page.
 * @param {string} rootPath Path of the listed directory, used as the prefix
 *                          for relative hrefs.
 * @returns {Array<object>} File entries `{type, name, path, lastModified}`
 *          and directory entries (same plus `children: []`).
 *          `lastModified` is epoch milliseconds or undefined when the date
 *          cell is empty or unparseable.
 */
function parseF2(html, rootPath) {
  const entries = [];
  const tableMatch = html.match(/<table[^>]*>([\s\S]*?)<\/table>/i);
  if (!tableMatch || !tableMatch[1]) return entries;

  // Strip trailing slashes so joining never yields "//name": a path that
  // starts with "//" is read as scheme-relative (i.e. a host name) when it
  // is later resolved with `new URL(path, base)`.
  const base = String(rootPath ?? "").replace(/\/+$/, "");
  const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
  const cellRegex = /<td[^>]*>([\s\S]*?)<\/td>/gi;

  for (const rowMatch of tableMatch[1].matchAll(rowRegex)) {
    const rowContent = rowMatch[1];
    if (!rowContent) continue;
    if (rowContent.includes("<th") || rowContent.includes("<hr")) {
      continue;
    }

    const cells = Array.from(rowContent.matchAll(cellRegex), (cell) => cell[1] || "");
    if (cells.length < 3) continue;

    const imgAltMatch = cells[0].match(/<img[^>]+alt=["'](\[[^\]]+\])["'][^>]*>/i);
    if (!imgAltMatch || !imgAltMatch[1]) continue;
    const imgAlt = imgAltMatch[1];
    if (imgAlt === "[PARENTDIR]") continue;

    const linkMatch = cells[1].match(/<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>/i);
    if (!linkMatch || !linkMatch[1]) continue;
    const href = linkMatch[1];
    const name = (linkMatch[2] || "").trim();
    if (name === "Parent Directory") continue;

    // Date cell: drop markup and the &nbsp; filler used for rows without a
    // date.
    const dateText = cells[2].replace(/<[^>]+>/g, "").replace(/&nbsp;/gi, " ").trim();
    let lastModified;
    if (dateText) {
      // The usual "YYYY-MM-DD HH:MM" stamp has no zone. Handing it to
      // `new Date()` as-is reads it in the machine's local zone, so results
      // would vary by host and disagree with parseF1. Read it as UTC.
      const stamp = dateText.match(/^(\d{4}-\d{2}-\d{2})[ T](\d{2}:\d{2})(:\d{2})?$/);
      const date = stamp
        ? new Date(`${stamp[1]}T${stamp[2]}${stamp[3] ?? ":00"}Z`)
        : new Date(dateText);
      if (!Number.isNaN(date.getTime())) {
        lastModified = date.getTime();
      }
    }

    // Absolute URLs and root-relative hrefs are already fully resolved.
    const isResolved = /^https?:\/\//.test(href) || href.startsWith("/");
    const path = isResolved ? href : `${base}/${href}`;
    const isDirectory = imgAlt === "[DIR]" || href.endsWith("/");

    if (isDirectory) {
      entries.push({
        type: "directory",
        name,
        path,
        lastModified,
        children: []
      });
    } else {
      entries.push({
        type: "file",
        name,
        path,
        lastModified
      });
    }
  }
  return entries;
}
// src/traverse.ts
/**
 * Fetch an autoindex page and recursively fetch every sub-directory it
 * lists, filling each directory entry's `children` in place.
 *
 * Sub-directories of one page are fetched concurrently. Any failure while
 * handling a page (network error, non-2xx response, invalid child URL)
 * makes the call for that page resolve to null; a directory whose own
 * fetch resolved to null keeps its existing `children`.
 *
 * @param {string} rootUrl URL of the listing page to start from.
 * @param {object} [options]
 * @param {object} [options.extraHeaders] Extra request headers, merged over
 *                                        the default User-Agent.
 * @param {AbortSignal} [options.abortSignal] Passed to fetch as `signal`.
 * @param {string} [options.format] Listing layout forwarded to parse.
 * @returns {Promise<object|null>} The populated root node, or null on
 *                                 failure.
 */
async function traverse(rootUrl, options) {
  try {
    const requestInit = {
      headers: {
        "User-Agent": "github.com/apache-autoindex-parse",
        ...options?.extraHeaders
      },
      signal: options?.abortSignal
    };
    const response = await fetch(rootUrl, requestInit);
    if (!response.ok) {
      throw new Error(`failed to fetch directory listing from ${rootUrl}: ${response.status} ${response.statusText}`);
    }

    const body = await response.text();
    const root = parse(body, options?.format);
    if (!root) return null;

    // Descend into one directory entry and attach what was found there.
    const expand = async (entry) => {
      const childUrl = new URL(entry.path, rootUrl).href;
      const subtree = await traverse(childUrl, options);
      if (subtree) {
        entry.children = subtree.children;
      }
    };

    const pending = [];
    for (const entry of root.children) {
      if (entry.type !== "directory") continue;
      pending.push(expand(entry));
    }
    await Promise.all(pending);

    return root;
  } catch {
    return null;
  }
}
// Expose traverse as a property of the module's exports object.
exports.traverse = traverse;