robots-parse
Version:
A lightweight and simple robots.txt parser for Node.js.
84 lines (70 loc) • 2.27 kB
JavaScript
const patterns = {
agents: /([Uu]ser-agent:) (.+)/,
allow: /([Aa]llow:) (\/.+)/,
disallow: /([Dd]isallow:) (\/.+)/,
sitemaps: /([Ss]itemap:) (.+)/,
host: /([Hh]ost:) (.+)/
};
module.exports = body => {
const results = {
agents: {},
allow: [],
disallow: [],
sitemaps: [],
host: ''
};
// Extract lines from body response
const lines = body.match(/[^\r\n]+/g);
// Default agent
let lastAgent = 'all';
// Loop through lines and check for patterns
lines.forEach(line => {
// Check for agent rules
if (patterns.agents.test(line)) {
const matches = line.match(patterns.agents);
lastAgent = matches[2] === '*' ? 'all' : matches[2];
results.agents[lastAgent] = {
allow: [],
disallow: []
};
}
// Check for host rule
if (patterns.host.test(line)) {
const matches = line.match(patterns.host);
results.host = matches[2];
}
// Check for allow rules
if (patterns.allow.test(line)) {
const matches = line.match(patterns.allow);
// Add element if not already in
if (!results.agents[lastAgent].allow.includes(matches[2])) {
results.agents[lastAgent].allow.push(matches[2]);
}
// Add element to the global array if not already in
if (!results.allow.includes(matches[2])) {
results.allow.push(matches[2]);
}
}
// Check for disallow rules
if (patterns.disallow.test(line)) {
const matches = line.match(patterns.disallow);
// Add element if not already in
if (!results.agents[lastAgent].disallow.includes(matches[2])) {
results.agents[lastAgent].disallow.push(matches[2]);
}
// Add element to the global array if not already in
if (!results.disallow.includes(matches[2])) {
results.disallow.push(matches[2]);
}
}
// Check for sitemap rules
if (patterns.sitemaps.test(line)) {
const matches = line.match(patterns.sitemaps);
// Add element to the global array if not already in
if (!results.sitemaps.includes(matches[2])) {
results.sitemaps.push(matches[2]);
}
}
});
return results;
};