lighthouse

Automated auditing, performance metrics, and best practices for the web.

/**
 * @license
 * Copyright 2018 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @fileoverview Validates the robots.txt file according to the official standard and its various
 * extensions respected by the popular web crawlers.
 * Validator rules, and the resources backing these rules, can be found here:
 * https://github.com/GoogleChrome/lighthouse/issues/4356#issuecomment-375489925
 */

import {Audit} from '../audit.js';
import * as i18n from '../../lib/i18n/i18n.js';

const HTTP_CLIENT_ERROR_CODE_LOW = 400;
const HTTP_SERVER_ERROR_CODE_LOW = 500;

const DIRECTIVE_SITEMAP = 'sitemap';
const DIRECTIVE_USER_AGENT = 'user-agent';
const DIRECTIVE_ALLOW = 'allow';
const DIRECTIVE_DISALLOW = 'disallow';
const DIRECTIVES_GROUP_MEMBERS = new Set([DIRECTIVE_ALLOW, DIRECTIVE_DISALLOW]);
const DIRECTIVE_SAFELIST = new Set([
  DIRECTIVE_USER_AGENT, DIRECTIVE_DISALLOW, // standard
  DIRECTIVE_ALLOW, DIRECTIVE_SITEMAP, // universally supported
  'crawl-delay', // yahoo, bing, yandex
  'clean-param', 'host', // yandex
  'request-rate', 'visit-time', 'noindex', // not officially supported, but used in the wild
]);
const SITEMAP_VALID_PROTOCOLS = new Set(['https:', 'http:', 'ftp:']);

const UIStrings = {
  /** Title of a Lighthouse audit that provides detail on the site's robots.txt file. Note: "robots.txt" is a canonical filename and should not be translated. This descriptive title is shown when the robots.txt file is present and configured correctly. */
  title: 'robots.txt is valid',
  /** Title of a Lighthouse audit that provides detail on the site's robots.txt file. Note: "robots.txt" is a canonical filename and should not be translated. This descriptive title is shown when the robots.txt file is misconfigured, which makes the page hard or impossible to scan via web crawler. */
  failureTitle: 'robots.txt is not valid',
  /** Description of a Lighthouse audit that tells the user *why* they need to have a valid robots.txt file. Note: "robots.txt" is a canonical filename and should not be translated. This is displayed after a user expands the section to see more. No character length limits. */
  description: 'If your robots.txt file is malformed, crawlers may not be able to understand ' +
    'how you want your website to be crawled or indexed. ' +
    '[Learn more about robots.txt](https://developer.chrome.com/docs/lighthouse/seo/invalid-robots-txt/).',
  /**
   * @description Label for the audit identifying that the robots.txt request has returned a specific HTTP status code. Note: "robots.txt" is a canonical filename and should not be translated.
   * @example {500} statusCode
   */
  displayValueHttpBadCode: 'Request for robots.txt returned HTTP status: {statusCode}',
  /** [ICU Syntax] Label for the audit identifying the number of errors that occurred while validating the robots.txt file. "itemCount" will be replaced by the integer count of errors encountered. */
  displayValueValidationError: `{itemCount, plural,
    =1 {1 error found}
    other {# errors found}
    }`,
  /** Explanatory message stating that there was a failure in an audit caused by Lighthouse not being able to download the robots.txt file for the site. Note: "robots.txt" is a canonical filename and should not be translated. */
  explanation: 'Lighthouse was unable to download a robots.txt file',
};

const str_ = i18n.createIcuMessageFn(import.meta.url, UIStrings);

/**
 * @param {string} directiveName
 * @param {string} directiveValue
 * @throws will throw an exception if given directive is invalid
 */
function verifyDirective(directiveName, directiveValue) {
  if (!DIRECTIVE_SAFELIST.has(directiveName)) {
    throw new Error('Unknown directive');
  }

  if (directiveName === DIRECTIVE_SITEMAP) {
    let sitemapUrl;

    try {
      sitemapUrl = new URL(directiveValue);
    } catch (e) {
      throw new Error('Invalid sitemap URL');
    }

    if (!SITEMAP_VALID_PROTOCOLS.has(sitemapUrl.protocol)) {
      throw new Error('Invalid sitemap URL protocol');
    }
  }

  if (directiveName === DIRECTIVE_USER_AGENT && !directiveValue) {
    throw new Error('No user-agent specified');
  }

  if (directiveName === DIRECTIVE_ALLOW || directiveName === DIRECTIVE_DISALLOW) {
    if (directiveValue !== '' && directiveValue[0] !== '/' && directiveValue[0] !== '*') {
      throw new Error('Pattern should either be empty, start with "/" or "*"');
    }

    const dollarIndex = directiveValue.indexOf('$');

    if (dollarIndex !== -1 && dollarIndex !== directiveValue.length - 1) {
      throw new Error('"$" should only be used at the end of the pattern');
    }
  }
}
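// Editor's illustration (not part of the Lighthouse source): a few sample
// inputs and the outcome verifyDirective produces under the rules above.
//
//   verifyDirective('sitemap', 'https://example.com/sitemap.xml'); // passes
//   verifyDirective('sitemap', 'file:///sitemap.xml');  // throws 'Invalid sitemap URL protocol'
//   verifyDirective('user-agent', '');                  // throws 'No user-agent specified'
//   verifyDirective('disallow', 'admin/');              // throws 'Pattern should either be empty, start with "/" or "*"'
//   verifyDirective('disallow', '/admin$/stats');       // throws '"$" should only be used at the end of the pattern'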
/**
 * @param {string} line single line from a robots.txt file
 * @throws will throw an exception if given line has errors
 * @return {{directive: string, value: string}|null}
 */
function parseLine(line) {
  const hashIndex = line.indexOf('#');

  if (hashIndex !== -1) {
    line = line.substr(0, hashIndex);
  }

  line = line.trim();

  if (line.length === 0) {
    return null;
  }

  const colonIndex = line.indexOf(':');

  if (colonIndex === -1) {
    throw new Error('Syntax not understood');
  }

  const directiveName = line.slice(0, colonIndex).trim().toLowerCase();
  const directiveValue = line.slice(colonIndex + 1).trim();

  verifyDirective(directiveName, directiveValue);

  return {
    directive: directiveName,
    value: directiveValue,
  };
}

/**
 * @param {string} content
 * @return {Array<{index: string, line: string, message: string}>}
 */
function validateRobots(content) {
  /**
   * @type Array<{index: string, line: string, message: string}>
   */
  const errors = [];
  let inGroup = false;

  content
    .split(/\r\n|\r|\n/)
    .forEach((line, index) => {
      let parsedLine;

      try {
        parsedLine = parseLine(line);
      } catch (e) {
        errors.push({
          index: (index + 1).toString(),
          line: line,
          message: e.message.toString(),
        });
      }

      if (!parsedLine) {
        return;
      }

      // group-member records (allow, disallow) have to be preceded by a start-of-group record (user-agent)
      // see: https://developers.google.com/search/reference/robots_txt#grouping-of-records
      if (parsedLine.directive === DIRECTIVE_USER_AGENT) {
        inGroup = true;
      } else if (!inGroup && DIRECTIVES_GROUP_MEMBERS.has(parsedLine.directive)) {
        errors.push({
          index: (index + 1).toString(),
          line: line,
          message: 'No user-agent specified',
        });
      }
    });

  return errors;
}
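// Editor's illustration (not part of the Lighthouse source): validating a
// small robots.txt with the parser above. Error indexes are 1-based line
// numbers, rendered as strings to match the audit's details table.
//
//   validateRobots([
//     'Allow: /public',   // group-member record before any user-agent
//     'User-agent: *',
//     'Disallow: admin',  // pattern does not start with "/" or "*"
//     'Sitemap: https://example.com/sitemap.xml',
//   ].join('\n'));
//   // -> [
//   //   {index: '1', line: 'Allow: /public', message: 'No user-agent specified'},
//   //   {index: '3', line: 'Disallow: admin', message: 'Pattern should either be empty, start with "/" or "*"'},
//   // ]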
class RobotsTxt extends Audit {
  /**
   * @return {LH.Audit.Meta}
   */
  static get meta() {
    return {
      id: 'robots-txt',
      title: str_(UIStrings.title),
      failureTitle: str_(UIStrings.failureTitle),
      description: str_(UIStrings.description),
      requiredArtifacts: ['RobotsTxt'],
    };
  }

  /**
   * @param {LH.Artifacts} artifacts
   * @return {LH.Audit.Product}
   */
  static audit(artifacts) {
    const {
      status,
      content,
    } = artifacts.RobotsTxt;

    if (!status) {
      return {
        score: 0,
        explanation: str_(UIStrings.explanation),
      };
    }

    if (status >= HTTP_SERVER_ERROR_CODE_LOW) {
      return {
        score: 0,
        displayValue: str_(UIStrings.displayValueHttpBadCode, {statusCode: status}),
      };
    } else if (status >= HTTP_CLIENT_ERROR_CODE_LOW || content === '') {
      return {
        score: 1,
        notApplicable: true,
      };
    }

    // If status is good, content must be not null.
    if (content === null) {
      throw new Error(`Status ${status} was valid, but content was null`);
    }

    const validationErrors = validateRobots(content);

    /** @type {LH.Audit.Details.Table['headings']} */
    const headings = [
      {key: 'index', valueType: 'text', label: 'Line #'},
      {key: 'line', valueType: 'code', label: 'Content'},
      {key: 'message', valueType: 'code', label: 'Error'},
    ];

    const details = Audit.makeTableDetails(headings, validationErrors);
    let displayValue;

    if (validationErrors.length) {
      displayValue =
        str_(UIStrings.displayValueValidationError, {itemCount: validationErrors.length});
    }

    return {
      score: Number(validationErrors.length === 0),
      details,
      displayValue,
    };
  }
}

export default RobotsTxt;
export {UIStrings};
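// Editor's illustration (not part of the Lighthouse source): how audit() maps
// the fetched {status, content} artifact to a result. The strings shown are
// shorthand; the real return values carry ICU message objects from str_().
//
//   {status: null, content: null} -> score 0, 'unable to download' explanation
//   {status: 503,  content: null} -> score 0, 'returned HTTP status: 503'
//   {status: 404,  content: null} -> score 1, notApplicable (a missing robots.txt is not penalized)
//   {status: 200,  content: ''}   -> score 1, notApplicable
//   {status: 200,  content: 'User-agent: *\nDisallow:'} -> score 1, empty error table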