@ietf-tools/idnits
Version:
Library / CLI to inspect Internet-Draft documents for a variety of conditions to conform with IETF policies.
1,000 lines (885 loc) • 40.9 kB
JavaScript
import { ValidationError } from '../helpers/error.mjs'
import { DateTime } from 'luxon'
import { FQDN_RE } from '../modules/fqdn.mjs'
import { IPV4_LOOSE_RE, IPV6_LOOSE_RE } from '../modules/ip.mjs'
import { rfcStatusHierarchy } from '../config/rfc-status-hierarchy.mjs'
// Regex patterns
// NOTE: patterns flagged `g` keep a cursor in `lastIndex`. Only patterns that are
// consumed through match()/matchAll() or an exec() loop should carry that flag.

// Splits a header line into a left and a right column separated by 2+ whitespace characters
const LINE_VALUES_EXTRACT_RE = /^(?<left>.*)\s{2,}(?<right>.*)$/
// Author name in the "X. Surname" form
const AUTHOR_NAME_RE = /^[a-z]\.\s[a-z]+$/i
// "[day] month year" date with an optional day
const DATE_RE = /^(?:(?<day>[0-9]{1,2})\s)?(?<month>[a-z]{3,})\s(?<year>[0-9]{4})$/i
// Numbered top-level heading, e.g. "1. Introduction"
const SECTION_PATTERN = /^\d+\.\s+.+$/
// Numbered second-level heading, e.g. "7.1. Normative References".
// Deliberately NOT global: it is only used with .test(), and a `g` flag would make
// the heading that follows a matched heading fail because of the leftover lastIndex.
const SUBSECTION_PATTERN = /^\d+\.\d+\.\s+(.+)$/i
// Table-of-contents entry: dot leaders followed by a page number at the end of the line
const TOC_PATTERN = /\.+\s*\d+$/
// "RFC 1234" / "RFC1234" / "[RFC1234]" (iterated with exec(), hence `g`)
const RFC_REFERENCE_RE = /\bRFC\s?(\d+)\b|\[RFC(\d+)\]/gi
// Bracketed citation tags that are neither [RFCnnnn], [draft-...] nor [I-D....]
const DRAFT_REFERENCE_RE_IN_TEXT = /(?<=^|[\s\]])\[(?!(?:RFC\d+|draft-[A-Za-z0-9-]+|I-D\.[A-Za-z0-9-]+))[A-Za-z0-9.-]+\]/gi
// "[Page N]" at the end of a line (page footer)
const PAGES_RE = /\[Page \d+\]$/gm
// Numbered variants of headings that are tracked as possible issues when numbered
const COPYRIGHT_NOTICE_RE = /^\d+\.\s*Copyright Notice$/i
const STATUS_OF_THIS_MEMO_RE = /^\d+\.\s*Status of This Memo$/i
const ABSTRACT_RE = /^\d+\.\s*Abstract$/i
// Single bracketed RFC citation
const BRACKETED_RFC_REFERENCE_RE = /\[RFC(\d+)\]/
// First line starting with "BM" or "PK"
const FIRST_LINE_RE = /^(BM|PK)/
// Two or more whitespace characters between two lowercase-ending/starting words
const SPACING_PATTERN = /[A-Za-z][a-z]\s{2,}[a-z]/
const TABLE_OF_CONTENTS = /^Table of Contents\s*$/i
// "Expires <date> [Page N]" page footer; group 1 is the date text
const EXPIRES_FOOTER_RE = /Expires\s+((\d{1,2})?\s?(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4})\s+\[Page\s+\d+\]/gi
// "<http(s)://...draft-name...>" URL; group 1 is the draft name
const DRAFT_URL_RE = /<https?:\/\/[^>\s]*\b(draft-[A-Za-z0-9-]+)\b[^>\s]*>/gi
// Author regexps
// Each pattern below recognises one spelling of the heading that introduces the
// authors' / editors' contact details.
const AUTHORS_OR_EDITORS_ADDRESSES_RE = /^(Authors?|Editors?)[\u2018\u2019\u201B'`"] Addresses$/i
const AUTHOR_INFORMATION_RE = /^[0-9a-z.]*\s*author information$/i
const AUTHOR_CONTACT_INFORMATION_RE = /^[0-9a-z.]*\s*(author|editor)(?:[\u2018\u2019\u201B'`"]s?|s)?\s+contact information$/i
const CONTACT_INFORMATION_RE = /^[0-9a-z.]*\s*contact information$/i
const AUTHOR_EDITORS_RE = /^[0-9a-z.]*\s*(author|editor)s?:?$/i
const AUTHOR_ADDRESS_OPTIONAL_PLURAL_RE = /^(Author|Authors|Editor|Editors)['’`ʼ]s?\s+Address(?:es)?$/i

// Single case-insensitive alternation of every heading variant above.
// Each alternative keeps its own ^...$ anchors.
const AUTHOR_SECTION_RE = new RegExp(
  '(' + [
    AUTHORS_OR_EDITORS_ADDRESSES_RE,
    AUTHOR_INFORMATION_RE,
    AUTHOR_CONTACT_INFORMATION_RE,
    CONTACT_INFORMATION_RE,
    AUTHOR_EDITORS_RE,
    AUTHOR_ADDRESS_OPTIONAL_PLURAL_RE
  ].map((pattern) => pattern.source).join('|') + ')',
  'i'
)
// Inline code format
// Matches a C-style comment opener/closer or a line starting with "#" (after optional spaces).
// Deliberately NOT global: the pattern is executed once per line, and a `g` flag would
// carry lastIndex over from the previous line, skipping or misplacing matches.
const INLINE_CODE_FORMAT = /\/\*|\*\/|^ *#/i
// Section matchers
// Top-level headings tracked by the parser, listed as [section name, heading pattern].
// The section name doubles as the key used for the parser's markers and content buckets.
const sectionMatchers = [
  ['introduction', /^\d+\.\s+(Introduction|Overview|Background)$/i],
  ['securityConsiderations', /^\d+\.\s+Security Considerations$/i],
  ['authorAddress', AUTHOR_SECTION_RE],
  ['references', /^(?:\d+\.\s+)?(?:(?:Normative|Informative)\s+)?References$/i],
  ['ianaConsiderations', /^\d+\.\s+IANA Considerations$/i]
].map(([name, regex]) => ({ name, regex }))
// Sub-section headings, keyed by the sub-section name they map to.
// Order matters: consumers take the first pattern that matches, so the specific
// normative / informative patterns come before the catch-all "... References" one.
const subsectionMatchers = Object.entries({
  normative_references: /^\d+\.\d+\.\s+Normative\s+References$/i,
  informative_references: /^\d+\.\d+\.\s+Informative\s+References$/i,
  unclassified_references: /^\d+\.\d+\.\s+([a-zA-Z]+\s+)*Reference(s)?$/i,
  appendix: /^Appendix\b/i
}).map(([name, regex]) => ({ name, regex }))
// Well-known section titles that start with at least one space, i.e. titles that are
// indented. Listed as [reported name, pattern]; patterns are tested on the raw line.
const sectionIndentationRules = [
  ['Abstract', /^ +(?:[0123.]+)?[ \t]*abstract$/i],
  ['Introduction', /^ +(?:[0123.]+)?[ \t]*(?:introduction|overview|scope|(?:historical )?background)$/i],
  ['Security Considerations', /^ +(?:[0-9.]+)?[ \t]*security considerations?$/i],
  ['IANA Considerations', /^ +(?:[0-9a-z.]+)?[ \t]*iana considerations?$/i],
  ["Author's Addresses", /^ +(?:[0-9A-Z.]*)?[ \t]*(?:author|editor)(?:'s|s')? addresses?$/i],
  ['Status of This Memo', /^ +(?:[0-9.]+)?[ \t]*status of (?:this )?memo/i],
  ['References', /^ +(?:[0-9.]+)?[ \t]*(?:(?:normative|informative)[ \t]+)?references[ \t]*\.?$/i],
  ['Appendix', /^ +appendix[ \t]+[a-z0-9.]+/i]
].map(([name, regex]) => ({ name, regex }))
// Boilerplate regex patterns
// Full-sentence patterns for the accepted wordings of the requirement-keywords
// boilerplate. parse() applies them to the whitespace-normalized document text.
// NOTE: every pattern is global, so .test()/.exec() advance `lastIndex`; callers that
// reuse a pattern must reset it (match()/matchAll() do not have that problem).
const BOILERPLATE_PATTERNS = {
// RFC 2119 wording citing "RFC 2119" (optionally preceded by "BCP 14,"); "NOT RECOMMENDED" is optional
rfc2119: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", ?("NOT RECOMMENDED", )?"MAY", and "OPTIONAL" in this document are to be interpreted as described in( BCP 14,)? RFC ?2119[.,;]/ig,
// Same wording, citing the RFC by its title followed by [RFC2119]
rfc2119_alt: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", ?("NOT RECOMMENDED", )?"MAY", and "OPTIONAL" in this document are to be interpreted as described in "Key words for use in RFCs to Indicate Requirement Levels" \[RFC2119\]/ig,
// "[BCP14] (RFC2119) (RFC8174)" citation form, tolerant of arbitrary whitespace between keywords
rfc2119_alt1: /The key words\s+"MUST",\s+"MUST NOT",\s+"REQUIRED",\s+"SHALL",\s+"SHALL NOT",\s+"SHOULD",\s+"SHOULD NOT",\s+"RECOMMENDED",\s+"NOT RECOMMENDED",\s+"MAY",\s+and\s+"OPTIONAL"\s+in this document are to be interpreted as described in\s+\[BCP14\]\s+\(RFC2119\)\s+\(RFC8174\)\s+when, and only when, they appear in all capitals, as shown here\./ig,
// "BCP 14 [RFC2119][RFC8174]" citation form (no space between the two bracketed tags)
rfc2119_alt2: /The key words\s+"MUST",\s+"MUST NOT",\s+"REQUIRED",\s+"SHALL",\s+"SHALL\s+NOT",\s+"SHOULD",\s+"SHOULD NOT",\s+"RECOMMENDED",\s+"NOT RECOMMENDED",\s+"MAY",\s+and\s+"OPTIONAL"\s+in this document are to be interpreted as described in\s+BCP\s*14\s*\[RFC2119\]\[RFC8174\]\s+when, and only when, they appear in all\s+capitals, as shown here[.;]?/ig,
// "BCP 14 [RFC2119] [RFC8174]" citation form (single space between the tags)
rfc8174: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in BCP 14 \[RFC2119\] \[RFC8174\]/ig,
// "[BCP14] (RFC2119) (RFC8174)" citation form with single-space keyword separators
bcp14: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in\s*\[BCP14\]\s*\(RFC2119\)\s*\(RFC8174\)\s*when, and only when, they appear in all capitals, as shown here\./ig
}
// Similar boilerplate regex pattern
// Fragments of each accepted boilerplate wording, in document order. They are used to
// spot text that resembles (but does not exactly equal) one of the full boilerplates.
// - No fragment carries the `g` flag: they are only used with .test(), and a global
//   pattern would keep its lastIndex between calls and make results call-order dependent.
// - Brackets, parentheses and the final dot are escaped so the fragments match the
//   literal citation text ("[BCP14] (RFC2119) (RFC8174)") instead of being interpreted
//   as character classes / capture groups.
const BOILERPLATE_PARTS = {
  rfc2119: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in/,
    /RFC ?2119[.,;]?/i
  ],
  rfc2119_alt: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in/,
    /"Key words for use in RFCs to Indicate Requirement Levels" \[RFC2119\]/i
  ],
  rfc2119_alt1: [
    /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",/,
    /"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and/,
    /"OPTIONAL" in this document are to be interpreted as described in/,
    /\[BCP14\] \(RFC2119\) \(RFC8174\) when, and only when, they appear in all/,
    /capitals, as shown here\./i
  ],
  rfc2119_alt2: [
    /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",/,
    /"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and/,
    /"OPTIONAL" in this document are to be interpreted as described in BCP/,
    /14 \[RFC2119\]\[RFC8174\] when, and only when, they appear in all/,
    /capitals, as shown here\./i
  ],
  rfc8174: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in BCP 14/,
    /\[RFC2119\] \[RFC8174\]/i
  ],
  bcp14: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in\s*\[BCP14\]/,
    /\(RFC2119\)\s*\(RFC8174\)/i,
    /when, and only when, they appear in all capitals, as shown here\./
  ]
}
// License declaration
// Patterns for the copyright / licence / Internet-Draft status boilerplate paragraphs.
// NOTE: every pattern is global. Those consumed with matchAll() need the flag; for the
// ones consumed with .test(), `lastIndex` persists between calls and must be reset.
const licenseDeclarations = {
// Sentence requiring Revised BSD License text on extracted code components
revised_bsd_license: /Code Components extracted from this document must include Revised BSD License text as described in Section 4\.e of the Trust Legal Provisions and are provided without warranty as described in the Revised BSD License\./gi,
// Short copyright line naming either the IETF Trust or The Internet Society
previous_tlp4_6_b_i: /Copyright\s+\(c\)\s+\d{4}\s+IETF Trust|Copyright\s+\(C\)\s+\d{4}\s+The Internet Society/gi,
// "submitted in full conformance with the provisions of BCP 78 and BCP 79" statement
trust_28_dec_2009_section_6_a: /This\s+Internet-Draft\s+is\s+submitted\s+in\s+full\s+conformance\s+with\s+the\s+provisions\s+of\s+BCP\s+78\s+and\s+BCP\s+79/gi,
// Full copyright line; capture group 1 is the four-digit year
tlp5_6_b_i_copyright: /Copyright\s+\(c\)\s+(\d{4})\s+IETF Trust\s+and\s+the\s+persons\s+identified\s+as\s+the\s+document\s+authors\.?\s+All\s+rights\s+reserved\.?/gi,
// Licence paragraph including the Revised BSD License sentence
license6_b_i: /This document is subject to BCP 78 and the IETF Trust[’']s Legal Provisions\s+Relating to IETF Documents\s+\(https?:\/\/trustee\.ietf\.org\/license-info\) in effect on the\s+date of publication of this document\. Please review these documents carefully, as\s+they describe your rights and restrictions with respect to this document\. Code\s+Components extracted from this document must include Revised BSD License\s+text as described in Section 4\.e of the Trust Legal Provisions and are provided\s+without warranty as described in the Revised BSD License\./gi,
// Same licence paragraph without the Revised BSD License sentence (also matches the start of the one above)
license6_b_ii: /This document is subject to BCP 78 and the IETF Trust[’']s Legal Provisions\s+Relating to IETF Documents\s+\(https?:\/\/trustee\.ietf\.org\/license-info\) in effect on the\s+date of publication of this document\. Please review these documents carefully, as\s+they describe your rights and restrictions with respect to this document\./gi,
// Restriction: no modification / derivative works except RFC formatting or translation
license6_c_i: /This document may not be modified, and derivative works of it may not be\s+created, except to format it for publication as an RFC or to translate it into\s+languages other than English\./gi,
// Restriction: no modification / derivative works, publication only as an Internet-Draft
license6_c_ii: /This document may not be modified, and derivative works of it may not be\s+created, and it may not be published except as an Internet-Draft\./gi,
// Sentence noting that Internet-Drafts are IETF working documents
acceptable_paragraph_noting_that_draft: /Internet-Drafts are working documents of the Internet Engineering Task Force \(IETF\)\./gi,
// Paragraph stating the six-month validity of Internet-Drafts
draft_paragraph_out6_month_validity: /Internet-Drafts are draft documents valid for a maximum of six months and may be updated, replaced, or obsoleted by other documents at any time\. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress."\s*/gi,
// Pointer to the list of current Internet-Drafts (datatracker URL or the older 1id-abstracts URL)
draft_paragraph_pointing_to_the_list_of_current_ids: /The list of current Internet-Drafts is at https:\/\/datatracker\.ietf\.org\/drafts\/current\/|The list of current Internet-Drafts can be accessed at http:\/\/www\.ietf\.org\/ietf\/1id-abstracts\.txt\./gi
}
// Keywords regex pattern
// Upper-case requirement keyword with an optional "NOT" before or after it.
// NOTE(review): there are no word boundaries, so the keyword is also matched inside
// longer upper-case words — confirm this is intended.
const KEYWORDS_PATTERN = /((NOT)\s)?(MUST|REQUIRED|SHALL|SHOULD|RECOMMENDED|OPTIONAL|MAY)(\s(NOT))?/g
// Invalid combinations regex pattern
// Mixed-case or otherwise listed-as-invalid negations; a whitespace character is required on both sides
const INVALID_COMBINATIONS_PATTERN = /\s(MUST not|SHALL not|SHOULD not|not RECOMMENDED|MAY NOT|NOT REQUIRED|NOT OPTIONAL)\s/g
// Obsolete and updates regex patterns
// "Obsoletes:" / "Replaces:" / "Updates:" followed by a list of numbers, each optionally
// prefixed with "RFC" and separated by commas, whitespace or "and"
const OBSOLETES_RE = /(?:obsoletes|replaces)\s*:\s*((?:rfc\s*)?[0-9]+(?:,|\s|and)*\s*)+/gi
const UPDATES_RE = /updates\s*:\s*((?:rfc\s*)?[0-9]+(?:,|\s|and)*\s*)+/gi
// Consts values
// A page holding more than this many lines is reported as too long
const MAX_PAGE_LENGTH = 58
// Number of lines after the title line in which the draft slug is still looked for
const TITLE_SECTION_LOOKAHEAD = 3
// While no title has been found, the draft slug is only looked for before this line number
const HEADER_MAX_LINES = 18
/**
* @typedef {Object} TXTDocObject
* @property {Object} data Parsed TXT tree
* @property {string} docKind Whether the document is an Internet Draft (draft) or an RFC (rfc)
* @property {string} filename Filename of the document
* @property {string} type Document file type (txt)
* @property {number} version Document version number (2 or 3)
* @property {string} versionCertainty Whether the version was explicitly specified (strict) or guessed (guess)
*/
/**
 * Parse a plain-text Internet-Draft / RFC document.
 *
 * The text is analysed in two passes:
 * 1. document-wide checks (boilerplate, license / copyright statements, obsoletes /
 *    updates lists, page footers) run on a whitespace-normalized copy of the text;
 * 2. a line-by-line scan of the raw text collects header fields, title, slug, section
 *    boundaries and content, references, keywords, addresses and formatting issues.
 *
 * @param {string} rawText Input text
 * @param {string} filename Filename of the document
 * @returns {Promise<Object>} Resolves to `{ docKind, body, data, filename, type: 'txt' }`
 * where `body` is the untouched input and `data` holds everything that was extracted
 * @throws {ValidationError} `TXT_PARSING_FAILED` when the analysis fails; the message
 * contains the number of the line being processed
 */
export async function parse (rawText, filename) {
  const normalizedText = normalizeText(rawText)
  const pages = rawText?.split(/\f/)

  const data = {
    pageCount: 1,
    header: {
      authors: [],
      date: null,
      source: null,
      expires: null
    },
    content: {
      abstract: [],
      introduction: [],
      securityConsiderations: [],
      authorAddress: [],
      references: [],
      ianaConsiderations: []
    },
    contains: {
      copyrightSection6_b_i: false,
      copyrightLicenseValid: false,
      license6_c_i: false,
      license6_c_ii: false,
      revisedBsdLicense: false,
      codeBlocks: false,
      draftParagraphOutSixMonthValidity: false,
      acceptableParagraphNotingThatDraft: false,
      idIndication: false,
      revisedBsdLicense6_i: false,
      submissionCompliance: false,
      pagesFound: 0,
      previous6_b_i_copyright: false
    },
    title: null,
    slug: null,
    possibleIssues: {
      linesWithSpaces: [],
      unexpectedIndentation: [],
      inlineCode: [],
      misspeled2119Keywords: [],
      pageLineWithFormFeed: [],
      paragraphPointingToTheListOfCurrentId: [],
      copyrightLines6_i: [],
      isCopyrightNoticeNumbered: false,
      isStatusOfThisMemoNumbered: false,
      isAbstractNumbered: false,
      isPKorBM: false,
      updatesRfcWithLetter: [],
      obsoletesWithLetter: [],
      missingPageNumbering: [],
      submissionCompliancePage: null,
      isTableOfContentsExists: false,
      tooLongPages: []
    },
    extractedElements: {
      fqdnDomains: [],
      ipv4: [],
      ipv6: [],
      keywords2119: [],
      boilerplate2119Keywords: [],
      obsoletesRfc: [],
      updatesRfc: [],
      nonReferenceSectionRfc: [],
      referenceSectionRfc: [],
      nonReferenceSectionDraftReferences: [],
      referenceSectionDraftReferences: [],
      draftStatusReferences: [],
      copyrightDates: [],
      license6_b_ii: [],
      license6_b_i: [],
      bracketedRfcNonReferences: [],
      bracketedRfcReferences: [],
      lastPageExpiration: null
    },
    boilerplate: {
      rfc2119: [
        BOILERPLATE_PATTERNS.rfc2119,
        BOILERPLATE_PATTERNS.rfc2119_alt,
        BOILERPLATE_PATTERNS.rfc2119_alt1,
        BOILERPLATE_PATTERNS.rfc2119_alt2
      ].some((pattern) => testFromStart(pattern, normalizedText)),
      rfc8174: testFromStart(BOILERPLATE_PATTERNS.rfc8174, normalizedText),
      bcp14: testFromStart(BOILERPLATE_PATTERNS.bcp14, normalizedText),
      similar2119boilerplate: false
    },
    references: {
      rfc2119: false,
      rfc8174: false,
      bcp14: false
    }
  }

  let docKind = null
  let lineIdx = 0
  let currentSection = null
  let currentSubSection = null
  let inCodeBlock = false
  let rfcMatch = null
  let draftMatch = null
  let prevLine = null
  let currentPageLineCount = 0

  try {
    const markers = {
      header: { start: 0, end: 0, lastAuthor: 0, closed: false },
      title: 0,
      slug: 0,
      abstract: { start: 0, end: 0, closed: false },
      toc: { start: 0, end: 0, closed: false },
      introduction: { start: 0, end: 0, closed: false },
      securityConsiderations: { start: 0, end: 0, closed: false },
      authorAddress: { start: 0, end: 0, closed: false },
      references: { start: 0, end: 0, closed: false },
      ianaConsiderations: { start: 0, end: 0, closed: false }
    }

    // Extracting mentioned rfc2119 keywords
    for (const pattern of Object.values(BOILERPLATE_PATTERNS)) {
      const match = normalizedText.match(pattern)
      if (match) {
        const keywordMatches = match[0].matchAll(KEYWORDS_PATTERN)
        for (const keywordMatch of keywordMatches) {
          const keyword = keywordMatch[0]
          if (!data.extractedElements.boilerplate2119Keywords.includes(keyword)) {
            data.extractedElements.boilerplate2119Keywords.push(keyword)
          }
        }
      }
    }

    // Searching acceptable paragraph pointing to the list of current ids
    const matchParagraphPointingToTheListOfCurrentId = [...normalizedText.matchAll(licenseDeclarations.draft_paragraph_pointing_to_the_list_of_current_ids)]
    if (matchParagraphPointingToTheListOfCurrentId.length) {
      data.possibleIssues.paragraphPointingToTheListOfCurrentId = matchParagraphPointingToTheListOfCurrentId.map((item) => item[0])
    }

    // Text that looks like a boilerplate without being an exact match of any accepted one
    data.boilerplate.similar2119boilerplate =
      hasBoilerplateMatch(normalizedText, BOILERPLATE_PARTS.rfc2119, BOILERPLATE_PARTS.rfc2119_alt, BOILERPLATE_PARTS.rfc2119_alt1, BOILERPLATE_PARTS.rfc2119_alt2, BOILERPLATE_PARTS.rfc8174, BOILERPLATE_PARTS.bcp14) &&
      !(data.boilerplate.rfc2119 || data.boilerplate.rfc8174 || data.boilerplate.bcp14)

    // Extracting obsolete and updated rfc from header
    const obsoletesRfc = extractRfcNumbers(normalizedText, OBSOLETES_RE)
    data.extractedElements.obsoletesRfc.push(...obsoletesRfc.plainNumbers)
    data.possibleIssues.obsoletesWithLetter.push(...obsoletesRfc.rfcWithPrefix)
    const updatesRfcs = extractRfcNumbers(normalizedText, UPDATES_RE)
    data.extractedElements.updatesRfc.push(...updatesRfcs.plainNumbers)
    data.possibleIssues.updatesRfcWithLetter.push(...updatesRfcs.rfcWithPrefix)

    // Searching for license declaration
    // (the license patterns are global and shared between calls, so every boolean
    // check goes through testFromStart() to be independent of any leftover lastIndex)
    data.contains.revisedBsdLicense = testFromStart(licenseDeclarations.revised_bsd_license, normalizedText)

    // Page separator counting
    const pagesMatches = rawText.match(PAGES_RE)
    data.contains.pagesFound = pagesMatches?.length ?? 0

    // Searching for submission compliance line and pages
    for (let i = 0; i < pages.length; i++) {
      const normalizedPage = pages[i].replace(/\s+/g, ' ').trim()
      if (testFromStart(licenseDeclarations.trust_28_dec_2009_section_6_a, normalizedPage)) {
        data.contains.submissionCompliance = true
        data.possibleIssues.submissionCompliancePage = i + 1
        break
      }
    }

    // Searching acceptable paragraph noting that IDs are working documents
    data.contains.acceptableParagraphNotingThatDraft = testFromStart(licenseDeclarations.acceptable_paragraph_noting_that_draft, normalizedText)

    // Extracting expiration date from the last page
    const lastPageExpiration = [...normalizedText.matchAll(EXPIRES_FOOTER_RE)].pop()
    if (lastPageExpiration) {
      const expiresStr = lastPageExpiration[1]
      // DateTime.fromFormat() returns an (invalid) DateTime rather than a falsy value,
      // so alternatives have to be selected through isValid. The footer pattern allows
      // the day to be omitted; such dates resolve to the first day of the month, like
      // the "Expires" header does.
      const parsed = ['d LLLL yyyy', 'LLLL d, yyyy', 'LLLL yyyy']
        .map((format) => DateTime.fromFormat(expiresStr, format))
        .find((candidate) => candidate.isValid)
      if (parsed) {
        data.extractedElements.lastPageExpiration = parsed
      }
    }

    // Searching for copyright line in normalized text
    const copyrightMatches = [...normalizedText.matchAll(licenseDeclarations.tlp5_6_b_i_copyright)]
    if (copyrightMatches.length > 0) {
      data.possibleIssues.copyrightLines6_i = copyrightMatches.map(m => m[0])
      data.contains.copyrightSection6_b_i = true
      data.extractedElements.copyrightDates = copyrightMatches.map(m => Number(m[1]) || Number(m[2]))
    }

    // Search for 6.b license declaration
    const match6bi = [...normalizedText.matchAll(licenseDeclarations.license6_b_i)]
    // Search for 6.b.ii license declaration
    const match6bii = [...normalizedText.matchAll(licenseDeclarations.license6_b_ii)]
    if (match6bii.length > 0) {
      data.extractedElements.license6_b_ii = match6bii.map(m => m[0])
    }
    if (match6bi.length > 0) {
      data.contains.revisedBsdLicense6_i = true
      data.extractedElements.license6_b_i = match6bi.map(m => m[0])
    }

    // Search for 6.c licenses declaration
    data.contains.license6_c_i = testFromStart(licenseDeclarations.license6_c_i, normalizedText)
    data.contains.license6_c_ii = testFromStart(licenseDeclarations.license6_c_ii, normalizedText)

    // Searching for copyright line
    data.contains.previous6_b_i_copyright = testFromStart(licenseDeclarations.previous_tlp4_6_b_i, normalizedText)

    // Searching acceptable paragraph calling out 6 month validity
    data.contains.draftParagraphOutSixMonthValidity = testFromStart(licenseDeclarations.draft_paragraph_out6_month_validity, normalizedText)

    for (const line of rawText.split('\n')) {
      const trimmedLine = line.trim()
      lineIdx++

      // Pages not numbered
      if (prevLine && !prevLine.includes('[Page') && line.includes('\f')) {
        data.possibleIssues.missingPageNumbering.push({ page: data.pageCount, lines: lineIdx })
      }
      prevLine = line

      // Page Break
      // --------------------------------------------------------------
      if (line.includes('\f')) {
        data.pageCount++
        if (line.includes('[Page')) {
          data.possibleIssues.pageLineWithFormFeed.push({ page: data.pageCount - 1, lines: lineIdx })
        }
        if (currentPageLineCount > MAX_PAGE_LENGTH) {
          data.possibleIssues.tooLongPages.push({ page: data.pageCount - 1, lines: currentPageLineCount })
        }
        currentPageLineCount = 0
        continue
      } else {
        currentPageLineCount++
      }

      // Empty line
      // --------------------------------------------------------------
      if (!trimmedLine) {
        continue
      }

      // Check line spaces
      // NOTE(review): startsWith() takes a literal string, so 'Internet.Draft' does not
      // match "Internet-Draft" — confirm whether a wildcard was intended here.
      if (SPACING_PATTERN.test(trimmedLine) && !trimmedLine.startsWith('Internet.Draft') && !trimmedLine.startsWith('INTERNET.DRAFT')) {
        data.possibleIssues.linesWithSpaces.push({ line: lineIdx, pos: line.length })
      }

      // Code block detection
      if (/<CODE BEGINS>/i.test(trimmedLine)) {
        data.contains.codeBlocks = true
        inCodeBlock = true
      }
      if (/<CODE ENDS>/i.test(trimmedLine)) {
        inCodeBlock = false
      }

      // Check for inline code format outside code blocks
      if (!inCodeBlock) {
        // Always search from the start of the line, whatever the previous line left behind
        INLINE_CODE_FORMAT.lastIndex = 0
        const inlineCodeMatch = INLINE_CODE_FORMAT.exec(line)
        if (inlineCodeMatch) {
          data.possibleIssues.inlineCode.push({
            line: lineIdx,
            pos: inlineCodeMatch.index + 1
          })
        }
      }

      // Search for bad section title indentations (skipped inside the table of contents)
      if (currentSection !== 'toc') {
        sectionIndentationRules.forEach(({ name, regex }) => {
          if (regex.test(line)) {
            data.possibleIssues.unexpectedIndentation.push({ name, line: lineIdx, pos: 0 })
          }
        })
      }

      // Extract rfc references from whole text except the reference section
      RFC_REFERENCE_RE.lastIndex = 0
      while ((rfcMatch = RFC_REFERENCE_RE.exec(trimmedLine)) !== null) {
        const rfcNumber = rfcMatch[1] || rfcMatch[2]
        if (currentSection !== 'references' || currentSubSection === 'appendix') {
          if (rfcNumber && !data.extractedElements.nonReferenceSectionRfc.includes(rfcNumber)) {
            data.extractedElements.nonReferenceSectionRfc.push(rfcNumber)
          }
        } else if (BRACKETED_RFC_REFERENCE_RE.exec(trimmedLine)) {
          if (rfcNumber && !data.extractedElements.referenceSectionRfc.find((el) => el.value === rfcNumber)) {
            data.extractedElements.referenceSectionRfc.push({
              value: rfcNumber,
              subsection: currentSubSection
            })
          }
        }

        // Detect bracketed RFC references
        if (rfcMatch[0]) {
          if (BRACKETED_RFC_REFERENCE_RE.test(rfcMatch[0])) {
            if (currentSection === 'references') {
              data.extractedElements.bracketedRfcReferences.push(rfcMatch[0])
            } else {
              data.extractedElements.bracketedRfcNonReferences.push(rfcMatch[0])
            }
          }
        }
      }

      // Detect draft references
      DRAFT_REFERENCE_RE_IN_TEXT.lastIndex = 0
      while ((draftMatch = DRAFT_REFERENCE_RE_IN_TEXT.exec(trimmedLine)) !== null) {
        const draftName = draftMatch[0]
        if (currentSection !== 'references' || currentSubSection === 'appendix') {
          if (!data.extractedElements.nonReferenceSectionDraftReferences.includes(draftName)) {
            data.extractedElements.nonReferenceSectionDraftReferences.push(draftName)
          }
        } else {
          if (!data.extractedElements.referenceSectionDraftReferences.find((el) => el.value === draftName)) {
            data.extractedElements.referenceSectionDraftReferences.push({
              value: draftName,
              subsection: currentSubSection
            })
          }
        }
      }

      if (STATUS_OF_THIS_MEMO_RE.test(trimmedLine)) {
        data.possibleIssues.isStatusOfThisMemoNumbered = true
      }
      if (FIRST_LINE_RE.test(trimmedLine) && lineIdx === 1) {
        data.possibleIssues.isPKorBM = true
      }
      if (TABLE_OF_CONTENTS.test(trimmedLine)) {
        data.possibleIssues.isTableOfContentsExists = true
      }

      // Check for references
      if (/\[RFC2119\]/i.test(trimmedLine)) {
        data.references.rfc2119 = true
      }
      if (/\[RFC8174\]/i.test(trimmedLine)) {
        data.references.rfc8174 = true
      }
      if (/\[BCP14\]/i.test(trimmedLine)) {
        data.references.bcp14 = true
      }

      // Check for keywords
      for (const keywordMatch of trimmedLine.matchAll(KEYWORDS_PATTERN)) {
        data.extractedElements.keywords2119.push({ keyword: keywordMatch[0], line: lineIdx })
      }

      // Check for invalid keyword combinations
      for (const invalidMatch of line.matchAll(INVALID_COMBINATIONS_PATTERN)) {
        data.possibleIssues.misspeled2119Keywords.push({ invalidKeyword: invalidMatch[0], line: lineIdx, pos: invalidMatch.index + 1 })
      }

      // FQDN Domain extraction
      for (const domainMatch of trimmedLine.matchAll(FQDN_RE)) {
        // Compare in the same (lower) case the values are stored in, otherwise a domain
        // written with upper-case letters is added again on every occurrence
        const domain = domainMatch.groups.domain.toLowerCase()
        if (!data.extractedElements.fqdnDomains.includes(domain)) {
          data.extractedElements.fqdnDomains.push(domain)
        }
      }

      // IPv4 and IPv6 extraction
      for (const ipv4Match of trimmedLine.matchAll(IPV4_LOOSE_RE)) {
        data.extractedElements.ipv4.push(ipv4Match[0])
      }
      for (const ipv6Match of trimmedLine.matchAll(IPV6_LOOSE_RE)) {
        data.extractedElements.ipv6.push(ipv6Match[0])
      }

      // Header
      // --------------------------------------------------------------
      if (!markers.header.start) {
        // -> First Line
        markers.header.start = lineIdx
        markers.header.end = lineIdx
        const values = LINE_VALUES_EXTRACT_RE.exec(trimmedLine)
        // --> Source
        data.header.source = values?.groups.left
        // --> Author
        data.header.authors.push({
          name: values?.groups.right
        })
        markers.header.lastAuthor = lineIdx
        continue
      } else if (!markers.header.closed) {
        if (lineIdx > markers.header.end + 1) {
          // First non-empty line after a gap: the header is over, this line is the title
          markers.header.closed = true
          markers.title = lineIdx
          data.title = trimmedLine
        } else {
          markers.header.end = lineIdx
          const extractedValues = LINE_VALUES_EXTRACT_RE.exec(line)
          const values = extractedValues ? extractedValues.groups : { left: trimmedLine, right: null }

          if (values.left) {
            // --> Date
            const leftDate = values.left.match(DATE_RE)
            if (leftDate) {
              const day = parseInt(leftDate[1], 10)
              const month = leftDate[2]
              const year = parseInt(leftDate[3], 10)
              data.header.date = { day, month, year }
            }

            // --> Document Kind
            if (values.left.includes('Internet-Draft')) {
              docKind = 'draft'
              data.contains.idIndication = true
            } else if (values.left.startsWith('Request for Comments')) {
              data.header.rfcNumber = values.left.split(':')?.[1]?.trim()
              docKind = 'rfc'
            } else if (filename?.startsWith('rfc')) {
              const rfcFilenameMatch = filename.match(/rfc(\d+)\.txt$/i)
              if (rfcFilenameMatch) {
                data.header.rfcNumber = rfcFilenameMatch[1]
                docKind = 'rfc'
              }
            } else if (!docKind) {
              // Default only while nothing has been detected yet: later header lines
              // must not turn a document already identified as an RFC back into a draft
              docKind = 'draft'
            }

            // --> Intended status
            if (values.left.startsWith('Intended')) {
              const rawIntendedStatus = values.left.split(':')?.[1]?.trim()
              const cleanIntendedStatus = extractStatusName(rawIntendedStatus)
              data.header.intendedStatus = cleanIntendedStatus || rawIntendedStatus
            }

            // --> Obsoletes
            if (values.left.startsWith('Obsoletes')) {
              const obsoletesValues = values.left.split(':')?.[1]?.trim()
              // A line without a colon has no value list; skip it instead of failing the whole parse
              if (obsoletesValues !== undefined) {
                data.header.obsoletes = obsoletesValues.split(',').map(o => o.trim())
              }
            }

            // --> Category
            if (values.left.startsWith('Category')) {
              const rawCategory = values.left.split(':')?.[1]?.trim()
              const cleanCategory = extractStatusName(rawCategory)
              data.header.category = cleanCategory || rawCategory
            }

            // --> ISSN
            if (values.left.startsWith('ISSN')) {
              data.header.issn = values.left.split(':')?.[1]?.trim()
            }

            // --> Expires
            if (values.left.startsWith('Expires')) {
              const datePart = values.left.split(':')[1]?.trim().split(/\s{2,}/)[0]
              // Accepts "Month D, YYYY", "D Month YYYY" (day optional) and ISO "YYYY-MM-DD"
              const EXPIRES_DATE_RE = /(?:(?<month>[A-Za-z]{3,9})[\s]+(?<day>\d{1,2}),?\s*(?<year>\d{4}))|(?:(?<dayAlt>\d{1,2})[\s\-/]*(?<monthAlt>[A-Za-z]{3,9})[\s\-/,]*(?<yearAlt>\d{4}))|(?<iso>\d{4}-\d{2}-\d{2})/
              const expiresValue = EXPIRES_DATE_RE.exec(datePart)
              if (expiresValue) {
                if (expiresValue.groups.iso) {
                  data.header.expires = DateTime.fromISO(expiresValue.groups.iso)
                } else {
                  const day = expiresValue.groups.day || expiresValue.groups.dayAlt || 1
                  const month = expiresValue.groups.month || expiresValue.groups.monthAlt
                  const year = expiresValue.groups.year || expiresValue.groups.yearAlt
                  data.header.expires = DateTime.fromFormat(
                    `${day} ${month} ${year}`,
                    'd LLLL yyyy'
                  )
                }
              }
            }
          }

          if (values.right) {
            // --> Date
            const dateValue = DATE_RE.exec(values.right)
            if (dateValue) {
              const day = parseInt(dateValue[1], 10)
              const month = dateValue[2]
              const year = parseInt(dateValue[3], 10)
              data.header.date = { day, month, year }
            }

            // Right-column values are author names / organizations until the date shows up
            if (!data.header.date) {
              // --> Author
              const authorNameValue = AUTHOR_NAME_RE.exec(values.right)
              if (authorNameValue) {
                // --> Blank line = Previous author(s) have no affiliation
                if (lineIdx > markers.header.lastAuthor + 1) {
                  // Walk backwards, giving an empty org to every author up to the last one that has an org
                  data.header.authors.findLast(el => {
                    if (el.org || el.org === '') {
                      return true
                    } else {
                      el.org = ''
                      return false
                    }
                  })
                }
                // --> Author Name
                data.header.authors.push({
                  name: authorNameValue[0]
                })
              } else if (values.right) {
                // --> Author Org
                // Walk backwards, assigning the org to every author that does not have one yet
                data.header.authors.findLast(el => {
                  if (el.org || el.org === '') {
                    return true
                  } else {
                    el.org = values.right
                    return false
                  }
                })
              }
              markers.header.lastAuthor = lineIdx
            }
          }
        }
      }

      // Slug (draft name): looked for right after the title or, without title, near the top
      if ((data.title && lineIdx <= markers.title + TITLE_SECTION_LOOKAHEAD) || (!data.title && lineIdx < HEADER_MAX_LINES)) {
        if (trimmedLine.startsWith('draft-')) {
          markers.slug = lineIdx
          data.slug = trimmedLine
          continue
        }
      }

      if (COPYRIGHT_NOTICE_RE.test(trimmedLine)) {
        data.possibleIssues.isCopyrightNoticeNumbered = true
      }

      // Abstract
      // --------------------------------------------------------------
      if (trimmedLine === 'Abstract' || ABSTRACT_RE.test(trimmedLine)) {
        markers.abstract.start = lineIdx
        currentSection = 'abstract'
        data.content.abstract = []
        if (ABSTRACT_RE.test(trimmedLine)) {
          data.possibleIssues.isAbstractNumbered = true
        }
      } else if (markers.abstract.start && !markers.abstract.closed) {
        if (trimmedLine.startsWith('Status of') || !line.startsWith(' ')) {
          markers.abstract.end = lineIdx - 1
          markers.abstract.closed = true
        }
      }

      // Header lines were fully handled above and never take part in section detection
      if (!markers.header.closed) {
        continue
      }

      // Table of Contents
      // --------------------------------------------------------------
      if (!markers.toc.start && line === 'Table of Contents') {
        markers.toc.start = lineIdx
        currentSection = 'toc'
        continue
      } else if (currentSection === 'toc' && !markers.toc.closed) {
        if (!TOC_PATTERN.test(line)) {
          markers.toc.closed = true
          markers.toc.end = lineIdx - 1
          currentSection = null
        } else {
          continue
        }
      }

      // Other Sections
      // --------------------------------------------------------------
      // Section detection and content assignment
      const matchedSection = sectionMatchers.find(({ regex }) => regex.test(trimmedLine))
      if ((matchedSection || AUTHOR_SECTION_RE.test(trimmedLine)) && !ABSTRACT_RE.test(trimmedLine)) {
        if (currentSection && !markers[currentSection].closed) {
          markers[currentSection].end = lineIdx - 1
          markers[currentSection].closed = true
        }
        if (matchedSection) {
          currentSection = matchedSection.name
          markers[currentSection].start = lineIdx
          markers[currentSection].closed = false
        } else {
          currentSection = null
        }
      } else if (SECTION_PATTERN.test(trimmedLine) && !TOC_PATTERN.test(trimmedLine)) {
        // Any other numbered heading just ends the section being collected
        if (currentSection && !markers[currentSection].closed) {
          markers[currentSection].end = lineIdx - 1
          markers[currentSection].closed = true
        }
        currentSection = null
      }

      // Sub section detection
      const matchedSubsection = subsectionMatchers.find(({ regex }) => regex.test(trimmedLine))
      if (testFromStart(SUBSECTION_PATTERN, trimmedLine) && !TOC_PATTERN.test(trimmedLine)) {
        // A numbered sub-heading always replaces the current sub-section (unknown ones reset it)
        currentSubSection = matchedSubsection ? matchedSubsection.name : null
      } else if (!TOC_PATTERN.test(trimmedLine)) {
        // Un-numbered headings known to the matchers (e.g. "Appendix ...")
        if (matchedSubsection) {
          currentSubSection = matchedSubsection.name
        }
      }

      // Add content to the current section
      if (currentSection && markers[currentSection].start && !markers[currentSection].closed) {
        data.content[currentSection].push(trimmedLine)
      }
    }

    // Close the last section
    if (currentSection && !markers[currentSection].closed) {
      markers[currentSection].end = lineIdx
      markers[currentSection].closed = true
    }

    data.markers = markers

    // Extracting draft references from reference section
    const refLines = data.content.references
    const draftsBySub = extractDraftsBySubsection(refLines)
    data.extractedElements.draftStatusReferences.push(...draftsBySub)
  } catch (err) {
    throw new ValidationError('TXT_PARSING_FAILED', `Error while parsing Line ${lineIdx}: ${err.message}`)
  }

  return {
    docKind,
    body: rawText,
    data,
    filename,
    type: 'txt'
  }
}

/**
 * Test a (possibly global or sticky) pattern against a text from its very beginning.
 *
 * Module-level patterns are shared by every parse() call; with the `g` flag, .test()
 * resumes at `lastIndex`, which would make the outcome depend on earlier calls. The
 * cursor is reset before the test and again afterwards so no state is left behind.
 *
 * @param {RegExp} pattern Pattern to test
 * @param {string} text Text to search
 * @returns {boolean} Whether the pattern matches anywhere in the text
 */
function testFromStart (pattern, text) {
  pattern.lastIndex = 0
  const found = pattern.test(text)
  pattern.lastIndex = 0
  return found
}
/**
 * Function to check if the text resembles at least one of the given boilerplates.
 *
 * Each group lists the fragments of one boilerplate wording in document order. A group
 * counts as a hit as soon as its leading fragment is found in the text; the remaining
 * fragments do not influence the result.
 *
 * Fragments may carry the `g` flag, in which case .test() would resume from the
 * `lastIndex` left by a previous call. The cursor is therefore reset around the test so
 * the answer only depends on the arguments.
 *
 * @param {string} text Normalized text
 * @param {...Array<RegExp>} regexGroups Arrays of patterns to check
 * @returns {boolean} Whether at least one match is found
 */
function hasBoilerplateMatch (text, ...regexGroups) {
  return regexGroups.some((group) => {
    const [leadingPart] = group
    if (!leadingPart) {
      return false
    }
    leadingPart.lastIndex = 0
    const found = leadingPart.test(text)
    leadingPart.lastIndex = 0
    return found
  })
}
/**
 * Extract RFC numbers from every "<label>: <list>" occurrence matched by a pattern.
 *
 * Numbers written with an "RFC" prefix are reported twice: as written in
 * `rfcWithPrefix`, and as the bare number in `plainNumbers`. Numbers without prefix
 * only go to `plainNumbers`.
 *
 * The pattern is copied before use, so the caller's `lastIndex` is neither relied upon
 * nor modified, and a pattern without the `g` flag is handled (all its occurrences are
 * still visited) instead of looping forever.
 *
 * @param {string} text Text to extract RFC numbers from
 * @param {RegExp} regex Regular expression matching a whole list (label included)
 * @returns {{rfcWithPrefix: Array<string>, plainNumbers: Array<string>}} Extracted RFC numbers
 */
function extractRfcNumbers (text, regex) {
  const matches = {
    rfcWithPrefix: [],
    plainNumbers: []
  }
  const scanner = new RegExp(regex.source, regex.flags.includes('g') ? regex.flags : `${regex.flags}g`)
  for (const [rfcList] of text.matchAll(scanner)) {
    const numbers = rfcList.match(/\b(RFC\s*[0-9]+|[0-9]+)\b/gi) ?? []
    for (const num of numbers) {
      if (/^RFC\s*[0-9]+$/i.test(num)) {
        matches.rfcWithPrefix.push(num)
        matches.plainNumbers.push(num.replace(/^RFC\s*/i, ''))
      } else {
        matches.plainNumbers.push(num)
      }
    }
  }
  return matches
}
/**
 * Resolve a raw status text to the name of the first known status whose pattern matches it.
 *
 * The known statuses come from `rfcStatusHierarchy`; each entry is used here through its
 * `regex` (tested against the raw text) and its `name` (returned on a match). Entries
 * are tried in array order and the first match wins.
 *
 * @param {string} statusText - The raw status text to be processed, typically the value
 * found after "Intended status:" or "Category:" in the document header.
 * @returns {string|null} - The name of the first matching status, or `null` if no
 * status matches.
 */
function extractStatusName (statusText) {
  const matchedStatus = rfcStatusHierarchy.find(({ regex }) => regex.test(statusText))
  return matchedStatus ? matchedStatus.name : null
}
/**
 * Normalize text from the form of multi-line text with \n to the form of a single-line long string
 *
 * Steps, in order:
 * 1. line endings are unified to \n and form feeds are dropped;
 * 2. every page footer line (containing "[Page N]") that is followed, after optional
 *    blank lines, by a page header line (containing a month name and a 4-digit year) is
 *    removed together with that header;
 * 3. words / paths broken after "-" or "/" at the end of a line are re-joined;
 * 4. all whitespace runs collapse to a single space and the result is trimmed.
 *
 * Page furniture is removed BEFORE re-joining broken words: otherwise a line ending
 * with "-" at the bottom of a page gets glued to the footer line and is deleted with it.
 *
 * @param {string} rawText Raw document text
 * @returns {string} Single-line normalized text (empty string for non-string input)
 */
function normalizeText (rawText) {
  if (typeof rawText !== 'string') {
    return ''
  }
  const withoutPageFurniture = rawText
    .replace(/\r?\n/g, '\n')
    .replace(/\f/g, '')
    .replace(
      /^.*\[Page\s*\d+\].*\r?\n(?:[ \t]*\r?\n)*^.*\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b\s+\d{4}.*\r?\n/gm,
      ''
    )
  return withoutPageFurniture
    .replace(/-\s*\n\s*/g, '-')
    .replace(/\/\s*\n\s*/g, '/')
    .replace(/\s+/g, ' ')
    .trim()
}
/**
 * Group lines under the sub-section heading that precedes them.
 *
 * A line matching one of the matchers starts (or resumes) the group named after that
 * matcher and is not itself stored. Lines before the first heading are dropped. When
 * the same sub-section name is met again (e.g. several headings falling under the same
 * catch-all matcher), its lines are appended to the existing group instead of
 * replacing what was already collected.
 *
 * @param {Array<string>} refLines Lines to group
 * @param {Array<{name: string, regex: RegExp}>} [matchers] Heading matchers, tried in order (defaults to subsectionMatchers)
 * @returns {Object<string, Array<string>>} Lines keyed by sub-section name
 */
function groupBySubsection (refLines, matchers = subsectionMatchers) {
  const groups = {}
  let current = null
  for (const line of refLines) {
    const heading = matchers.find(({ regex }) => regex.test(line))
    if (heading) {
      current = heading.name
      groups[current] ??= []
      continue
    }
    if (current) {
      groups[current].push(line)
    }
  }
  return groups
}
/**
 * List the drafts linked from each reference sub-section.
 *
 * The lines are grouped per sub-section, each group is flattened with normalizeText()
 * and scanned for URLs matching DRAFT_URL_RE. Every distinct draft name is reported
 * once per sub-section, in order of first appearance.
 *
 * @param {Array<string>} refLines Lines of the references section
 * @returns {Array<{value: string, subsection: string}>} Draft names with the sub-section they were found in
 */
function extractDraftsBySubsection (refLines) {
  return Object.entries(groupBySubsection(refLines)).flatMap(([subsection, lines]) => {
    const flattened = normalizeText(lines.join('\n'))
    const uniqueDrafts = new Set()
    for (const urlMatch of flattened.matchAll(DRAFT_URL_RE)) {
      uniqueDrafts.add(urlMatch[1])
    }
    return [...uniqueDrafts].map((value) => ({ value, subsection }))
  })
}