/*
 * @ietf-tools/idnits
 * Version:
 * Library / CLI to inspect Internet-Draft documents for a variety of conditions to conform with IETF policies.
 * 991 lines (873 loc) • 39.6 kB
 * JavaScript
 */
import { ValidationError } from '../helpers/error.mjs'
import { DateTime } from 'luxon'
import { FQDN_RE } from '../modules/fqdn.mjs'
import { IPV4_LOOSE_RE, IPV6_LOOSE_RE } from '../modules/ip.mjs'
import { rfcStatusHierarchy } from '../config/rfc-status-hierarchy.mjs'
// Regex patterns
// NOTE: only patterns that are consumed through String#match / matchAll or an
// exec() loop carry the `g` flag. Patterns used for a single test()/exec() per
// line must stay non-global, otherwise lastIndex leaks from one line to the next.

// Splits a header line into its left and right columns (separated by 2+ spaces)
const LINE_VALUES_EXTRACT_RE = /^(?<left>.*)\s{2,}(?<right>.*)$/
// Author name in "A. Lastname" form
const AUTHOR_NAME_RE = /^[a-z]\.\s[a-z]+$/i
// "[day ]month year" header date
const DATE_RE = /^(?:(?<day>[0-9]{1,2})\s)?(?<month>[a-z]{3,})\s(?<year>[0-9]{4})$/i
// Numbered top-level section title, e.g. "1. Introduction"
const SECTION_PATTERN = /^\d+\.\s+.+$/
// Numbered second-level section title, e.g. "7.1. Normative References"
const SUBSECTION_PATTERN = /^\d+\.\d+\.\s+(.+)$/i
// Trailing dot leaders + page number, as found on table of contents entries
const TOC_PATTERN = /\.+\s*\d+$/
// "RFC 1234", "RFC1234" or "[RFC1234]" (used in an exec loop, hence `g`)
const RFC_REFERENCE_RE = /\bRFC\s?(\d+)\b|\[RFC(\d+)\]/gi
// Any bracketed reference tag that is not an RFC tag (used in an exec loop, hence `g`)
const NON_RFC_REFERENCE_RE = /\[(?!RFC\d+)[a-zA-Z0-9-.]+\]/gi
// Page footer marker at the end of a line
const PAGES_RE = /\[Page \d+\]$/gm
// Numbered variants of section titles that are expected to be unnumbered
const COPYRIGHT_NOTICE_RE = /^\d+\.\s*Copyright Notice$/i
const STATUS_OF_THIS_MEMO_RE = /^\d+\.\s*Status of This Memo$/i
const ABSTRACT_RE = /^\d+\.\s*Abstract$/i
const BRACKETED_RFC_REFERENCE_RE = /\[RFC(\d+)\]/
// First line starting with "BM" or "PK"
const FIRST_LINE_RE = /^(BM|PK)/
// Two or more spaces between a lowercase-ending word and a lowercase-starting word
const SPACING_PATTERN = /[A-Za-z][a-z]\s{2,}[a-z]/
const TABLE_OF_CONTENTS = /^Table of Contents\s*$/i
const TOC_CONTENT_LINE_RE = /^\s*[A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*\s*(?:.+?\s+)?(?:\.{2,}|\s*\.\s*)+\s*\d+\s*$/
const SECTION_TITLE_WITH_INDENTATION_RE = /^[\s\u00A0]+\d+(?:\.\d+)*\.\s+.+$/u
const REFERENCE_LINE_RE = /\[[^\]]+\]/
const INCORRECT_INDENTATION = /^(?: {0}| {1}| {2}| {4,})(?! )/ // Everything except for three spaces
const SECTION_HEADER_REGEX = /^\d+(?:\.\d+)*\.\s+.+$/
const QUOTE_CLOSE_RE = /[“”‘’"']\s*$/u
// "Expires <date>   [Page N]" footer (used with matchAll, hence `g`)
const EXPIRES_FOOTER_RE = /Expires\s+((\d{1,2})?\s?(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4})\s+\[Page\s+\d+\]/gi
// Author regexps
// Individual spellings of the heading that opens the authors / editors / contact section.
const AUTHORS_OR_EDITORS_ADDRESSES_RE = /^(Authors?|Editors?)[\u2018\u2019\u201B'`"] Addresses$/i
const AUTHOR_INFORMATION_RE = /^[0-9a-z.]*\s*author information$/i
const AUTHOR_CONTACT_INFORMATION_RE = /^[0-9a-z.]*\s*(author|editor)(?:[\u2018\u2019\u201B'`"]s?|s)?\s+contact information$/i
const CONTACT_INFORMATION_RE = /^[0-9a-z.]*\s*contact information$/i
const AUTHOR_EDITORS_RE = /^[0-9a-z.]*\s*(author|editor)s?:?$/i
const AUTHOR_ADDRESS_OPTIONAL_PLURAL_RE = /^(Author|Authors|Editor|Editors)['’`ʼ]s?\s+Address(?:es)?$/i
// Single alternation over every accepted author-section heading
const AUTHOR_SECTION_RE = new RegExp(
  '(' + [
    AUTHORS_OR_EDITORS_ADDRESSES_RE,
    AUTHOR_INFORMATION_RE,
    AUTHOR_CONTACT_INFORMATION_RE,
    CONTACT_INFORMATION_RE,
    AUTHOR_EDITORS_RE,
    AUTHOR_ADDRESS_OPTIONAL_PLURAL_RE
  ].map(({ source }) => source).join('|') + ')',
  'i'
)
// Unnumbered section titles regex
// Matches a line made up solely of one of the titles below (trailing whitespace allowed).
const UNNUMBERED_SECTION_TITLES_RE = new RegExp(
  '^(' + [
    'Abstract',
    'Table of Contents',
    'Copyright Notice',
    'Status of This Memo',
    'IANA Considerations',
    AUTHOR_SECTION_RE.source
  ].join('|') + ')\\s*$',
  'i'
)
// Inline code format
// Matches a C-style comment opener/closer anywhere, or a line whose first
// non-space character is "#". Deliberately NOT global: it is exec()'d once per
// line, and a `g` flag would make each call resume from the previous line's
// match position and miss markers near the start of the next line.
const INLINE_CODE_FORMAT = /\/\*|\*\/|^ *#/i
// Section matchers
// Maps a heading pattern to the section name used by parse() as the key into
// both `markers` and `data.content`. parse() takes the first entry whose regex
// matches the trimmed line.
const sectionMatchers = [
{ name: 'introduction', regex: /^\d+\.\s+(Introduction|Overview|Background)$/i },
{ name: 'securityConsiderations', regex: /^\d+\.\s+Security Considerations$/i },
{ name: 'authorAddress', regex: AUTHOR_SECTION_RE },
{ name: 'references', regex: /^\d+\.\s+(?:(?:Normative|Informative)\s+)?References$/i },
{ name: 'ianaConsiderations', regex: /^\d+\.\s+IANA Considerations$/i }
]
// Subsection matchers
// Classifies second-level reference headings; the matched name is recorded as
// the `subsection` of references collected afterwards. Order matters: the
// generic 'unclassified_references' pattern also matches the normative and
// informative headings, so it must stay last.
const subsectionMatchers = [
{ name: 'normative_references', regex: /^\d+\.\d+\.\s+Normative\s+References$/i },
{ name: 'informative_references', regex: /^\d+\.\d+\.\s+Informative\s+References$/i },
{ name: 'unclassified_references', regex: /^\d+\.\d+\.\s+([a-zA-Z]+\s+)*Reference(s)?$/i }
]
// Boilerplate regex patterns
// Full requirement-keyword boilerplate paragraphs, matched against the
// whitespace-normalized document text. These are only used for yes/no checks
// and for reading the first match, so they are intentionally non-global: a `g`
// flag would make RegExp#test depend on the lastIndex left by an earlier call.
const BOILERPLATE_PATTERNS = {
  rfc2119: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", ?("NOT RECOMMENDED", )?"MAY", and "OPTIONAL" in this document are to be interpreted as described in( BCP 14,)? RFC ?2119[.,;]/i,
  rfc2119_alt: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", ?("NOT RECOMMENDED", )?"MAY", and "OPTIONAL" in this document are to be interpreted as described in "Key words for use in RFCs to Indicate Requirement Levels" \[RFC2119\]/i,
  rfc8174: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in BCP 14 \[RFC2119\] \[RFC8174\]/i
}
// Similar boilerplate regex pattern
// The same paragraphs split into ordered fragments, used to recognise text that
// resembles the boilerplate without matching it exactly. Non-global for the
// same reason as above (the fragments are only ever test()'ed).
const BOILERPLATE_PARTS = {
  rfc2119: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in/,
    /RFC ?2119[.,;]?/i
  ],
  rfc2119_alt: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in/,
    /"Key words for use in RFCs to Indicate Requirement Levels" \[RFC2119\]/i
  ],
  rfc8174: [
    /The key words /,
    /"MUST", "MUST NOT", "REQUIRED", "SHALL"/,
    /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/,
    /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/,
    /in this document are to be interpreted as described in BCP 14/,
    /\[RFC2119\] \[RFC8174\]/i
  ]
}
// License declaration
// Patterns for the legal / status boilerplate paragraphs, matched against the
// whitespace-normalized document text.
// All of them carry the `g` flag because they are passed to String#matchAll
// and to String#replace in removeBoilerplates(). As a consequence RegExp#test
// on these objects is stateful (it resumes from lastIndex).
const licenseDeclarations = {
revised_bsd_license: /Code Components extracted from this document must include Revised BSD License text as described in Section 4\.e of the Trust Legal Provisions and are provided without warranty as described in the Revised BSD License\./gi,
// Copyright line attributed to the IETF Trust or to The Internet Society
previous_tlp4_6_b_i: /Copyright\s+\(c\)\s+\d{4}\s+IETF Trust|Copyright\s+\(C\)\s+\d{4}\s+The Internet Society/gi,
// Submission compliance statement (BCP 78 / BCP 79)
trust_28_dec_2009_section_6_a: /This\s+Internet-Draft\s+is\s+submitted\s+in\s+full\s+conformance\s+with\s+the\s+provisions\s+of\s+BCP\s+78\s+and\s+BCP\s+79/gi,
// Copyright line; capture group 1 is the four-digit year
tlp5_6_b_i_copyright: /Copyright\s+\(c\)\s+(\d{4})\s+IETF Trust\s+and\s+the\s+persons\s+identified\s+as\s+the\s+document\s+authors\.?\s+All\s+rights\s+reserved\.?/gi,
// License notice including the Revised BSD License sentence
license6_b_i: /This document is subject to BCP 78 and the IETF Trust[’']s Legal Provisions\s+Relating to IETF Documents\s+\(https?:\/\/trustee\.ietf\.org\/license-info\) in effect on the\s+date of publication of this document\. Please review these documents carefully, as\s+they describe your rights and restrictions with respect to this document\. Code\s+Components extracted from this document must include Revised BSD License\s+text as described in Section 4\.e of the Trust Legal Provisions and are provided\s+without warranty as described in the Revised BSD License\./gi,
// Same notice without the Revised BSD License sentence (a prefix of license6_b_i)
license6_b_ii: /This document is subject to BCP 78 and the IETF Trust[’']s Legal Provisions\s+Relating to IETF Documents\s+\(https?:\/\/trustee\.ietf\.org\/license-info\) in effect on the\s+date of publication of this document\. Please review these documents carefully, as\s+they describe your rights and restrictions with respect to this document\./gi,
// Restriction notices (no modification / no publication except as Internet-Draft)
license6_c_i: /This document may not be modified, and derivative works of it may not be\s+created, except to format it for publication as an RFC or to translate it into\s+languages other than English\./gi,
license6_c_ii: /This document may not be modified, and derivative works of it may not be\s+created, and it may not be published except as an Internet-Draft\./gi,
// Status paragraphs describing Internet-Drafts
acceptable_paragraph_noting_that_draft: /Internet-Drafts are working documents of the Internet Engineering Task Force \(IETF\)\./gi,
draft_paragraph_out6_month_validity: /Internet-Drafts are draft documents valid for a maximum of six months and may be updated, replaced, or obsoleted by other documents at any time\. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress."\s*/gi,
draft_paragraph_pointing_to_the_list_of_current_ids: /The list of current Internet-Drafts is at https:\/\/datatracker\.ietf\.org\/drafts\/current\/|The list of current Internet-Drafts can be accessed at http:\/\/www\.ietf\.org\/ietf\/1id-abstracts\.txt\./gi
}
// Keywords regex pattern
// Uppercase requirement keywords, optionally preceded or followed by "NOT".
// Global because it is consumed through String#matchAll.
const KEYWORDS_PATTERN = /((NOT)\s)?(MUST|REQUIRED|SHALL|SHOULD|RECOMMENDED|OPTIONAL|MAY)(\s(NOT))?/g
// Invalid combinations regex pattern
// Mixed-case negations ("MUST not") and the combinations MAY NOT / NOT REQUIRED / NOT OPTIONAL.
const INVALID_COMBINATIONS_PATTERN = /(MUST not|SHALL not|SHOULD not|not RECOMMENDED|MAY NOT|NOT REQUIRED|NOT OPTIONAL)/g
// Obsolete and updates regex patterns
// Match "Obsoletes:" / "Replaces:" / "Updates:" followed by a list of RFC numbers
// (optionally prefixed with "RFC"). Global because extractRfcNumbers() runs them
// in an exec() loop.
const OBSOLETES_RE = /(?:obsoletes|replaces)\s*:\s*((?:rfc\s*)?[0-9]+(?:,|\s|and)*\s*)+/gi
const UPDATES_RE = /updates\s*:\s*((?:rfc\s*)?[0-9]+(?:,|\s|and)*\s*)+/gi
// Consts values
// Maximum number of lines between two form feeds before a page is reported as too long
const MAX_PAGE_LENGTH = 58
// Number of lines after the title in which a "draft-..." slug line is still looked for
const TITLE_SECTION_LOOKAHEAD = 3
// While no title has been found, slug lines are only looked for before this line number
const HEADER_MAX_LINES = 18
/**
 * @typedef {Object} TXTDocObject
 * @property {string} body Raw text of the document, as given to the parser
 * @property {Object} data Parsed TXT tree
 * @property {string} docKind Whether the document is an Internet Draft (draft) or an RFC (rfc)
 * @property {string} filename Filename of the document
 * @property {string} type Document file type (txt)
 * @property {number} version Document version number (2 or 3)
 * @property {string} versionCertainty Whether the version was explicitly specified (strict) or guessed (guess)
 */
/**
 * Parse Text document
 *
 * The text is inspected in two passes:
 *  1. whole-document checks (boilerplate, license / copyright paragraphs,
 *     obsoletes / updates lists, page footers) on a whitespace-normalized copy;
 *  2. a line-by-line scan that fills in the header, title, slug, section
 *     contents, references, keywords, host / IP literals and layout issues.
 *
 * @param {string} rawText Input text
 * @param {string} filename Filename of the document
 * @returns {Promise<TXTDocObject>} Parsed document object
 * @throws {ValidationError} TXT_PARSING_FAILED when an error is raised while scanning the document
 */
export async function parse (rawText, filename) {
  // Many module-level patterns carry the `g` flag (required by matchAll / replace),
  // which makes RegExp#test resume from a leftover lastIndex. String#search always
  // scans from index 0 and restores lastIndex, so yes/no checks go through this helper.
  const hasMatch = (regex, text) => text.search(regex) !== -1

  // Re-join words and paths that were broken across lines at a "-" or "/"
  const rawFixed = rawText
    .replace(/\r?\n/g, '\n')
    .replace(/-\s*\n\s*/g, '-')
    .replace(/\/\s*\n\s*/g, '/')
  const normalizedText = rawFixed.replace(/\s+/g, ' ').trim()
  const pages = rawText.split(/\f/)

  const data = {
    pageCount: 1,
    header: {
      authors: [],
      date: null,
      source: null,
      expires: null
    },
    content: {
      abstract: null,
      introduction: null,
      securityConsiderations: null,
      authorAddress: null,
      references: null,
      ianaConsiderations: null
    },
    contains: {
      copyrightSection6_b_i: false,
      copyrightLicenseValid: false,
      license6_c_i: false,
      license6_c_ii: false,
      revisedBsdLicense: false,
      codeBlocks: false,
      draftParagraphOutSixMonthValidity: false,
      acceptableParagraphNotingThatDraft: false,
      idIndication: false,
      revisedBsdLicense6_i: false,
      submissionCompliance: false,
      pagesFound: 0,
      previous6_b_i_copyright: false
    },
    title: null,
    slug: null,
    possibleIssues: {
      hyphenatedLines: [],
      linesWithSpaces: [],
      unexpectedIndentation: [],
      inlineCode: [],
      misspeled2119Keywords: [],
      pageLineWithFormFeed: [],
      paragraphPointingToTheListOfCurrentId: [],
      copyrightLines6_i: [],
      isCopyrightNoticeNumbered: false,
      isStatusOfThisMemoNumbered: false,
      isAbstractNumbered: false,
      isPKorBM: false,
      updatesRfcWithLetter: [],
      obsoletesWithLetter: [],
      missingPageNumbering: [],
      submissionCompliancePage: null,
      isTableOfContentsExists: false,
      tooLongPages: []
    },
    extractedElements: {
      fqdnDomains: [],
      ipv4: [],
      ipv6: [],
      keywords2119: [],
      boilerplate2119Keywords: [],
      obsoletesRfc: [],
      updatesRfc: [],
      nonReferenceSectionRfc: [],
      referenceSectionRfc: [],
      nonReferenceSectionDraftReferences: [],
      referenceSectionDraftReferences: [],
      copyrightDates: [],
      license6_b_ii: [],
      license6_b_i: [],
      bracketedRfcNonReferences: [],
      bracketedRfcReferences: [],
      lastPageExpiration: null
    },
    boilerplate: {
      rfc2119: hasMatch(BOILERPLATE_PATTERNS.rfc2119, normalizedText) || hasMatch(BOILERPLATE_PATTERNS.rfc2119_alt, normalizedText),
      rfc8174: hasMatch(BOILERPLATE_PATTERNS.rfc8174, normalizedText),
      similar2119boilerplate: false
    },
    references: {
      rfc2119: false,
      rfc8174: false
    }
  }

  let docKind = null
  let lineIdx = 0
  let currentSection = null
  let currentSubSection = null
  let inCodeBlock = false
  let prevLine = null
  let isQuote = false
  let isContinuation = false
  let isPageSeparator = false
  let isTableOfContentSection = false
  let currentPageLineCount = 0

  try {
    const markers = {
      header: { start: 0, end: 0, lastAuthor: 0, closed: false },
      title: 0,
      slug: 0,
      abstract: { start: 0, end: 0, closed: false },
      introduction: { start: 0, end: 0, closed: false },
      securityConsiderations: { start: 0, end: 0, closed: false },
      authorAddress: { start: 0, end: 0, closed: false },
      references: { start: 0, end: 0, closed: false },
      ianaConsiderations: { start: 0, end: 0, closed: false }
    }

    // Words hyphenated across a line break (license boilerplate is ignored)
    const cleanedNormalized = removeBoilerplates(normalizedText)
    const candidateFragments = extractCandidateHyphenFragments(cleanedNormalized)
    const hyphenIssues = findHyphenIssuesInRaw(rawText, candidateFragments)
    data.possibleIssues.hyphenatedLines.push(...hyphenIssues)

    // Extracting mentioned rfc2119 keywords
    for (const pattern of Object.values(BOILERPLATE_PATTERNS)) {
      const boilerplateMatch = normalizedText.match(pattern)
      if (boilerplateMatch) {
        for (const keywordMatch of boilerplateMatch[0].matchAll(KEYWORDS_PATTERN)) {
          const keyword = keywordMatch[0]
          if (!data.extractedElements.boilerplate2119Keywords.includes(keyword)) {
            data.extractedElements.boilerplate2119Keywords.push(keyword)
          }
        }
      }
    }

    // Searching acceptable paragraph pointing to the list of current ids
    const matchParagraphPointingToTheListOfCurrentId = [...normalizedText.matchAll(licenseDeclarations.draft_paragraph_pointing_to_the_list_of_current_ids)]
    if (matchParagraphPointingToTheListOfCurrentId.length) {
      data.possibleIssues.paragraphPointingToTheListOfCurrentId = matchParagraphPointingToTheListOfCurrentId.map((item) => item[0])
    }

    data.boilerplate.similar2119boilerplate =
      hasBoilerplateMatch(normalizedText, BOILERPLATE_PARTS.rfc2119, BOILERPLATE_PARTS.rfc2119_alt, BOILERPLATE_PARTS.rfc8174) && !(data.boilerplate.rfc2119 || data.boilerplate.rfc8174)

    // Extracting obsolete and updated rfc from header
    const obsoletesRfc = extractRfcNumbers(normalizedText, OBSOLETES_RE)
    data.extractedElements.obsoletesRfc.push(...obsoletesRfc.plainNumbers)
    data.possibleIssues.obsoletesWithLetter.push(...obsoletesRfc.rfcWithPrefix)
    const updatesRfcs = extractRfcNumbers(normalizedText, UPDATES_RE)
    data.extractedElements.updatesRfc.push(...updatesRfcs.plainNumbers)
    data.possibleIssues.updatesRfcWithLetter.push(...updatesRfcs.rfcWithPrefix)

    // Searching for license declaration
    data.contains.revisedBsdLicense = hasMatch(licenseDeclarations.revised_bsd_license, normalizedText)

    // Page separator counting
    const pagesMatches = rawText.match(PAGES_RE)
    data.contains.pagesFound = pagesMatches?.length ?? 0

    // Searching for submission compliance line and pages
    for (let i = 0; i < pages.length; i++) {
      const normalizedPage = pages[i].replace(/\s+/g, ' ').trim()
      if (hasMatch(licenseDeclarations.trust_28_dec_2009_section_6_a, normalizedPage)) {
        data.contains.submissionCompliance = true
        data.possibleIssues.submissionCompliancePage = i + 1
        break
      }
    }

    // Searching acceptable paragraph noting that IDs are working documents
    data.contains.acceptableParagraphNotingThatDraft = hasMatch(licenseDeclarations.acceptable_paragraph_noting_that_draft, normalizedText)

    // Searching acceptable paragraph calling out 6 month validity.
    // Evaluated once on the whole text: running a stateful `g` regex test once per
    // line made the result depend on how many lines the document has.
    data.contains.draftParagraphOutSixMonthValidity = hasMatch(licenseDeclarations.draft_paragraph_out6_month_validity, normalizedText)

    // Extracting expiration date from the last page
    const lastPageExpiration = [...normalizedText.matchAll(EXPIRES_FOOTER_RE)].pop()
    if (lastPageExpiration) {
      const expiresStr = lastPageExpiration[1]
      // DateTime.fromFormat returns an (always truthy) invalid DateTime on failure,
      // so the fallback has to be chosen through isValid rather than with `||`.
      const parsed = ['d LLLL yyyy', 'LLLL d, yyyy']
        .map(format => DateTime.fromFormat(expiresStr, format))
        .find(candidate => candidate.isValid)
      if (parsed) {
        data.extractedElements.lastPageExpiration = parsed
      }
    }

    // Searching for copyright line in normalized text
    const copyrightMatches = [...normalizedText.matchAll(licenseDeclarations.tlp5_6_b_i_copyright)]
    if (copyrightMatches.length > 0) {
      const copyrightText = copyrightMatches.map(m => m[0])
      data.possibleIssues.copyrightLines6_i = copyrightText
      if (copyrightText.length) data.contains.copyrightSection6_b_i = true
      const copyrightYears = copyrightMatches.map(m => Number(m[1]) || Number(m[2]))
      if (copyrightYears.length) data.extractedElements.copyrightDates = copyrightYears
    }

    // Search for 6.b license declaration
    const match6bi = [...normalizedText.matchAll(licenseDeclarations.license6_b_i)]
    // Search for 6.b.ii license declaration
    const match6bii = [...normalizedText.matchAll(licenseDeclarations.license6_b_ii)]
    if (match6bii.length > 0) {
      data.extractedElements.license6_b_ii = match6bii.map(m => m[0])
    }
    if (match6bi.length > 0) {
      data.contains.revisedBsdLicense6_i = true
      data.extractedElements.license6_b_i = match6bi.map(m => m[0])
    }

    // Search for 6.c licenses declaration
    data.contains.license6_c_i = hasMatch(licenseDeclarations.license6_c_i, normalizedText)
    data.contains.license6_c_ii = hasMatch(licenseDeclarations.license6_c_ii, normalizedText)

    // Searching for copyright line
    data.contains.previous6_b_i_copyright = hasMatch(licenseDeclarations.previous_tlp4_6_b_i, normalizedText)

    for (const line of rawText.split('\n')) {
      const trimmedLine = line.trim()
      // Every whitespace character (tabs, CR, NBSP, ...) becomes a plain space
      const normalizedLine = line.replace(/\s/g, ' ')
      lineIdx++

      // Pages not numbered
      if (prevLine && !prevLine.includes('[Page') && line.includes('\f')) {
        data.possibleIssues.missingPageNumbering.push({ page: data.pageCount, lines: lineIdx })
      }
      prevLine = line

      // Page Break
      // --------------------------------------------------------------
      if (line.includes('\f')) {
        data.pageCount++
        if (line.includes('[Page')) {
          data.possibleIssues.pageLineWithFormFeed.push({ page: data.pageCount - 1, lines: lineIdx })
        }
        if (currentPageLineCount > MAX_PAGE_LENGTH) {
          data.possibleIssues.tooLongPages.push({ page: data.pageCount - 1, lines: currentPageLineCount })
        }
        currentPageLineCount = 0
        continue
      } else {
        currentPageLineCount++
      }

      // Empty line
      // --------------------------------------------------------------
      if (!trimmedLine) {
        isContinuation = false
        isPageSeparator = false
        continue
      }

      // Check line spaces.
      // Page header lines ("Internet-Draft   <title>   <date>") are skipped; the "."
      // is a wildcard for the separator, which a literal startsWith() could never match.
      if (SPACING_PATTERN.test(trimmedLine) && !/^(?:Internet.Draft|INTERNET.DRAFT)/.test(trimmedLine)) {
        data.possibleIssues.linesWithSpaces.push({ line: lineIdx, pos: line.length })
      }

      // Code block detection
      if (/<CODE BEGINS>/i.test(trimmedLine)) {
        data.contains.codeBlocks = true
        inCodeBlock = true
      }
      if (/<CODE ENDS>/i.test(trimmedLine)) {
        inCodeBlock = false
      }

      // Check for inline code format outside code blocks
      if (!inCodeBlock) {
        // Always scan the line from its start, even if the pattern is global
        INLINE_CODE_FORMAT.lastIndex = 0
        const inlineCodeMatch = INLINE_CODE_FORMAT.exec(line)
        if (inlineCodeMatch) {
          data.possibleIssues.inlineCode.push({
            line: lineIdx,
            pos: inlineCodeMatch.index + 1
          })
        }
      }

      // Search for bad indentations. Starts search after detecting abstract section to avoid false positives
      if (TABLE_OF_CONTENTS.test(normalizedLine)) { // Detection TOC section to avoid false positives
        isTableOfContentSection = true
      }
      if (isTableOfContentSection && !TOC_CONTENT_LINE_RE.test(trimmedLine) && SECTION_PATTERN.test(trimmedLine)) { // Exiting TOC section when new section is detected
        isTableOfContentSection = false
      }
      if (SECTION_HEADER_REGEX.test(normalizedLine)) {
        isContinuation = true
      }
      if (normalizedLine.includes('[Page')) {
        isPageSeparator = true
      }
      if (markers.abstract.start && !isTableOfContentSection) {
        if (SECTION_TITLE_WITH_INDENTATION_RE.test(normalizedLine)) {
          data.possibleIssues.unexpectedIndentation.push({ line: lineIdx, pos: 0 })
        } else if (INCORRECT_INDENTATION.test(normalizedLine) && !(isContinuation || isQuote || isPageSeparator) && !UNNUMBERED_SECTION_TITLES_RE.test(normalizedLine)) {
          data.possibleIssues.unexpectedIndentation.push({ line: lineIdx, pos: 0 })
        }
      }
      if (REFERENCE_LINE_RE.test(normalizedLine) && currentSection === 'references') {
        isContinuation = true
      }
      if (trimmedLine.endsWith(':')) {
        isQuote = true
      } else if (QUOTE_CLOSE_RE.test(trimmedLine)) {
        isQuote = false
      }

      // Extract rfc references from whole text except of reference section
      for (const rfcMatch of trimmedLine.matchAll(RFC_REFERENCE_RE)) {
        const rfcNumber = rfcMatch[1] || rfcMatch[2]
        if (currentSection !== 'references') {
          if (rfcNumber && !data.extractedElements.nonReferenceSectionRfc.includes(rfcNumber)) {
            data.extractedElements.nonReferenceSectionRfc.push(rfcNumber)
          }
        } else if (BRACKETED_RFC_REFERENCE_RE.exec(trimmedLine)) {
          if (rfcNumber && !data.extractedElements.referenceSectionRfc.find((el) => el.value === rfcNumber)) {
            data.extractedElements.referenceSectionRfc.push({
              value: rfcNumber,
              subsection: currentSubSection
            })
          }
        }
        // Detect bracketed RFC references
        if (rfcMatch[0] && BRACKETED_RFC_REFERENCE_RE.test(rfcMatch[0])) {
          if (currentSection === 'references') {
            data.extractedElements.bracketedRfcReferences.push(rfcMatch[0])
          } else {
            data.extractedElements.bracketedRfcNonReferences.push(rfcMatch[0])
          }
        }
      }

      // Detect draft references
      for (const draftMatch of trimmedLine.matchAll(NON_RFC_REFERENCE_RE)) {
        const draftName = draftMatch[0]
        if (currentSection !== 'references') {
          if (!data.extractedElements.nonReferenceSectionDraftReferences.includes(draftName)) {
            data.extractedElements.nonReferenceSectionDraftReferences.push(draftName)
          }
        } else if (!data.extractedElements.referenceSectionDraftReferences.find((el) => el.value === draftName)) {
          data.extractedElements.referenceSectionDraftReferences.push({
            value: draftName,
            subsection: currentSubSection
          })
        }
      }

      if (STATUS_OF_THIS_MEMO_RE.test(trimmedLine)) {
        data.possibleIssues.isStatusOfThisMemoNumbered = true
      }
      if (FIRST_LINE_RE.test(trimmedLine) && lineIdx === 1) {
        data.possibleIssues.isPKorBM = true
      }
      if (TABLE_OF_CONTENTS.test(trimmedLine)) {
        data.possibleIssues.isTableOfContentsExists = true
      }

      // Check for references
      if (/\[RFC2119\]/i.test(trimmedLine)) {
        data.references.rfc2119 = true
      }
      if (/\[RFC8174\]/i.test(trimmedLine)) {
        data.references.rfc8174 = true
      }

      // Check for keywords
      for (const keywordMatch of trimmedLine.matchAll(KEYWORDS_PATTERN)) {
        data.extractedElements.keywords2119.push({ keyword: keywordMatch[0], line: lineIdx })
      }

      // Check for invalid keyword combinations
      for (const invalidMatch of line.matchAll(INVALID_COMBINATIONS_PATTERN)) {
        data.possibleIssues.misspeled2119Keywords.push({ invalidKeyword: invalidMatch[0], line: lineIdx, pos: invalidMatch.index + 1 })
      }

      // FQDN Domain extraction
      for (const domainMatch of trimmedLine.matchAll(FQDN_RE)) {
        data.extractedElements.fqdnDomains.push(domainMatch.groups.domain)
      }

      // IPv4 and IPv6 extraction
      for (const ipv4Match of trimmedLine.matchAll(IPV4_LOOSE_RE)) {
        data.extractedElements.ipv4.push(ipv4Match[0])
      }
      for (const ipv6Match of trimmedLine.matchAll(IPV6_LOOSE_RE)) {
        data.extractedElements.ipv6.push(ipv6Match[0])
      }

      // Header
      // --------------------------------------------------------------
      if (!markers.header.start) {
        // -> First Line
        markers.header.start = lineIdx
        markers.header.end = lineIdx
        const values = LINE_VALUES_EXTRACT_RE.exec(trimmedLine)
        // --> Source
        data.header.source = values?.groups.left
        // --> Author
        data.header.authors.push({
          name: values?.groups.right
        })
        markers.header.lastAuthor = lineIdx
        continue
      } else if (!markers.header.closed) {
        if (lineIdx > markers.header.end + 1) {
          // First non-empty line after a gap: the header is over, this is the title
          markers.header.closed = true
          markers.title = lineIdx
          data.title = trimmedLine
        } else {
          markers.header.end = lineIdx
          const extractedValues = LINE_VALUES_EXTRACT_RE.exec(line)
          const values = extractedValues ? extractedValues.groups : { left: trimmedLine, right: null }
          if (values.left) {
            // --> Date
            const leftDate = values.left.match(DATE_RE)
            if (leftDate) {
              const day = parseInt(leftDate[1], 10)
              const month = leftDate[2]
              const year = parseInt(leftDate[3], 10)
              data.header.date = { day, month, year }
            }

            // --> Document Kind
            if (values.left.includes('Internet-Draft')) {
              docKind = 'draft'
              data.contains.idIndication = true
            } else if (values.left.startsWith('Request for Comments')) {
              data.header.rfcNumber = values.left.split(':')?.[1]?.trim()
              docKind = 'rfc'
            } else if (filename.startsWith('rfc')) {
              const filenameMatch = filename.match(/rfc(\d+)\.txt$/i)
              if (filenameMatch) {
                data.header.rfcNumber = filenameMatch[1]
                docKind = 'rfc'
              }
            } else {
              // Only a default: a kind established by an earlier header line
              // (e.g. "Request for Comments") must not be reverted by later lines.
              docKind ??= 'draft'
            }

            // --> Intended status
            if (values.left.startsWith('Intended')) {
              const rawIntendedStatus = values.left.split(':')?.[1]?.trim()
              const cleanIntendedStatus = extractStatusName(rawIntendedStatus)
              data.header.intendedStatus = cleanIntendedStatus || rawIntendedStatus
            }

            // --> Obsoletes
            if (values.left.startsWith('Obsoletes')) {
              const obsoletesValues = values.left.split(':')?.[1]?.trim()
              // A line without ":" yields no value; skip it instead of failing the whole parse
              if (obsoletesValues !== undefined) {
                data.header.obsoletes = obsoletesValues.split(',').map(o => o.trim())
              }
            }

            // --> Category
            if (values.left.startsWith('Category')) {
              const rawCategory = values.left.split(':')?.[1]?.trim()
              const cleanCategory = extractStatusName(rawCategory)
              data.header.category = cleanCategory || rawCategory
            }

            // --> ISSN
            if (values.left.startsWith('ISSN')) {
              data.header.issn = values.left.split(':')?.[1]?.trim()
            }

            // --> Expires
            if (values.left.startsWith('Expires')) {
              const datePart = values.left.split(':')[1]?.trim().split(/\s{2,}/)[0]
              // "Month D, YYYY" | "D Month YYYY" | ISO "YYYY-MM-DD"
              const expiresDateRe = /(?:(?<month>[A-Za-z]{3,9})[\s]+(?<day>\d{1,2}),?\s*(?<year>\d{4}))|(?:(?<dayAlt>\d{1,2})[\s\-/]*(?<monthAlt>[A-Za-z]{3,9})[\s\-/,]*(?<yearAlt>\d{4}))|(?<iso>\d{4}-\d{2}-\d{2})/
              const dateValue = expiresDateRe.exec(datePart)
              if (dateValue) {
                if (dateValue.groups.iso) {
                  data.header.expires = DateTime.fromISO(dateValue.groups.iso)
                } else {
                  const day = dateValue.groups.day || dateValue.groups.dayAlt || 1
                  const month = dateValue.groups.month || dateValue.groups.monthAlt
                  const year = dateValue.groups.year || dateValue.groups.yearAlt
                  data.header.expires = DateTime.fromFormat(
                    `${day} ${month} ${year}`,
                    'd LLLL yyyy'
                  )
                }
              }
            }
          }
          if (values.right) {
            // --> Date
            const dateValue = DATE_RE.exec(values.right)
            if (dateValue) {
              const day = parseInt(dateValue[1], 10)
              const month = dateValue[2]
              const year = parseInt(dateValue[3], 10)
              data.header.date = { day, month, year }
            }
            if (!data.header.date) {
              // --> Author
              const authorNameValue = AUTHOR_NAME_RE.exec(values.right)
              if (authorNameValue) {
                // --> Blank line = Previous author(s) have no affiliation
                if (lineIdx > markers.header.lastAuthor + 1) {
                  // findLast is used for its right-to-left walk: trailing authors
                  // without an org get an empty one, stopping at the first that has it.
                  data.header.authors.findLast(el => {
                    if (el.org || el.org === '') {
                      return true
                    } else {
                      el.org = ''
                      return false
                    }
                  })
                }
                // --> Author Name
                data.header.authors.push({
                  name: authorNameValue[0]
                })
              } else {
                // --> Author Org (assigned to every trailing author still lacking one)
                data.header.authors.findLast(el => {
                  if (el.org || el.org === '') {
                    return true
                  } else {
                    el.org = values.right
                    return false
                  }
                })
              }
              markers.header.lastAuthor = lineIdx
            }
          }
        }
      }

      // Slug ("draft-...") right after the title, or near the top while no title is known
      if ((data.title && lineIdx <= markers.title + TITLE_SECTION_LOOKAHEAD) || (!data.title && lineIdx < HEADER_MAX_LINES)) {
        if (trimmedLine.startsWith('draft-')) {
          markers.slug = lineIdx
          data.slug = trimmedLine
          continue
        }
      }

      if (COPYRIGHT_NOTICE_RE.test(trimmedLine)) {
        data.possibleIssues.isCopyrightNoticeNumbered = true
      }

      // Abstract
      // --------------------------------------------------------------
      if (trimmedLine === 'Abstract' || ABSTRACT_RE.test(trimmedLine)) {
        markers.abstract.start = lineIdx
        currentSection = 'abstract'
        data.content.abstract = []
      } else if (markers.abstract.start && !markers.abstract.closed) {
        if (trimmedLine.startsWith('Status of') || !line.startsWith(' ')) {
          markers.abstract.end = lineIdx - 1
          markers.abstract.closed = true
        }
      }

      // Lines that still belong to the header take no part in section detection
      if (!markers.header.closed) {
        continue
      }

      // Section detection and content assignment
      if ((SECTION_PATTERN.test(trimmedLine) || AUTHOR_SECTION_RE.test(trimmedLine)) && !ABSTRACT_RE.test(trimmedLine)) {
        const matchedSection = sectionMatchers.find(({ regex }) => regex.test(trimmedLine))
        if (currentSection && !markers[currentSection].closed) {
          markers[currentSection].end = lineIdx - 1
          markers[currentSection].closed = true
        }
        if (matchedSection) {
          currentSection = matchedSection.name
          markers[currentSection].start = lineIdx
          data.content[currentSection] = []
        } else {
          currentSection = null
        }
      }

      // Sub section detection
      // lastIndex is reset so a global pattern cannot skip a heading that directly follows another
      SUBSECTION_PATTERN.lastIndex = 0
      if (SUBSECTION_PATTERN.test(trimmedLine) && !TOC_PATTERN.test(trimmedLine)) {
        const matchedSubsection = subsectionMatchers.find(({ regex }) => regex.test(trimmedLine))
        currentSubSection = matchedSubsection ? matchedSubsection.name : null
      }

      // Add content to the current section
      if (currentSection && markers[currentSection].start && !markers[currentSection].closed) {
        data.content[currentSection].push(trimmedLine)
      }
    }

    // Close the last section
    if (currentSection && !markers[currentSection].closed) {
      markers[currentSection].end = lineIdx
      markers[currentSection].closed = true
    }

    if (data.content?.abstract?.length) {
      const firstLine = data.content.abstract[0].trim()
      if (ABSTRACT_RE.test(firstLine)) {
        data.possibleIssues.isAbstractNumbered = true
      }
    }

    data.markers = markers
  } catch (err) {
    throw new ValidationError('TXT_PARSING_FAILED', `Error while parsing Line ${lineIdx}: ${err.message}`)
  }

  return {
    docKind,
    body: rawText,
    data,
    filename,
    type: 'txt'
  }
}
/**
 * Function to check if at least one match is found among the specified groups of patterns in the text
 *
 * Within a group the parts are tried in order and counting stops at the first
 * part that does not match, so a group counts as matched as soon as its
 * leading part is found in the text.
 *
 * @param {string} text Normalized text
 * @param {...Array<RegExp>} regexGroups Arrays of patterns to check
 * @returns {boolean} Whether at least one match is found
 */
function hasBoilerplateMatch (text, ...regexGroups) {
  for (const group of regexGroups) {
    let matchCount = 0
    for (const part of group) {
      // The parts are shared module-level objects and may be global: always
      // scan from the start and leave no lastIndex behind, otherwise the
      // result would alternate between successive calls.
      part.lastIndex = 0
      const matched = part.test(text)
      part.lastIndex = 0
      if (!matched) {
        break
      }
      matchCount++
    }
    if (matchCount > 0) {
      return true
    }
  }
  return false
}
/**
 * Extract RFC numbers from the text
 *
 * Every match of `regex` is treated as a list ("Obsoletes: 1234, RFC 5678")
 * from which the individual numbers are collected.
 *
 * @param {string} text Text to extract RFC numbers from
 * @param {RegExp} regex Regular expression matching a whole list of RFC numbers
 * @returns {{rfcWithPrefix: Array<string>, plainNumbers: Array<string>}}
 *   `plainNumbers` holds every number found (without any "RFC" prefix);
 *   `rfcWithPrefix` holds the entries that were written with an "RFC" prefix, as written.
 */
function extractRfcNumbers (text, regex) {
  const matches = {
    rfcWithPrefix: [],
    plainNumbers: []
  }
  // Scan with a private global copy: the caller's lastIndex is neither used nor
  // modified, and a regex passed without the `g` flag cannot loop forever.
  const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`
  const scanner = new RegExp(regex.source, flags)
  for (const [rfcList] of String(text).matchAll(scanner)) {
    const numbers = rfcList.match(/\b(RFC\s*[0-9]+|[0-9]+)\b/gi) ?? []
    for (const num of numbers) {
      if (/^RFC\s*[0-9]+$/i.test(num)) {
        matches.rfcWithPrefix.push(num)
        matches.plainNumbers.push(num.replace(/^RFC\s*/i, ''))
      } else {
        matches.plainNumbers.push(num)
      }
    }
  }
  return matches
}
/**
 * Resolves a raw status string to the canonical status name.
 *
 * The entries of `rfcStatusHierarchy` are tried in order; the `name` of the
 * first entry whose `regex` matches the given text is returned.
 *
 * @param {string} statusText - The raw status text to be processed (e.g., "Standards Track Juniper Networks").
 * @returns {string|null} - The name of the first matching status entry, or `null` when no entry matches.
 *
 * Example:
 *   extractStatusName('Standards Track Juniper Networks') // => name of the entry whose regex matches
 */
function extractStatusName (statusText) {
  const matchedStatus = [...rfcStatusHierarchy].find(({ regex }) => regex.test(statusText))
  if (matchedStatus === undefined) {
    return null
  }
  return matchedStatus.name
}
/**
 * Strips every paragraph matched by a pattern of `licenseDeclarations` from the text.
 *
 * The patterns are applied one after another, in declaration order, each one
 * working on the output of the previous replacement.
 *
 * @param {string} text - The input text.
 * @returns {string} - The text with boilerplate sections removed.
 */
function removeBoilerplates (text) {
  return Object.values(licenseDeclarations).reduce(
    (remaining, declarationRe) => remaining.replace(declarationRe, ''),
    text
  )
}
/**
 * Makes a string safe to embed in a regular expression by prefixing every
 * regex metacharacter with a backslash.
 *
 * @param {string} str - The input string.
 * @returns {string} - The escaped string.
 */
function escapeRegExp (str) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g
  return str.replace(metaCharacters, (metaChar) => `\\${metaChar}`)
}
/**
 * Collects "word + hyphenated fragment" pairs from the cleaned text.
 *
 * A pair is a word, some whitespace, and a following run of characters that
 * ends with a hyphen which is immediately followed by a letter (Latin or
 * Cyrillic). These pairs are the candidates for words that were hyphenated
 * across a line break in the raw document.
 *
 * @param {string} cleanedText - The text after boilerplate sections have been removed.
 * @returns {Array<Object>} - An array of objects with properties:
 *   - prevWord: the word preceding the hyphenated fragment,
 *   - fragment: the hyphenated fragment (word ending with a hyphen).
 */
function extractCandidateHyphenFragments (cleanedText) {
  const hyphenatedPairRe = /(\b[^\s-]+)\s+([^\s-]+-)(?=[A-Za-zА-Яа-яІіЇїЄє])/g
  return Array.from(
    cleanedText.matchAll(hyphenatedPairRe),
    ([, prevWord, fragment]) => ({ prevWord, fragment })
  )
}
/**
 * Locates the raw lines on which a candidate fragment was hyphenated.
 *
 * For every candidate, each raw line is checked: a line is reported when it
 * ends with a hyphen (ignoring surrounding whitespace) and contains the
 * candidate's previous word, whitespace and the hyphenated fragment, with no
 * letter directly after the hyphen. Results are grouped by candidate, in the
 * order the candidates were given.
 *
 * @param {string} rawText - The original raw text.
 * @param {Array<Object>} candidateFragments - Array of candidate objects with properties prevWord and fragment.
 * @returns {Array<Object>} - An array of issue objects, each containing:
 *   - line: the line number in which the issue was found,
 *   - pos: the position (length) of the line.
 */
function findHyphenIssuesInRaw (rawText, candidateFragments) {
  const rawLines = rawText.split('\n')
  const found = []
  for (const { prevWord, fragment } of candidateFragments) {
    const brokenWordRe = new RegExp(
      `\\b${escapeRegExp(prevWord)}\\s+${escapeRegExp(fragment)}(?![A-Za-zА-Яа-яІіЇїЄє])`
    )
    for (const [lineNo, rawLine] of rawLines.entries()) {
      if (!rawLine.trim().endsWith('-')) {
        continue
      }
      if (brokenWordRe.test(rawLine)) {
        found.push({ line: lineNo + 1, pos: rawLine.length })
      }
    }
  }
  return found
}