UNPKG

@ietf-tools/idnits

Version:

Library / CLI to inspect Internet-Draft documents for a variety of conditions to conform with IETF policies.

991 lines (873 loc) 39.6 kB
import { ValidationError } from '../helpers/error.mjs' import { DateTime } from 'luxon' import { FQDN_RE } from '../modules/fqdn.mjs' import { IPV4_LOOSE_RE, IPV6_LOOSE_RE } from '../modules/ip.mjs' import { rfcStatusHierarchy } from '../config/rfc-status-hierarchy.mjs' // Regex patterns const LINE_VALUES_EXTRACT_RE = /^(?<left>.*)\s{2,}(?<right>.*)$/ const AUTHOR_NAME_RE = /^[a-z]\.\s[a-z]+$/i const DATE_RE = /^(?:(?<day>[0-9]{1,2})\s)?(?<month>[a-z]{3,})\s(?<year>[0-9]{4})$/i const SECTION_PATTERN = /^\d+\.\s+.+$/ const SUBSECTION_PATTERN = /^\d+\.\d+\.\s+(.+)$/ig const TOC_PATTERN = /\.+\s*\d+$/ const RFC_REFERENCE_RE = /\bRFC\s?(\d+)\b|\[RFC(\d+)\]/gi const NON_RFC_REFERENCE_RE = /\[(?!RFC\d+)[a-zA-Z0-9-.]+\]/gi const PAGES_RE = /\[Page \d+\]$/gm const COPYRIGHT_NOTICE_RE = /^\d+\.\s*Copyright Notice$/i const STATUS_OF_THIS_MEMO_RE = /^\d+\.\s*Status of This Memo$/i const ABSTRACT_RE = /^\d+\.\s*Abstract$/i const BRACKETED_RFC_REFERENCE_RE = /\[RFC(\d+)\]/ const FIRST_LINE_RE = /^(BM|PK)/ const SPACING_PATTERN = /[A-Za-z][a-z]\s{2,}[a-z]/ const TABLE_OF_CONTENTS = /^Table of Contents\s*$/i const TOC_CONTENT_LINE_RE = /^\s*[A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*\s*(?:.+?\s+)?(?:\.{2,}|\s*\.\s*)+\s*\d+\s*$/ const SECTION_TITLE_WITH_INDENTATION_RE = /^[\s\u00A0]+\d+(?:\.\d+)*\.\s+.+$/u const REFERENCE_LINE_RE = /\[[^\]]+\]/ const INCORRECT_INDENTATION = /^(?: {0}| {1}| {2}| {4,})(?! 
)/ // Everything except for three spaces const SECTION_HEADER_REGEX = /^\d+(?:\.\d+)*\.\s+.+$/ const QUOTE_CLOSE_RE = /[“”‘’"']\s*$/u const EXPIRES_FOOTER_RE = /Expires\s+((\d{1,2})?\s?(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4})\s+\[Page\s+\d+\]/gi // Author regexps const AUTHORS_OR_EDITORS_ADDRESSES_RE = /^(Authors?|Editors?)[\u2018\u2019\u201B'`"] Addresses$/i const AUTHOR_INFORMATION_RE = /^[0-9a-z.]*\s*author information$/i const AUTHOR_CONTACT_INFORMATION_RE = /^[0-9a-z.]*\s*(author|editor)(?:[\u2018\u2019\u201B'`"]s?|s)?\s+contact information$/i const CONTACT_INFORMATION_RE = /^[0-9a-z.]*\s*contact information$/i const AUTHOR_EDITORS_RE = /^[0-9a-z.]*\s*(author|editor)s?:?$/i const AUTHOR_ADDRESS_OPTIONAL_PLURAL_RE = /^(Author|Authors|Editor|Editors)['’`ʼ]s?\s+Address(?:es)?$/i const AUTHOR_SECTION_RE = new RegExp( `(${AUTHORS_OR_EDITORS_ADDRESSES_RE.source}|` + `${AUTHOR_INFORMATION_RE.source}|` + `${AUTHOR_CONTACT_INFORMATION_RE.source}|` + `${CONTACT_INFORMATION_RE.source}|` + `${AUTHOR_EDITORS_RE.source}|` + `${AUTHOR_ADDRESS_OPTIONAL_PLURAL_RE.source})`, 'i' ) // Unnumbered section titles regex const UNNUMBERED_SECTION_TITLES_RE = new RegExp( `^(Abstract|Table of Contents|Copyright Notice|Status of This Memo|IANA Considerations|${AUTHOR_SECTION_RE.source})\\s*$`, 'i' ) // Inline code format const INLINE_CODE_FORMAT = /\/\*|\*\/|^ *#/ig // Section matchers const sectionMatchers = [ { name: 'introduction', regex: /^\d+\.\s+(Introduction|Overview|Background)$/i }, { name: 'securityConsiderations', regex: /^\d+\.\s+Security Considerations$/i }, { name: 'authorAddress', regex: AUTHOR_SECTION_RE }, { name: 'references', regex: /^\d+\.\s+(?:(?:Normative|Informative)\s+)?References$/i }, { name: 'ianaConsiderations', regex: /^\d+\.\s+IANA Considerations$/i } ] const subsectionMatchers = [ { name: 'normative_references', regex: /^\d+\.\d+\.\s+Normative\s+References$/i }, { name: 'informative_references', regex: 
/^\d+\.\d+\.\s+Informative\s+References$/i }, { name: 'unclassified_references', regex: /^\d+\.\d+\.\s+([a-zA-Z]+\s+)*Reference(s)?$/i } ] // Boilerplate regex patterns const BOILERPLATE_PATTERNS = { rfc2119: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", ?("NOT RECOMMENDED", )?"MAY", and "OPTIONAL" in this document are to be interpreted as described in( BCP 14,)? RFC ?2119[.,;]/ig, rfc2119_alt: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", ?("NOT RECOMMENDED", )?"MAY", and "OPTIONAL" in this document are to be interpreted as described in "Key words for use in RFCs to Indicate Requirement Levels" \[RFC2119\]/ig, rfc8174: /The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in BCP 14 \[RFC2119\] \[RFC8174\]/ig } // Similar boilerplate regex pattern const BOILERPLATE_PARTS = { rfc2119: [ /The key words /g, /"MUST", "MUST NOT", "REQUIRED", "SHALL"/g, /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/g, /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/g, /in this document are to be interpreted as described in/g, /RFC ?2119[.,;]?/i ], rfc2119_alt: [ /The key words /g, /"MUST", "MUST NOT", "REQUIRED", "SHALL"/g, /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/g, /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/g, /in this document are to be interpreted as described in/g, /"Key words for use in RFCs to Indicate Requirement Levels" \[RFC2119\]/gi ], rfc8174: [ /The key words /g, /"MUST", "MUST NOT", "REQUIRED", "SHALL"/g, /"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED"/g, /"NOT RECOMMENDED", "MAY", and "OPTIONAL"/g, /in this document are to be interpreted as described in BCP 14/g, /\[RFC2119\] \[RFC8174\]/ig ] } // License declaration const licenseDeclarations = { revised_bsd_license: /Code Components extracted 
from this document must include Revised BSD License text as described in Section 4\.e of the Trust Legal Provisions and are provided without warranty as described in the Revised BSD License\./gi, previous_tlp4_6_b_i: /Copyright\s+\(c\)\s+\d{4}\s+IETF Trust|Copyright\s+\(C\)\s+\d{4}\s+The Internet Society/gi, trust_28_dec_2009_section_6_a: /This\s+Internet-Draft\s+is\s+submitted\s+in\s+full\s+conformance\s+with\s+the\s+provisions\s+of\s+BCP\s+78\s+and\s+BCP\s+79/gi, tlp5_6_b_i_copyright: /Copyright\s+\(c\)\s+(\d{4})\s+IETF Trust\s+and\s+the\s+persons\s+identified\s+as\s+the\s+document\s+authors\.?\s+All\s+rights\s+reserved\.?/gi, license6_b_i: /This document is subject to BCP 78 and the IETF Trust[’']s Legal Provisions\s+Relating to IETF Documents\s+\(https?:\/\/trustee\.ietf\.org\/license-info\) in effect on the\s+date of publication of this document\. Please review these documents carefully, as\s+they describe your rights and restrictions with respect to this document\. Code\s+Components extracted from this document must include Revised BSD License\s+text as described in Section 4\.e of the Trust Legal Provisions and are provided\s+without warranty as described in the Revised BSD License\./gi, license6_b_ii: /This document is subject to BCP 78 and the IETF Trust[’']s Legal Provisions\s+Relating to IETF Documents\s+\(https?:\/\/trustee\.ietf\.org\/license-info\) in effect on the\s+date of publication of this document\. 
Please review these documents carefully, as\s+they describe your rights and restrictions with respect to this document\./gi, license6_c_i: /This document may not be modified, and derivative works of it may not be\s+created, except to format it for publication as an RFC or to translate it into\s+languages other than English\./gi, license6_c_ii: /This document may not be modified, and derivative works of it may not be\s+created, and it may not be published except as an Internet-Draft\./gi, acceptable_paragraph_noting_that_draft: /Internet-Drafts are working documents of the Internet Engineering Task Force \(IETF\)\./gi, draft_paragraph_out6_month_validity: /Internet-Drafts are draft documents valid for a maximum of six months and may be updated, replaced, or obsoleted by other documents at any time\. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress."\s*/gi, draft_paragraph_pointing_to_the_list_of_current_ids: /The list of current Internet-Drafts is at https:\/\/datatracker\.ietf\.org\/drafts\/current\/|The list of current Internet-Drafts can be accessed at http:\/\/www\.ietf\.org\/ietf\/1id-abstracts\.txt\./gi } // Keywords regex pattern const KEYWORDS_PATTERN = /((NOT)\s)?(MUST|REQUIRED|SHALL|SHOULD|RECOMMENDED|OPTIONAL|MAY)(\s(NOT))?/g // Invalid combinations regex pattern const INVALID_COMBINATIONS_PATTERN = /(MUST not|SHALL not|SHOULD not|not RECOMMENDED|MAY NOT|NOT REQUIRED|NOT OPTIONAL)/g // Obsolete and updates regex patterns const OBSOLETES_RE = /(?:obsoletes|replaces)\s*:\s*((?:rfc\s*)?[0-9]+(?:,|\s|and)*\s*)+/gi const UPDATES_RE = /updates\s*:\s*((?:rfc\s*)?[0-9]+(?:,|\s|and)*\s*)+/gi // Consts values const MAX_PAGE_LENGTH = 58 const TITLE_SECTION_LOOKAHEAD = 3 const HEADER_MAX_LINES = 18 /** * @typedef {Object} TXTDocObject * @property {Object} data Parsed TXT tree * @property {string} docKind Whether the document is an Internet Draft (draft) or an RFC (rfc) * @property {string} filename 
/**
 * @typedef {Object} TXTDocObject
 * @property {Object} data Parsed TXT tree (header, content, markers, detected issues, extracted elements)
 * @property {string|null} docKind Whether the document is an Internet Draft (draft) or an RFC (rfc); null when no header line was classified
 * @property {string} body The raw input text, unmodified
 * @property {string} filename Filename of the document
 * @property {string} type Document file type (txt)
 */

/**
 * Run `regex.test(text)` starting from the beginning of `text`.
 *
 * Global (`g`) regexes keep `lastIndex` between calls, so a bare `.test()` on a shared
 * module-level pattern can report `false` for text that does match. `lastIndex` is
 * reset before AND after the probe so later `matchAll()` calls (which copy `lastIndex`)
 * also start from the beginning.
 *
 * @param {RegExp} regex Pattern to probe
 * @param {string} text Text to test
 * @returns {boolean} Whether the pattern matches anywhere in the text
 */
function testFromStart (regex, text) {
  regex.lastIndex = 0
  const found = regex.test(text)
  regex.lastIndex = 0
  return found
}

/**
 * Give `org` to the trailing run of authors that have no organization yet.
 * Walks backwards and stops at the first author whose `org` is already set
 * (an empty string counts as "set").
 *
 * @param {Array<Object>} authors Header authors collected so far (mutated in place)
 * @param {string} org Organization value to assign
 */
function assignOrgToTrailingAuthors (authors, org) {
  for (let idx = authors.length - 1; idx >= 0; idx--) {
    const author = authors[idx]
    if (author.org || author.org === '') {
      break
    }
    author.org = org
  }
}

/**
 * Parse Text document
 *
 * Runs a set of whole-document scans on a whitespace-normalized copy of the text
 * (boilerplate, license declarations, copyright lines, obsoletes / updates lists) and
 * then walks the raw text line by line to extract the header, title, slug, known
 * sections, references and layout issues.
 *
 * @param {string} rawText Input text
 * @param {string} filename Filename of the document
 * @returns {Promise<TXTDocObject>} Parsed document object
 * @throws {ValidationError} TXT_PARSING_FAILED when any step inside the parsing pass throws
 */
export async function parse (rawText, filename) {
  // Undo hard line wraps after "-" and "/" so wrapped words / URLs can be matched as one token
  const rawFixed = rawText
    .replace(/\r?\n/g, '\n')
    .replace(/-\s*\n\s*/g, '-')
    .replace(/\/\s*\n\s*/g, '/')
  const normalizedText = rawFixed.replace(/\s+/g, ' ').trim()
  const pages = rawText.split(/\f/)

  const data = {
    pageCount: 1,
    header: {
      authors: [],
      date: null,
      source: null,
      expires: null
    },
    content: {
      abstract: null,
      introduction: null,
      securityConsiderations: null,
      authorAddress: null,
      references: null,
      ianaConsiderations: null
    },
    contains: {
      copyrightSection6_b_i: false,
      copyrightLicenseValid: false,
      license6_c_i: false,
      license6_c_ii: false,
      revisedBsdLicense: false,
      codeBlocks: false,
      draftParagraphOutSixMonthValidity: false,
      acceptableParagraphNotingThatDraft: false,
      idIndication: false,
      revisedBsdLicense6_i: false,
      submissionCompliance: false,
      pagesFound: 0,
      previous6_b_i_copyright: false
    },
    title: null,
    slug: null,
    possibleIssues: {
      hyphenatedLines: [],
      linesWithSpaces: [],
      unexpectedIndentation: [],
      inlineCode: [],
      misspeled2119Keywords: [],
      pageLineWithFormFeed: [],
      paragraphPointingToTheListOfCurrentId: [],
      copyrightLines6_i: [],
      isCopyrightNoticeNumbered: false,
      isStatusOfThisMemoNumbered: false,
      isAbstractNumbered: false,
      isPKorBM: false,
      updatesRfcWithLetter: [],
      obsoletesWithLetter: [],
      missingPageNumbering: [],
      submissionCompliancePage: null,
      isTableOfContentsExists: false,
      tooLongPages: []
    },
    extractedElements: {
      fqdnDomains: [],
      ipv4: [],
      ipv6: [],
      keywords2119: [],
      boilerplate2119Keywords: [],
      obsoletesRfc: [],
      updatesRfc: [],
      nonReferenceSectionRfc: [],
      referenceSectionRfc: [],
      nonReferenceSectionDraftReferences: [],
      referenceSectionDraftReferences: [],
      copyrightDates: [],
      license6_b_ii: [],
      license6_b_i: [],
      bracketedRfcNonReferences: [],
      bracketedRfcReferences: [],
      lastPageExpiration: null
    },
    boilerplate: {
      rfc2119: testFromStart(BOILERPLATE_PATTERNS.rfc2119, normalizedText) || testFromStart(BOILERPLATE_PATTERNS.rfc2119_alt, normalizedText),
      rfc8174: testFromStart(BOILERPLATE_PATTERNS.rfc8174, normalizedText),
      similar2119boilerplate: false
    },
    references: {
      rfc2119: false,
      rfc8174: false
    }
  }

  let docKind = null
  let lineIdx = 0
  let currentSection = null
  let currentSubSection = null
  let inCodeBlock = false
  let rfcMatch = null
  let draftMatch = null
  let prevLine = null
  let isQuote = false
  let isContinuation = false
  let isPageSeparator = false
  let isTableOfContentSection = false
  let currentPageLineCount = 0

  try {
    const markers = {
      header: { start: 0, end: 0, lastAuthor: 0, closed: false },
      title: 0,
      slug: 0,
      abstract: { start: 0, end: 0, closed: false },
      introduction: { start: 0, end: 0, closed: false },
      securityConsiderations: { start: 0, end: 0, closed: false },
      authorAddress: { start: 0, end: 0, closed: false },
      references: { start: 0, end: 0, closed: false },
      ianaConsiderations: { start: 0, end: 0, closed: false }
    }

    // ------------------------------------------------------------------
    // Whole-document scans (on the whitespace-normalized text)
    // ------------------------------------------------------------------

    // Hyphenated line endings: candidates come from the text with boilerplate stripped
    const cleanedNormalized = removeBoilerplates(normalizedText)
    const candidateFragments = extractCandidateHyphenFragments(cleanedNormalized)
    const hyphenIssues = findHyphenIssuesInRaw(rawText, candidateFragments)
    data.possibleIssues.hyphenatedLines.push(...hyphenIssues)

    // Extracting mentioned rfc2119 keywords
    for (const pattern of Object.values(BOILERPLATE_PATTERNS)) {
      const match = normalizedText.match(pattern)
      if (match) {
        const keywordMatches = match[0].matchAll(KEYWORDS_PATTERN)
        for (const keywordMatch of keywordMatches) {
          const keyword = keywordMatch[0]
          if (!data.extractedElements.boilerplate2119Keywords.includes(keyword)) {
            data.extractedElements.boilerplate2119Keywords.push(keyword)
          }
        }
      }
    }

    // Searching acceptable paragraph pointing to the list of current ids
    const matchParagraphPointingToTheListOfCurrentId = [...normalizedText.matchAll(licenseDeclarations.draft_paragraph_pointing_to_the_list_of_current_ids)]
    if (matchParagraphPointingToTheListOfCurrentId.length) {
      data.possibleIssues.paragraphPointingToTheListOfCurrentId = matchParagraphPointingToTheListOfCurrentId.map((item) => item[0])
    }

    data.boilerplate.similar2119boilerplate = hasBoilerplateMatch(normalizedText, BOILERPLATE_PARTS.rfc2119, BOILERPLATE_PARTS.rfc2119_alt, BOILERPLATE_PARTS.rfc8174) && !(data.boilerplate.rfc2119 || data.boilerplate.rfc8174)

    // Extracting obsolete and updated rfc from header
    const obsoletesRfc = extractRfcNumbers(normalizedText, OBSOLETES_RE)
    data.extractedElements.obsoletesRfc.push(...obsoletesRfc.plainNumbers)
    data.possibleIssues.obsoletesWithLetter.push(...obsoletesRfc.rfcWithPrefix)

    const updatesRfcs = extractRfcNumbers(normalizedText, UPDATES_RE)
    data.extractedElements.updatesRfc.push(...updatesRfcs.plainNumbers)
    data.possibleIssues.updatesRfcWithLetter.push(...updatesRfcs.rfcWithPrefix)

    // Searching for license declaration
    data.contains.revisedBsdLicense = testFromStart(licenseDeclarations.revised_bsd_license, normalizedText)

    // Page separator counting
    const pagesMatches = rawText.match(PAGES_RE)
    data.contains.pagesFound = pagesMatches?.length ?? 0

    // Searching for submission compliance line and pages
    for (let i = 0; i < pages.length; i++) {
      const normalizedPage = pages[i].replace(/\s+/g, ' ').trim()
      if (testFromStart(licenseDeclarations.trust_28_dec_2009_section_6_a, normalizedPage)) {
        data.contains.submissionCompliance = true
        data.possibleIssues.submissionCompliancePage = i + 1
        break
      }
    }

    // Searching acceptable paragraph noting that IDs are working documents
    data.contains.acceptableParagraphNotingThatDraft = testFromStart(licenseDeclarations.acceptable_paragraph_noting_that_draft, normalizedText)

    // Searching acceptable paragraph calling out 6 month validity.
    // Computed once here: testing a global regex once per line made the stored
    // result alternate between true and false.
    data.contains.draftParagraphOutSixMonthValidity = testFromStart(licenseDeclarations.draft_paragraph_out6_month_validity, normalizedText)

    // Extracting expiration date from the last page
    const lastPageExpiration = [...normalizedText.matchAll(EXPIRES_FOOTER_RE)].pop()
    if (lastPageExpiration) {
      const expiresStr = lastPageExpiration[1]
      // An invalid luxon DateTime is still a truthy object, so `a || b` never reaches
      // the second format; pick the first format that actually parses instead.
      const parsed = ['d LLLL yyyy', 'LLLL d, yyyy']
        .map(format => DateTime.fromFormat(expiresStr, format))
        .find(candidate => candidate.isValid)
      if (parsed) {
        data.extractedElements.lastPageExpiration = parsed
      }
    }

    // Searching for copyright line in normalized text
    const copyrightMatches = [...normalizedText.matchAll(licenseDeclarations.tlp5_6_b_i_copyright)]
    if (copyrightMatches.length > 0) {
      data.possibleIssues.copyrightLines6_i = copyrightMatches.map(m => m[0])
      data.contains.copyrightSection6_b_i = true
      data.extractedElements.copyrightDates = copyrightMatches.map(m => Number(m[1]) || Number(m[2]))
    }

    // Search for 6.b license declaration
    const match6bi = [...normalizedText.matchAll(licenseDeclarations.license6_b_i)]
    // Search for 6.b.ii license declaration
    const match6bii = [...normalizedText.matchAll(licenseDeclarations.license6_b_ii)]
    if (match6bii.length > 0) {
      data.extractedElements.license6_b_ii = match6bii.map(m => m[0])
    }
    if (match6bi.length > 0) {
      data.contains.revisedBsdLicense6_i = true
      data.extractedElements.license6_b_i = match6bi.map(m => m[0])
    }

    // Search for 6.c licenses declaration
    data.contains.license6_c_i = testFromStart(licenseDeclarations.license6_c_i, normalizedText)
    data.contains.license6_c_ii = testFromStart(licenseDeclarations.license6_c_ii, normalizedText)

    // Searching for copyright line
    data.contains.previous6_b_i_copyright = testFromStart(licenseDeclarations.previous_tlp4_6_b_i, normalizedText)

    // ------------------------------------------------------------------
    // Line-by-line pass (on the raw text)
    // ------------------------------------------------------------------
    for (const line of rawText.split('\n')) {
      const trimmedLine = line.trim()
      // Every whitespace character (tabs, CR, form feed, NBSP, ...) becomes a plain space
      const normalizedLine = line.replace(/\s/g, ' ')
      lineIdx++

      // Pages not numbered
      if (prevLine && !prevLine.includes('[Page') && line.includes('\f')) {
        data.possibleIssues.missingPageNumbering.push({ page: data.pageCount, lines: lineIdx })
      }
      prevLine = line

      // Page Break
      // --------------------------------------------------------------
      if (line.includes('\f')) {
        data.pageCount++
        if (line.includes('[Page')) {
          data.possibleIssues.pageLineWithFormFeed.push({ page: data.pageCount - 1, lines: lineIdx })
        }
        if (currentPageLineCount > MAX_PAGE_LENGTH) {
          data.possibleIssues.tooLongPages.push({ page: data.pageCount - 1, lines: currentPageLineCount })
        }
        currentPageLineCount = 0
        continue
      } else {
        currentPageLineCount++
      }

      // Empty line
      // --------------------------------------------------------------
      if (!trimmedLine) {
        isContinuation = false
        isPageSeparator = false
        continue
      }

      // Check line spaces
      // NOTE(review): startsWith() compares literally, so 'Internet.Draft' only matches a
      // literal dot; if a "-" / any-character match was intended this needs a regex - confirm.
      if (SPACING_PATTERN.test(trimmedLine) && !trimmedLine.startsWith('Internet.Draft') && !trimmedLine.startsWith('INTERNET.DRAFT')) {
        data.possibleIssues.linesWithSpaces.push({ line: lineIdx, pos: line.length })
      }

      // Code block detection
      if (/<CODE BEGINS>/i.test(trimmedLine)) {
        data.contains.codeBlocks = true
        inCodeBlock = true
      }
      if (/<CODE ENDS>/i.test(trimmedLine)) {
        inCodeBlock = false
      }

      // Check for inline code format outside code blocks
      if (!inCodeBlock) {
        // Always search this line from its start (the pattern may carry a `g` flag)
        INLINE_CODE_FORMAT.lastIndex = 0
        const match = INLINE_CODE_FORMAT.exec(line)
        INLINE_CODE_FORMAT.lastIndex = 0
        if (match) {
          data.possibleIssues.inlineCode.push({ line: lineIdx, pos: match.index + 1 })
        }
      }

      // Search for bad indentations. Starts search after detecting abstract section to avoid false positives
      if (TABLE_OF_CONTENTS.test(normalizedLine)) {
        // Detecting TOC section to avoid false positives
        isTableOfContentSection = true
      }
      if (isTableOfContentSection && !TOC_CONTENT_LINE_RE.test(trimmedLine) && SECTION_PATTERN.test(trimmedLine)) {
        // Exiting TOC section when new section is detected
        isTableOfContentSection = false
      }
      if (SECTION_HEADER_REGEX.test(normalizedLine)) {
        isContinuation = true
      }
      if (normalizedLine.includes('[Page')) {
        isPageSeparator = true
      }
      if (markers.abstract.start && !isTableOfContentSection) {
        if (SECTION_TITLE_WITH_INDENTATION_RE.test(normalizedLine)) {
          data.possibleIssues.unexpectedIndentation.push({ line: lineIdx, pos: 0 })
        } else if (INCORRECT_INDENTATION.test(normalizedLine) && !(isContinuation || isQuote || isPageSeparator) && !UNNUMBERED_SECTION_TITLES_RE.test(normalizedLine)) {
          data.possibleIssues.unexpectedIndentation.push({ line: lineIdx, pos: 0 })
        }
      }
      if (REFERENCE_LINE_RE.test(normalizedLine) && currentSection === 'references') {
        isContinuation = true
      }
      if (trimmedLine.endsWith(':')) {
        isQuote = true
      } else if (QUOTE_CLOSE_RE.test(trimmedLine)) {
        isQuote = false
      }

      // Extract rfc references from whole text except for the reference section
      while ((rfcMatch = RFC_REFERENCE_RE.exec(trimmedLine)) !== null) {
        const rfcNumber = rfcMatch[1] || rfcMatch[2]
        if (currentSection !== 'references') {
          if (rfcNumber && !data.extractedElements.nonReferenceSectionRfc.includes(rfcNumber)) {
            data.extractedElements.nonReferenceSectionRfc.push(rfcNumber)
          }
        } else if (BRACKETED_RFC_REFERENCE_RE.exec(trimmedLine)) {
          if (rfcNumber && !data.extractedElements.referenceSectionRfc.find((el) => el.value === rfcNumber)) {
            data.extractedElements.referenceSectionRfc.push({ value: rfcNumber, subsection: currentSubSection })
          }
        }

        // Detect bracketed RFC references
        if (rfcMatch[0]) {
          if (BRACKETED_RFC_REFERENCE_RE.test(rfcMatch[0])) {
            if (currentSection === 'references') {
              data.extractedElements.bracketedRfcReferences.push(rfcMatch[0])
            } else {
              data.extractedElements.bracketedRfcNonReferences.push(rfcMatch[0])
            }
          }
        }
      }

      // Detect draft references
      while ((draftMatch = NON_RFC_REFERENCE_RE.exec(trimmedLine)) !== null) {
        const draftName = draftMatch[0]
        if (currentSection !== 'references') {
          if (!data.extractedElements.nonReferenceSectionDraftReferences.includes(draftName)) {
            data.extractedElements.nonReferenceSectionDraftReferences.push(draftName)
          }
        } else {
          if (!data.extractedElements.referenceSectionDraftReferences.find((el) => el.value === draftName)) {
            data.extractedElements.referenceSectionDraftReferences.push({ value: draftName, subsection: currentSubSection })
          }
        }
      }

      if (STATUS_OF_THIS_MEMO_RE.test(trimmedLine)) {
        data.possibleIssues.isStatusOfThisMemoNumbered = true
      }
      if (FIRST_LINE_RE.test(trimmedLine) && lineIdx === 1) {
        data.possibleIssues.isPKorBM = true
      }
      if (TABLE_OF_CONTENTS.test(trimmedLine)) {
        data.possibleIssues.isTableOfContentsExists = true
      }

      // Check for references
      if (/\[RFC2119\]/i.test(trimmedLine)) {
        data.references.rfc2119 = true
      }
      if (/\[RFC8174\]/i.test(trimmedLine)) {
        data.references.rfc8174 = true
      }

      // Check for keywords
      for (const match of trimmedLine.matchAll(KEYWORDS_PATTERN)) {
        data.extractedElements.keywords2119.push({ keyword: match[0], line: lineIdx })
      }

      // Check for invalid keyword combinations
      for (const match of line.matchAll(INVALID_COMBINATIONS_PATTERN)) {
        data.possibleIssues.misspeled2119Keywords.push({ invalidKeyword: match[0], line: lineIdx, pos: match.index + 1 })
      }

      // FQDN Domain extraction
      for (const match of trimmedLine.matchAll(FQDN_RE)) {
        data.extractedElements.fqdnDomains.push(match.groups.domain)
      }

      // IPv4 and IPv6 extraction
      for (const match of trimmedLine.matchAll(IPV4_LOOSE_RE)) {
        data.extractedElements.ipv4.push(match[0])
      }
      for (const match of trimmedLine.matchAll(IPV6_LOOSE_RE)) {
        data.extractedElements.ipv6.push(match[0])
      }

      // Header
      // --------------------------------------------------------------
      if (!markers.header.start) {
        // -> First Line
        markers.header.start = lineIdx
        markers.header.end = lineIdx
        const values = LINE_VALUES_EXTRACT_RE.exec(trimmedLine)
        // --> Source
        data.header.source = values?.groups.left
        // --> Author
        data.header.authors.push({ name: values?.groups.right })
        markers.header.lastAuthor = lineIdx
        continue
      } else if (!markers.header.closed) {
        if (lineIdx > markers.header.end + 1) {
          // A gap after the last header line: the header is over and this line is the title
          markers.header.closed = true
          markers.title = lineIdx
          data.title = trimmedLine
        } else {
          markers.header.end = lineIdx
          const extractedValues = LINE_VALUES_EXTRACT_RE.exec(line)
          const values = extractedValues ? extractedValues.groups : { left: trimmedLine, right: null }

          if (values.left) {
            // --> Date
            const match = values.left.match(DATE_RE)
            if (match) {
              const day = parseInt(match[1], 10)
              const month = match[2]
              const year = parseInt(match[3], 10)
              data.header.date = { day, month, year }
            }

            // --> Document Kind
            if (values.left.includes('Internet-Draft')) {
              docKind = 'draft'
              data.contains.idIndication = true
            } else if (values.left.startsWith('Request for Comments')) {
              data.header.rfcNumber = values.left.split(':')[1]?.trim()
              docKind = 'rfc'
            } else if (filename?.startsWith('rfc')) {
              const match = filename.match(/rfc(\d+)\.txt$/i)
              if (match) {
                data.header.rfcNumber = match[1]
                docKind = 'rfc'
              }
            } else if (docKind === null) {
              // Default only while nothing has been identified yet, so an unrelated later
              // header line cannot overwrite an earlier positive identification.
              docKind = 'draft'
            }

            // --> Intended status
            if (values.left.startsWith('Intended')) {
              const rawIntendedStatus = values.left.split(':')[1]?.trim()
              const cleanIntendedStatus = extractStatusName(rawIntendedStatus)
              data.header.intendedStatus = cleanIntendedStatus || rawIntendedStatus
            }

            // --> Obsoletes
            if (values.left.startsWith('Obsoletes')) {
              const obsoletesValues = values.left.split(':')[1]?.trim()
              // No colon on the line means there is no value to split
              if (obsoletesValues !== undefined) {
                data.header.obsoletes = obsoletesValues.split(',').map(o => o.trim())
              }
            }

            // --> Category
            if (values.left.startsWith('Category')) {
              const rawCategory = values.left.split(':')[1]?.trim()
              const cleanCategory = extractStatusName(rawCategory)
              data.header.category = cleanCategory || rawCategory
            }

            // --> ISSN
            if (values.left.startsWith('ISSN')) {
              data.header.issn = values.left.split(':')[1]?.trim()
            }

            // --> Expires
            if (values.left.startsWith('Expires')) {
              const datePart = values.left.split(':')[1]?.trim().split(/\s{2,}/)[0]
              // Accepts "Month D, YYYY", "D Month YYYY" (with -, / or , separators) and ISO "YYYY-MM-DD"
              const EXPIRES_DATE_RE = /(?:(?<month>[A-Za-z]{3,9})[\s]+(?<day>\d{1,2}),?\s*(?<year>\d{4}))|(?:(?<dayAlt>\d{1,2})[\s\-/]*(?<monthAlt>[A-Za-z]{3,9})[\s\-/,]*(?<yearAlt>\d{4}))|(?<iso>\d{4}-\d{2}-\d{2})/
              const dateValue = EXPIRES_DATE_RE.exec(datePart)
              if (dateValue) {
                if (dateValue.groups.iso) {
                  data.header.expires = DateTime.fromISO(dateValue.groups.iso)
                } else {
                  const day = dateValue.groups.day || dateValue.groups.dayAlt || 1
                  const month = dateValue.groups.month || dateValue.groups.monthAlt
                  const year = dateValue.groups.year || dateValue.groups.yearAlt
                  data.header.expires = DateTime.fromFormat(
                    `${day} ${month} ${year}`,
                    'd LLLL yyyy'
                  )
                }
              }
            }
          }

          if (values.right) {
            // --> Date
            const dateValue = DATE_RE.exec(values.right)
            if (dateValue) {
              const day = parseInt(dateValue[1], 10)
              const month = dateValue[2]
              const year = parseInt(dateValue[3], 10)
              data.header.date = { day, month, year }
            }

            // The right column lists authors / organizations until the date shows up
            if (!data.header.date) {
              // --> Author
              const authorNameValue = AUTHOR_NAME_RE.exec(values.right)
              if (authorNameValue) {
                // --> Blank line = Previous author(s) have no affiliation
                if (lineIdx > markers.header.lastAuthor + 1) {
                  assignOrgToTrailingAuthors(data.header.authors, '')
                }
                // --> Author Name
                data.header.authors.push({ name: authorNameValue[0] })
              } else {
                // --> Author Org
                assignOrgToTrailingAuthors(data.header.authors, values.right)
              }
              markers.header.lastAuthor = lineIdx
            }
          }
        }
      }

      // Slug: looked for right below the title, or near the top while no title is known yet
      if ((data.title && lineIdx <= markers.title + TITLE_SECTION_LOOKAHEAD) || (!data.title && lineIdx < HEADER_MAX_LINES)) {
        if (trimmedLine.startsWith('draft-')) {
          markers.slug = lineIdx
          data.slug = trimmedLine
          continue
        }
      }

      if (COPYRIGHT_NOTICE_RE.test(trimmedLine)) {
        data.possibleIssues.isCopyrightNoticeNumbered = true
      }

      // Abstract
      // --------------------------------------------------------------
      if (trimmedLine === 'Abstract' || ABSTRACT_RE.test(trimmedLine)) {
        markers.abstract.start = lineIdx
        currentSection = 'abstract'
        data.content.abstract = []
      } else if (markers.abstract.start && !markers.abstract.closed) {
        if (trimmedLine.startsWith('Status of') || !line.startsWith(' ')) {
          markers.abstract.end = lineIdx - 1
          markers.abstract.closed = true
        }
      }

      // Lines that still belong to the header never take part in section detection.
      // (header.start is always set by now and header.end already equals lineIdx.)
      if (!markers.header.closed) {
        continue
      }

      // Section detection and content assignment
      if ((SECTION_PATTERN.test(trimmedLine) || AUTHOR_SECTION_RE.test(trimmedLine)) && !ABSTRACT_RE.test(trimmedLine)) {
        const matchedSection = sectionMatchers.find(({ regex }) => regex.test(trimmedLine))

        if (currentSection && !markers[currentSection].closed) {
          markers[currentSection].end = lineIdx - 1
          markers[currentSection].closed = true
        }

        if (matchedSection) {
          currentSection = matchedSection.name
          markers[currentSection].start = lineIdx
          data.content[currentSection] = []
        } else {
          currentSection = null
        }
      }

      // Sub section detection
      if (testFromStart(SUBSECTION_PATTERN, trimmedLine) && !TOC_PATTERN.test(trimmedLine)) {
        const matchedSubsection = subsectionMatchers.find(({ regex }) => regex.test(trimmedLine))
        currentSubSection = matchedSubsection ? matchedSubsection.name : null
      }

      // Add content to the current section
      if (currentSection && markers[currentSection].start && !markers[currentSection].closed) {
        data.content[currentSection].push(trimmedLine)
      }
    }

    // Close the last section
    if (currentSection && !markers[currentSection].closed) {
      markers[currentSection].end = lineIdx
      markers[currentSection].closed = true
    }

    if (data.content?.abstract?.length) {
      const firstLine = data.content.abstract[0].trim()
      if (ABSTRACT_RE.test(firstLine)) {
        data.possibleIssues.isAbstractNumbered = true
      }
    }

    data.markers = markers
  } catch (err) {
    throw new ValidationError('TXT_PARSING_FAILED', `Error while parsing Line ${lineIdx}: ${err.message}`)
  }

  return {
    docKind,
    body: rawText,
    data,
    filename,
    type: 'txt'
  }
}

/**
 * Function to check if at least one of the specified groups of patterns starts matching the text.
 *
 * Parts of a group are checked in order and checking stops at the first part that does
 * not match; a group counts as a hit as soon as at least its first part matched.
 * Repeated calls with the same arguments always give the same answer, even when the
 * parts are global regexes.
 *
 * @param {string} text Normalized text
 * @param {...Array<RegExp>} regexGroups Arrays of patterns to check
 * @returns {boolean} Whether at least one match is found
 */
function hasBoilerplateMatch (text, ...regexGroups) {
  for (const group of regexGroups) {
    let matchCount = 0
    for (const part of group) {
      if (!testFromStart(part, text)) {
        break
      }
      matchCount++
    }
    if (matchCount > 0) {
      return true
    }
  }
  return false
}

/**
 * Extract RFC numbers from the text
 *
 * Every match of `regex` is scanned for numbers, with or without an "RFC" prefix.
 * The regex is copied with the global flag enforced, so a non-global pattern cannot
 * loop forever and the caller's `lastIndex` is never read or modified.
 *
 * @param {string} text Text to extract RFC numbers from
 * @param {RegExp} regex Regular expression locating the list(s) of RFC numbers
 * @returns {{rfcWithPrefix: Array<string>, plainNumbers: Array<string>}}
 *   `plainNumbers` holds every number found (prefix stripped); `rfcWithPrefix` holds the
 *   entries that were written with an "RFC" prefix, as found in the text
 */
function extractRfcNumbers (text, regex) {
  const matches = { rfcWithPrefix: [], plainNumbers: [] }
  const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`
  const scanner = new RegExp(regex.source, flags)

  for (const match of String(text ?? '').matchAll(scanner)) {
    const numbers = match[0].match(/\b(RFC\s*[0-9]+|[0-9]+)\b/gi) ?? []
    for (const rawNumber of numbers) {
      const num = rawNumber.trim()
      if (/^RFC\s*[0-9]+$/i.test(num)) {
        matches.rfcWithPrefix.push(num)
        matches.plainNumbers.push(num.replace(/^RFC\s*/i, ''))
      } else {
        matches.plainNumbers.push(num)
      }
    }
  }

  return matches
}

/**
 * Extracts the clean status name from a given status text.
 *
 * Walks `rfcStatusHierarchy` in order and returns the `name` of the first entry whose
 * `regex` matches the text.
 *
 * @param {string} [statusText] - The raw status text taken from a header line.
 * @returns {string|null} - The name of the first matching status, or `null` when the
 *   text is empty / missing or no status matches.
 */
function extractStatusName (statusText) {
  if (!statusText) {
    return null
  }
  for (const status of rfcStatusHierarchy) {
    if (testFromStart(status.regex, statusText)) {
      return status.name
    }
  }
  return null
}

/**
 * Removes boilerplate sections from the provided text using the regular expressions
 * defined in licenseDeclarations, applied one after another in declaration order.
 *
 * @param {string} text - The input text.
 * @returns {string} - The text with boilerplate sections removed.
 */
function removeBoilerplates (text) {
  return Object.values(licenseDeclarations).reduce(
    (cleaned, pattern) => cleaned.replace(pattern, ''),
    text
  )
}

/**
 * Escapes special regex characters in a string to safely use it in a regular expression.
 *
 * @param {string} str - The input string.
 * @returns {string} - The escaped string.
 */
function escapeRegExp (str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

/**
 * Extracts candidate hyphen fragments from the cleaned text.
 * It looks for a pattern where a word (prevWord) is followed by whitespace and then
 * another word ending with a hyphen (fragment), ensuring that a letter follows the hyphen.
 *
 * @param {string} cleanedText - The text after boilerplate sections have been removed.
 * @returns {Array<Object>} - An array of objects with properties:
 *    - prevWord: the word preceding the hyphenated fragment,
 *    - fragment: the hyphenated fragment (word ending with a hyphen).
 */
function extractCandidateHyphenFragments (cleanedText) {
  const pattern = /(\b[^\s-]+)\s+([^\s-]+-)(?=[A-Za-zА-Яа-яІіЇїЄє])/g
  return Array.from(
    cleanedText.matchAll(pattern),
    ([, prevWord, fragment]) => ({ prevWord, fragment })
  )
}

/**
 * Finds hyphenation issues in the raw text using the candidate fragments.
 * A line is reported when it ends with a hyphen (ignoring surrounding whitespace) and
 * contains a candidate sequence (previous word, whitespace, hyphenated fragment) that is
 * not directly followed by a letter.
 *
 * Each line is reported at most once, and identical candidates are only checked once,
 * so a fragment that occurs many times in the document does not multiply the issues.
 * Only the lines ending with a hyphen are kept for matching.
 *
 * @param {string} rawText - The original raw text.
 * @param {Array<Object>} candidateFragments - Array of candidate objects with properties prevWord and fragment.
 * @returns {Array<Object>} - An array of issue objects, each containing:
 *    - line: the line number in which the issue was found,
 *    - pos: the position (length) of the line.
 */
function findHyphenIssuesInRaw (rawText, candidateFragments) {
  const issues = []

  const hyphenEndedLines = []
  rawText.split('\n').forEach((line, index) => {
    if (line.trim().endsWith('-')) {
      hyphenEndedLines.push({ line, number: index + 1 })
    }
  })
  if (hyphenEndedLines.length === 0) {
    return issues
  }

  const seenCandidates = new Set()
  const reportedLines = new Set()

  for (const { prevWord, fragment } of candidateFragments) {
    const candidateKey = `${prevWord}\u0000${fragment}`
    if (seenCandidates.has(candidateKey)) {
      continue
    }
    seenCandidates.add(candidateKey)

    const regex = new RegExp(`\\b${escapeRegExp(prevWord)}\\s+${escapeRegExp(fragment)}(?![A-Za-zА-Яа-яІіЇїЄє])`)
    for (const { line, number } of hyphenEndedLines) {
      if (!reportedLines.has(number) && regex.test(line)) {
        reportedLines.add(number)
        issues.push({ line: number, pos: line.length })
      }
    }
  }

  return issues
}