w3c-xml-validator
Version:
A library for programmatically submitting XML for online validation using W3C's Markup Validation Service (https://validator.w3.org/check).
186 lines (154 loc) • 5.96 kB
JavaScript
const debug = require('debug')('w3c-xml-validator')
const FormData = require('form-data')
const htmlParser = require('node-html-parser')
/**
* `URL` only became available in Node v10. This is included to support Node v8.
*/
const URL = (global.URL || require('whatwg-url').URL)
/**
* Submits the provided XML to the W3C online validation tool. Returns the
* provided result.
*
* @param {String} src The XML to validate. It should reference a publicly-
* available DTD.
*
* @return {Promise}
*/
async function submitValidation (src) {
return new Promise((resolve, reject) => {
src = src || ''
if (src === '') {
reject(new Error('The input parameter is required and must be a non-empty string value.'))
return
}
const form = new FormData()
/**
* These values were determined empirically, by analyzing a request that was
* submitted manually with a web browser.
*/
form.append('fragment', src)
form.append('prefill', '0')
form.append('doctype', 'Inline')
form.append('prefill_doctype', 'html401')
form.append('group', '0')
/**
* This is the web address that the form data will be submitted to. If the
* special parameter value '%%TIMEOUT%%' is used, this will cause the code
* to query an unreachable host (thus testing the behavior in the case of
* ECONNREFUSED).
*/
const dest = (
src === '%%TIMEOUT%%'
? new URL('http://localhost:65001')
: new URL('https://validator.w3.org/check') // this is the normal target
)
debug('Submitting form to %s...', dest)
const startTime = Date.now()
form.submit(
{
protocol: dest.protocol,
host: dest.host,
path: dest.pathname,
headers: {
referrer: dest.origin
}
},
(err, response) => {
debug('...done!')
if (err) {
reject(err)
return
}
/**
* This is the data received back from W3C.
* @type {Array}
*/
const responseData = []
/**
* This is the value that will eventually be returned by this promise.
* @type {Object}
*/
const validationResult = {
isValid: true,
warnings: [],
errors: []
}
response.on('data', (chunk) => {
responseData.push(chunk.toString())
})
response.on('end', () => {
const endTime = Date.now()
debug(' recevied response in %d sec', ((endTime - startTime) / 1000))
debug(' code: %d', response.statusCode)
debug(' headers: %o', response.headers)
const body = responseData.join('')
debug(' length: %d', body.length)
/**
* Anything other than a 200 indicates a problem with the underlying
* HTTP transmission. In that case, abort!
*
* NOTE: this has nothing to do with whether or not the XML is valid.
* W3C actually understands how to use HTTP response codes correctly.
*/
if (response.statusCode !== 200) {
debug(' body: %s', body)
reject(new Error(`The remote server replied with a ${response.statusCode} status code.`))
return
}
/**
* The returned HTML, represented as a basic DOM. This makes it easier
* to parse the results and find the data of interest.
* @type {Object}
*/
const htmlDocument = htmlParser.parse(body)
/**
* Currently, the reported DOCTYPE that W3C assumed is reported in an
* H2 element near the top of the page.
*/
const resultsHeading = htmlDocument.querySelector('#results_container').childNodes[4]
debug('Found results heading node %o', resultsHeading) // these are present to help troubleshoot in case the structure of the HTML changes
/**
* The following two lines are necessary to extract the DOCTYPE from
* the sentence that contains it.
*/
validationResult.doctype = resultsHeading
.childNodes[0]
.rawText
.replace(/\s+/g, ' ')
.replace(/!$/, '')
validationResult.doctype = validationResult.doctype.substring(validationResult.doctype.lastIndexOf(' ') + 1)
/**
* Warnings and errors are contained in separate OL elements futher
* down the page. Fortunately, they have unique identifiers.
*/
const warnings = htmlDocument.querySelector('#warnings')
warnings.childNodes.forEach((child) => {
if (child.nodeType === 1) {
debug('Found warning: %s', child.structure)
const target = child.childNodes[0].childNodes[2]
debug(' third great-grandchild node %o', target)
validationResult.warnings.push(target.rawText)
}
})
const errors = htmlDocument.querySelector('#error_loop')
if (errors) {
validationResult.isValid = false
errors.childNodes.forEach((child) => {
if (child.nodeType === 1 && child.classNames.indexOf('msg_err') > -1) {
debug('Found error: %s', child.structure)
const line = child.childNodes[3]
const msg = child.childNodes[5]
debug(' third grandchild node %o', line)
debug(' sixth grandchild node %o', msg)
validationResult.errors.push(`${line.text.substring(0, line.text.indexOf(','))}: ${msg.text}`)
}
})
}
resolve(validationResult)
})
response.resume() // it's important to setup the event handlers before doing this
}
)
})
}
module.exports = submitValidation