@slash-tmp/rgaa-scraper
Version:
Scrapes RGAA's website and fetches topics, criteria and tests into a JSON format.
113 lines • 4.33 kB
JavaScript
;
// Interop helper applied to `require()` results below. Reuses an existing
// `this.__importDefault` when one is present; otherwise a module flagged with
// `__esModule` is returned untouched and anything else is wrapped so that it
// is reachable through the `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseTopicA = exports.parseTestLi = exports.parseCriteriaArticle = void 0;
const cheerio_1 = __importDefault(require("cheerio"));
const utils_1 = require("./utils");
// Matches a whole criterion heading of the form "Critère 1.2. <title>".
// Group 1 is the dotted number including its trailing dot ("1.2."), group 2 is
// only the last "<digits>." repetition, and group 3 is the text after the space.
const criteriaRegEx = /^Critère ((\d+\.)+) (.*)$/;
/**
 * Renders a list of aside elements as plain text, one entry per line.
 *
 * A `ul` element contributes one "- "-prefixed line per direct child; any
 * other element contributes its own text. Every text is trimmed and then
 * passed through `utils_1.reduceWhitespaces`.
 *
 * @param {Array} elements - Elements to render; each one is wrapped with cheerio first.
 * @returns {string} The rendered entries joined with "\n" ("" when there are none).
 */
function parseCriterionAside(elements) {
    const cleanText = (node) => utils_1.reduceWhitespaces(node.text().trim());
    const entries = [];
    for (const element of elements) {
        const wrapped = cheerio_1.default(element);
        if (!wrapped.is('ul')) {
            entries.push(cleanText(wrapped));
            continue;
        }
        // Lists are flattened into one bullet line per direct child.
        const bullets = wrapped
            .children()
            .toArray()
            .map((child) => `- ${cleanText(cheerio_1.default(child))}`);
        entries.push(bullets.join('\n'));
    }
    return entries.join('\n');
}
/**
 * Builds a criterion record from the cheerio selection of its article.
 *
 * Note: every `button` element inside the article is removed as a side effect.
 *
 * @param articleCheerio - Cheerio selection of the criterion's article element.
 * @returns {object} `{ id, title, references: { techniques, wcag }, level }`,
 *   plus `technicalNotes` / `particularCases` when the matching aside section
 *   produced a non-empty text. `level` is 'AA' when any reference text
 *   contains "(AA)", otherwise 'A'.
 * @throws {Error} When the `h4` text does not match `criteriaRegEx` or its
 *   number/title groups are empty.
 */
function parseCriteriaArticle(articleCheerio) {
    const heading = articleCheerio.find('h4');
    // Drop the buttons first so their labels do not end up in the title text.
    articleCheerio.find('button').remove();
    const headingText = utils_1.reduceWhitespaces(heading.text().trim());
    const parsed = headingText.match(criteriaRegEx);
    if (!(parsed && parsed[1] && parsed[3])) {
        throw new Error('Cant parse criteria : ' + heading.text());
    }
    const [, dottedNumber, , title] = parsed;
    // Group 1 ends with a dot ("1.2."); filtering removes the empty segment it yields.
    const id = dottedNumber.split('.').filter(Boolean).join('.');

    // Collect the aside children that follow each recognised h5 heading.
    const sections = { technicalNotes: [], particularCases: [] };
    const sectionFor = (label) => {
        if (label.includes('Notes techniques')) {
            return 'technicalNotes';
        }
        if (label.includes('Cas particuliers')) {
            return 'particularCases';
        }
        // Any other heading closes the current section.
        return null;
    };
    let activeSection = null;
    for (const rawNode of articleCheerio.find('.aside > *').toArray()) {
        const node = cheerio_1.default(rawNode);
        if (node.is('h5')) {
            activeSection = sectionFor(node.text());
        }
        else if (activeSection) {
            sections[activeSection].push(node);
        }
    }
    const technicalNotes = parseCriterionAside(sections.technicalNotes);
    const particularCases = parseCriterionAside(sections.particularCases);

    const techniqueLinks = articleCheerio
        .find('li:contains("Technique(s) suffisante(s) et/ou échec(s) WCAG 2.1") a')
        .toArray();
    const techniques = techniqueLinks.map((link) => cheerio_1.default(link).text());
    const referenceItems = articleCheerio
        .find('h5:contains("Correspondances EN 301 549 V2.1.2 (2018-08)") + ul > li')
        .toArray();
    // The last character of each item is always dropped (presumably trailing
    // punctuation -- TODO confirm against the scraped markup).
    const wcagReferences = referenceItems.map((item) => cheerio_1.default(item).text().trim().slice(0, -1));
    const level = wcagReferences.some((ref) => ref.includes('(AA)')) ? 'AA' : 'A';

    // Both reference lists are always present, even when empty; the two text
    // fields are only added when non-empty. Key order: id, title, references,
    // technicalNotes, particularCases, level.
    const criterion = {
        id,
        title,
        references: { techniques, wcag: wcagReferences },
    };
    if (technicalNotes) {
        criterion.technicalNotes = technicalNotes;
    }
    if (particularCases) {
        criterion.particularCases = particularCases;
    }
    criterion.level = level;
    return criterion;
}
exports.parseCriteriaArticle = parseCriteriaArticle;
/**
 * Builds a test record from the cheerio selection of its list item.
 *
 * The id comes from the element's `id` attribute: everything after the first
 * "-"-separated segment, re-joined with dots ("test-1-2-3" becomes "1.2.3").
 * The title is the text of the first `p`, followed by one "- " line per
 * `ul li p` descendant when there are any.
 *
 * @param liCheerio - Cheerio selection of the test's `li` element.
 * @returns {{id: string, title: string}} The parsed test.
 * @throws {Error} When the element has no (or an empty) `id` attribute.
 */
function parseTestLi(liCheerio) {
    const rawId = liCheerio.attr('id');
    if (!rawId) {
        throw new Error('Cant parse test : no id attribute');
    }
    // Drop the leading segment (the "test" prefix in the expected format).
    const [, ...idSegments] = rawId.split('-');
    const id = idSegments.join('.');

    const cleanText = (node) => utils_1.reduceWhitespaces(node.text()).trim();
    const intro = cleanText(liCheerio.find('p').first());
    const bulletLines = liCheerio
        .find('ul li p')
        .toArray()
        .map((paragraph) => `- ${cleanText(cheerio_1.default(paragraph))}`);
    const listText = bulletLines.join('\n');
    const title = listText.length ? `${intro}\n${listText}` : `${intro}`;
    return { id, title };
}
exports.parseTestLi = parseTestLi;
// Matches "<one or two digits>. <title>" anywhere in the text: group 1 is the
// topic number, group 2 the rest of that line.
const topicRegex = /(\d\d?)\. (.+)/;
/**
 * Builds a topic record from the cheerio selection of its link.
 *
 * The title is trimmed so that whitespace surrounding the link text does not
 * leak into the result; a title made only of whitespace is treated as
 * unparsable.
 *
 * @param aCheerio - Cheerio selection of the topic's `a` element (only `.text()` is used).
 * @returns {{id: string, title: string}} The topic number and its trimmed title.
 * @throws {Error} When the text holds no "<number>. <title>" pattern or the title is blank.
 */
function parseTopicA(aCheerio) {
    const text = aCheerio.text();
    const match = text.match(topicRegex);
    const title = match && match[2] ? match[2].trim() : '';
    if (!match || !match[1] || !title) {
        throw new Error('Cant parse topic : ' + text);
    }
    return {
        id: match[1],
        title,
    };
}
exports.parseTopicA = parseTopicA;
//# sourceMappingURL=parser.js.map