UNPKG

@slash-tmp/rgaa-scraper

Version:

Scrapes RGAA's website and fetches topics, criteria and tests into a JSON format.

github.com/slash-tmp/rgaa-scraper

slash-tmp/rgaa-scraper

113 lines • 4.33 kB

JavaScript

"use strict";
// CommonJS interop helper emitted by the TypeScript compiler: wraps a non-ES
// module so that it can be accessed through a `.default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseTopicA = exports.parseTestLi = exports.parseCriteriaArticle = void 0;
const cheerio_1 = __importDefault(require("cheerio"));
const utils_1 = require("./utils");

// Matches a criterion heading: the literal word "Critère", a dotted number
// such as "1.2." (capture 1), a space, then the remaining title (capture 3).
const criteriaRegEx = /^Critère ((\d+\.)+) (.*)$/;

/**
 * Flattens the elements of one aside section into a single string.
 *
 * A `ul` element contributes one "- "-prefixed line per direct child; any
 * other element contributes its trimmed text. Every text is passed through
 * `utils.reduceWhitespaces` (presumably collapsing whitespace runs - confirm
 * in ./utils). All pieces are joined with "\n".
 *
 * @param {Array} elements - elements (or cheerio wrappers) to serialise.
 * @returns {string} the joined text; an empty string for an empty list.
 */
function parseCriterionAside(elements) {
    const blocks = [];
    for (const rawEl of elements) {
        const node = cheerio_1.default(rawEl);
        if (!node.is('ul')) {
            blocks.push(utils_1.reduceWhitespaces(node.text().trim()));
            continue;
        }
        const bullets = [];
        for (const item of node.children().toArray()) {
            const itemText = cheerio_1.default(item).text().trim();
            bullets.push('- ' + utils_1.reduceWhitespaces(itemText));
        }
        blocks.push(bullets.join('\n'));
    }
    return blocks.join('\n');
}

/**
 * Extracts a criterion description from a cheerio selection of its article.
 *
 * Side effect: every `button` element inside the article is removed from the
 * underlying document before the heading text is read.
 *
 * @param articleCheerio - cheerio selection wrapping the criterion article.
 * @returns {object} `{ id, title, references: { techniques, wcag }, level }`,
 *   plus `technicalNotes` and/or `particularCases` when those aside sections
 *   produce a non-empty string.
 * @throws {Error} when the `h4` text does not match `criteriaRegEx`.
 */
function parseCriteriaArticle(articleCheerio) {
    const heading = articleCheerio.find('h4');
    // Drop the buttons first so that their label is not part of the title.
    articleCheerio.find('button').remove();
    const headingText = utils_1.reduceWhitespaces(heading.text().trim());
    const parsed = headingText.match(criteriaRegEx);
    const hasIdAndTitle = Boolean(parsed && parsed[1] && parsed[3]);
    if (!hasIdAndTitle) {
        throw new Error(`Cant parse criteria : ${articleCheerio.find('h4').text()}`);
    }
    // "1.2." -> "1.2": the trailing dot yields an empty segment that is dropped.
    const id = parsed[1]
        .split('.')
        .filter(segment => segment.length > 0)
        .join('.');
    const title = parsed[3];

    // Walk the direct children of `.aside`: each h5 selects (or resets) the
    // bucket that the following sibling elements are collected into.
    const bucketByHeading = [
        ['Notes techniques', 'technicalNotes'],
        ['Cas particuliers', 'particularCases'],
    ];
    const buckets = {
        technicalNotes: [],
        particularCases: [],
    };
    let activeBucket = null;
    for (const rawEl of articleCheerio.find('.aside > *').toArray()) {
        const node = cheerio_1.default(rawEl);
        if (node.is('h5')) {
            const label = node.text();
            const hit = bucketByHeading.find(([needle]) => label.includes(needle));
            activeBucket = hit ? hit[1] : null;
            continue;
        }
        if (activeBucket) {
            buckets[activeBucket].push(node);
        }
    }
    const technicalNotes = parseCriterionAside(buckets.technicalNotes);
    const particularCases = parseCriterionAside(buckets.particularCases);

    const techniqueLinks = articleCheerio
        .find('li:contains("Technique(s) suffisante(s) et/ou échec(s) WCAG 2.1") a')
        .toArray();
    const techniques = techniqueLinks.map(link => cheerio_1.default(link).text());

    const referenceItems = articleCheerio
        .find('h5:contains("Correspondances EN 301 549 V2.1.2 (2018-08)") + ul > li')
        .toArray();
    // Each reference is trimmed and then loses its last character.
    const wcagReferences = referenceItems.map(item => {
        const itemText = cheerio_1.default(item).text().trim();
        return itemText.slice(0, -1);
    });

    const isDoubleA = wcagReferences.some(ref => ref.includes('(AA)'));
    const level = isDoubleA ? 'AA' : 'A';

    // Both reference lists are arrays, hence always truthy: they are always
    // present in the result, even when empty.
    const criterion = {
        id,
        title,
        references: { techniques, wcag: wcagReferences },
    };
    // The two aside texts are only attached when they are non-empty strings.
    if (technicalNotes) {
        criterion.technicalNotes = technicalNotes;
    }
    if (particularCases) {
        criterion.particularCases = particularCases;
    }
    criterion.level = level;
    return criterion;
}
exports.parseCriteriaArticle = parseCriteriaArticle;

/**
 * Extracts a test description from a cheerio selection of its list item.
 *
 * The id is derived from the element's `id` attribute, expected to look like
 * "test-1-2-3": the first dash-separated segment is dropped and the others
 * are joined with dots ("1.2.3"). The title is the text of the first `p`,
 * followed - when the item has any `ul li p` descendants - by one
 * "- "-prefixed line per such paragraph.
 *
 * @param liCheerio - cheerio selection wrapping the test list item.
 * @returns {{id: string, title: string}}
 * @throws {Error} when the element has no (or an empty) `id` attribute.
 */
function parseTestLi(liCheerio) {
    const liId = liCheerio.attr('id');
    if (!liId) {
        throw new Error('Cant parse test : no id attribute');
    }
    const [, ...idSegments] = liId.split('-');
    const id = idSegments.join('.');

    const firstParagraph = liCheerio.find('p').first();
    const pText = utils_1.reduceWhitespaces(firstParagraph.text()).trim();

    const bulletLines = [];
    for (const paragraph of liCheerio.find('ul li p').toArray()) {
        const paragraphText = cheerio_1.default(paragraph).text();
        bulletLines.push('- ' + utils_1.reduceWhitespaces(paragraphText).trim());
    }
    const listText = bulletLines.join('\n');

    const title = listText.length > 0 ? `${pText}\n${listText}` : pText;
    return { id, title };
}
exports.parseTestLi = parseTestLi;

// Matches a topic label: a one- or two-digit number (capture 1), a dot, a
// space, then the topic title (capture 2). The pattern is not anchored.
const topicRegex = /(\d\d?)\. (.+)/;

/**
 * Extracts a topic description from a cheerio selection of its link.
 *
 * @param aCheerio - cheerio selection whose text contains "<number>. <title>".
 * @returns {{id: string, title: string}} the number (as a string) and title.
 * @throws {Error} when the text does not match `topicRegex`.
 */
function parseTopicA(aCheerio) {
    const linkText = aCheerio.text();
    const [, id, title] = linkText.match(topicRegex) || [];
    if (!id || !title) {
        throw new Error(`Cant parse topic : ${linkText}`);
    }
    return { id, title };
}
exports.parseTopicA = parseTopicA;
//# sourceMappingURL=parser.js.map