@xcrap/parser
Version:
Xcrap Parser is a package of the Xcrap framework. It was developed to handle the data-extraction part of text files (currently supporting only HTML and JSON) using declarative models.
114 lines (113 loc) • 6.2 kB
JavaScript
;
// Flag this CommonJS module as ES-module-shaped by defining `exports.__esModule = true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise every named export to `undefined` (void 0) up front; the real
// values are assigned further down the file.
exports.fromPreviousElementSibling = exports.fromNextElementSibling = exports.extractRawAttrs = exports.extarctRawAttributes = exports.extractStructuredText = exports.extractStructure = exports.extractRawText = exports.extractRawTagName = exports.extractNodeType = exports.extractLocalName = exports.extractClassNames = exports.extractChildElementCount = exports.extractAttrs = exports.extarctAttributes = exports.extractRange = exports.extractAttribute = exports.extractAllData = exports.extractAriaDisabled = exports.extractAriaChecked = exports.extractAriaExpanded = exports.extractAriaHidden = exports.extractAriaLabel = exports.extractRequired = exports.extractPattern = exports.extractMinLength = exports.extractMaxLength = exports.extractAutocomplete = exports.extractType = exports.extractName = exports.extractSelected = exports.extractChecked = exports.extractReadonly = exports.extractDisabled = exports.extractPlaceholder = exports.extractTitle = exports.extractRole = exports.extractStyle = exports.extractValue = exports.extractSrc = exports.extractHref = exports.extractId = exports.extractClassList = exports.extractTagName = exports.extractOuterHtml = exports.extractInnerHtml = exports.extractText = exports.extractTextContent = exports.extractInnerText = exports.propertyExtractors = void 0;
// `extract` is a function declaration, so it is hoisted and can be exported
// here before its definition appears.
exports.extract = extract;
const errors_1 = require("../errors");
exports.propertyExtractors = {
innerText: (element) => element.innerText,
textContent: (element) => element.textContent,
text: (element) => element.text,
innerHTML: (element) => element.innerHTML,
outerHTML: (element) => element.outerHTML,
tagName: (element) => element.tagName,
classList: (element) => Array.from(element.classList.values()),
id: (element) => element.id,
attributes: (element) => element.attributes,
attrs: (element) => element.attrs,
childElementCount: (element) => element.childElementCount,
classNames: (element) => element.classNames,
localName: (element) => element.localName,
nodeType: (element) => element.localName,
range: (element) => element.range,
rawAttributes: (element) => element.rawAttributes,
rawAttrs: (element) => element.rawAttrs,
rawTagName: (element) => element.rawTagName,
rawText: (element) => element.rawText,
structure: (element) => element.structure,
structuredText: (element) => element.structuredText,
};
/**
 * Build an extractor function for a property key or an attribute name.
 *
 * @param {string} key - Key into `exports.propertyExtractors`, or, when
 *   `isAttribute` is true, the attribute name passed to `element.getAttribute`.
 * @param {boolean} [isAttribute=false] - Read an attribute instead of using a
 *   registered property extractor.
 * @returns {(element: object) => *} Function that extracts the value from an element.
 * @throws {ExtractorNotFoundError} Thrown by the returned function (not by
 *   `extract` itself) when `key` is not a registered property extractor.
 */
function extract(key, isAttribute = false) {
    return (element) => {
        if (isAttribute) {
            return element.getAttribute(key);
        }
        // Resolved at call time so extractors registered after `extract()` was
        // called are still found.
        const registry = exports.propertyExtractors;
        // Own-property check: a bare `registry[key]` would resolve inherited
        // members such as "toString" or "constructor" and call them as extractors.
        const extractor = Object.prototype.hasOwnProperty.call(registry, key)
            ? registry[key]
            : undefined;
        if (!extractor) {
            throw new errors_1.ExtractorNotFoundError(key);
        }
        return extractor(element);
    };
}
exports.extractInnerText = extract("innerText");
exports.extractTextContent = extract("textContent");
exports.extractText = extract("text");
exports.extractInnerHtml = extract("innerHTML");
exports.extractOuterHtml = extract("outerHTML");
exports.extractTagName = extract("tagName");
exports.extractClassList = extract("classList");
exports.extractId = extract("id");
exports.extractHref = extract("href", true);
exports.extractSrc = extract("src", true);
exports.extractValue = extract("value", true);
exports.extractStyle = extract("style", true);
exports.extractRole = extract("role", true);
exports.extractTitle = extract("title", true);
exports.extractPlaceholder = extract("placeholder", true);
exports.extractDisabled = extract("disabled", true);
exports.extractReadonly = extract("readonly", true);
exports.extractChecked = extract("checked", true);
exports.extractSelected = extract("selected", true);
exports.extractName = extract("name", true);
exports.extractType = extract("type", true);
exports.extractAutocomplete = extract("autocomplete", true);
exports.extractMaxLength = extract("maxlength", true);
exports.extractMinLength = extract("minlength", true);
exports.extractPattern = extract("pattern", true);
exports.extractRequired = extract("required", true);
exports.extractAriaLabel = extract("aria-label", true);
exports.extractAriaHidden = extract("aria-hidden", true);
exports.extractAriaExpanded = extract("aria-expanded", true);
exports.extractAriaChecked = extract("aria-checked", true);
exports.extractAriaDisabled = extract("aria-disabled", true);
exports.extractAllData = extract("data-*", true);
const extractAttribute = (name) => extract(name, true);
exports.extractAttribute = extractAttribute;
exports.extractRange = extract("range");
exports.extarctAttributes = extract("attributes");
exports.extractAttrs = extract("attrs");
exports.extractChildElementCount = extract("childElementCount");
exports.extractClassNames = extract("classNames");
exports.extractLocalName = extract("localName");
exports.extractNodeType = extract("nodeType");
exports.extractRawTagName = extract("rawTagName");
exports.extractRawText = extract("rawText");
exports.extractStructure = extract("structure");
exports.extractStructuredText = extract("structuredText");
exports.extarctRawAttributes = extract("rawAttributes");
exports.extractRawAttrs = extract("rawAttrs");
const fromNextElementSibling = (extractor, { shouldExists } = { shouldExists: true }) => {
return (element) => {
const nextElementSibling = element.nextElementSibling;
if (!nextElementSibling) {
if (shouldExists) {
throw new errors_1.HTMLElementNotFoundError();
}
return undefined;
}
return extractor(nextElementSibling);
};
};
exports.fromNextElementSibling = fromNextElementSibling;
const fromPreviousElementSibling = (extractor, { shouldExists } = { shouldExists: true }) => {
return (element) => {
const previousElementSibling = element.previousElementSibling;
if (!previousElementSibling) {
if (shouldExists) {
throw new errors_1.HTMLElementNotFoundError();
}
return undefined;
}
return extractor(previousElementSibling);
};
};
exports.fromPreviousElementSibling = fromPreviousElementSibling;