@carrotsearch/gatsby-transformer-html
Version:
A Gatsby transformer plugin for authoring content in HTML. Supports ToC generation, responsive images, section anchors and HighlightJS code highlighting.
429 lines (382 loc) • 12.6 kB
JavaScript
const path = require("path");
const { GraphQLJSON } = require(`gatsby/graphql`);
const {
replaceVariables,
validateVariables,
createMapReplacer
} = require("./src/replace-variables.js");
const { loadEmbeddedContent } = require("./src/embed-utils");
const { rewriteInternalLinks } = require("./src/rewrite-internal-links.js");
const { generateElementId } = require("./src/generate-element-id.js");
const extractFragment = require("./src/extract-fragment.js");
const extractJsonpath = require("./src/extract-jsonpath.js");
const { CodeHighlighter } = require("./src/transformers/code-highlighter");
const { loadHtml, renderHtml } = require("./src/html-transformer");
const { encode } = require("html-entities");
const { notInPre } = require("./src/cheerio-utils");
const { ImageProcessor } = require("./src/transformers/image-processor");
const { SvgInliner } = require("./src/transformers/svg-inliner");
const { error } = require("./src/reporter-utils");
// The transformation functions should be converted to plugins, but
// for now we keep them integrated to avoid proliferation of boilerplate.
/**
 * Builds an (index, element) predicate, usable with cheerio's filter(),
 * that accepts an element unless its "indexing" data value is "disabled".
 *
 * @param $ the cheerio-style function used to wrap elements
 * @returns {function(number, *): boolean} the predicate
 */
const indexingAllowed = $ => {
  return (index, element) => {
    const flag = $(element).data("indexing");
    return flag !== "disabled";
  };
};
/**
 * Embeds code from a separate file. Relative file path provided in the
 * data-embed attribute (or the src attribute of an embed tag) is passed,
 * together with the directory of the file in which the tag appears, to
 * loadEmbeddedContent().
 *
 * Supported data attributes on the embedding element:
 *  - data-language: overrides the language, which otherwise defaults to the
 *    lower-cased extension of the embedded file,
 *  - data-fragment: embeds only the fragment returned by extractFragment(),
 *  - data-jsonpath: embeds one pre element per value returned by
 *    extractJsonpath().
 *
 * @param $ cheerio document to modify in place
 * @param dir directory of the file being transformed
 * @param variables variables passed through to loadEmbeddedContent()
 * @param reporter reporter used for non-fatal embedding errors
 * @returns the same $ instance
 * @throws {Error} when both data-jsonpath and data-fragment are provided
 */
const embedCode = ($, dir, variables, reporter) => {
  // The attribute value is quoted and entity-encoded, so an empty or
  // unusual language name cannot break the generated markup.
  const asPre = (language, content) =>
    `<pre data-language="${encode(String(language))}">${encode(content)}</pre>`;

  $("pre[data-embed], embed[src]")
    .filter(notInPre($))
    .replaceWith((i, el) => {
      const $el = $(el);
      const declaredEmbed = $el.data("embed") || $el.attr("src");
      const fragment = $el.data("fragment");
      const jsonpath = $el.data("jsonpath");
      const declaredLanguage = $el.data("language");

      const rawContent = loadEmbeddedContent(
        declaredEmbed,
        dir,
        variables,
        reporter
      );
      if (rawContent === undefined) {
        // Nothing was loaded: remove the embedding element altogether.
        return "";
      }

      const ext = path.extname(declaredEmbed).substring(1).toLowerCase();
      const language = declaredLanguage || ext;

      if (jsonpath && fragment) {
        throw new Error(`jsonpath and fragment are mutually exclusive.`);
      }

      let content;
      if (jsonpath) {
        try {
          const fragments = extractJsonpath(rawContent, jsonpath);
          // There can be more than one matching path; for now a separate
          // pre element is emitted for each path output.
          return fragments.map(ob => asPre(language, ob)).join("\n");
        } catch (e) {
          error(`Failed to embed jsonpath: ${e}.`, reporter);
          content = "";
        }
      } else if (fragment) {
        try {
          content = extractFragment(rawContent, fragment);
        } catch (e) {
          error(`Failed to embed content: ${e}.`, reporter);
          content = "";
        }
      } else {
        content = rawContent;
      }

      // Entities inside the embedded content are encoded by asPre().
      return asPre(language, content);
    });
  return $;
};
// Inline link icon rendered inside every generated section anchor.
const anchorSvg = `<svg aria-hidden="true" height="16" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z" /></svg>`;

/**
 * Replaces every non-h1 heading that is a direct child of a section with
 * an id by a heading of the same level whose content starts with an anchor
 * link pointing at that section. Headings located inside
 * pre[data-language] elements are left untouched.
 *
 * @param $ cheerio document to modify in place
 * @returns the same $ instance
 */
const addSectionAnchors = $ => {
  // Skip top-level headings and any HTML that is part of a code listing.
  const isEligibleHeading = (index, heading) =>
    heading.name !== "h1" &&
    $(heading).parents("pre[data-language]").length === 0;

  const withAnchor = (index, heading) => {
    const $heading = $(heading);
    const tag = heading.name;
    const sectionId = $heading.parent().attr("id");
    const link = `<a class="anchor" href="#${sectionId}" aria-hidden="true">${anchorSvg}</a>`;
    return `<${tag}>\n${link}${$heading.html()}\n</${tag}>`;
  };

  $("section[id] > :header")
    .filter(isEligibleHeading)
    .replaceWith(withAnchor);
  return $;
};
/**
 * Calls generateElementId() for every full-text fragment visited by
 * forEachFullTextFragment() and for every ".warning" / ".info" element.
 * One Set instance is shared between all the calls.
 *
 * @param $ cheerio document to modify in place
 * @returns the same $ instance
 */
const addIdsForIndexableFragments = $ => {
  const usedIds = new Set();
  const assignId = $target => generateElementId($target, normalize, usedIds);

  forEachFullTextFragment($, $fragment => assignId($fragment));

  // The result is returned from the callback on purpose: each() receives
  // exactly what generateElementId() produced, as in a one-line arrow.
  $(".warning, .info").each((index, element) => {
    return assignId($(element));
  });
  return $;
};
/**
 * Builds a table of contents JSON based on section nesting and headings.
 *
 * Every "article > section[id]" accepted by notInPre() becomes an entry of
 * the form { heading, anchor, sections? }. Nested section[id] children are
 * processed recursively; a child marked with data-toc="omit" is skipped and
 * a section marked with data-toc="omit-children" gets no "sections" at all.
 * The "sections" key is present only when there is at least one child.
 *
 * @param $ cheerio document to read
 * @returns {Array<Object>} top-level table of contents entries
 */
const createToc = $ => {
  const isListed = (index, child) => !$(child).is("[data-toc='omit']");

  const toEntry = (index, element) => {
    const $section = $(element);
    const entry = {
      heading: $section.children(":header").eq(0).text(),
      anchor: $section.attr("id")
    };

    if (!$section.is("[data-toc='omit-children']")) {
      const $nested = $section.children("section[id]").filter(isListed);
      if ($nested.length > 0) {
        entry.sections = $nested.map(toEntry).get();
      }
    }
    return entry;
  };

  const $topLevel = $("article > section[id]").filter(notInPre($));
  return $topLevel.map(toEntry).get();
};
/**
 * Joins the truthy entries of the provided array with single spaces.
 *
 * @param {Array} a values to join; falsy entries are dropped
 * @returns {string} space-separated truthy entries
 */
const removeEmpty = a => a.filter(Boolean).join(" ");
/**
 * Invokes cb with the cheerio wrapper of every "p", "li" and "dt" element
 * that qualifies as a full-text fragment. Tags are processed in that order
 * and visited elements are temporarily flagged with a data-marker
 * attribute, so that an element nested in, or containing, an already
 * visited one is not reported again. All markers are removed at the end.
 *
 * @param $ cheerio document to traverse
 * @param {function} cb callback receiving the wrapped element
 */
const forEachFullTextFragment = ($, cb) => {
  const isIndexed = indexingAllowed($);

  const visit = (index, element) => {
    const $candidate = $(element);

    // Don't index the first paragraph of figure
    // caption, it's indexed as a figure heading.
    const isCaptionLead =
      $candidate.closest("figcaption").length > 0 &&
      $candidate.is("p:first-child");
    if (isCaptionLead) {
      return;
    }

    // Don't index if a child of an element with data-indexing="disabled".
    const $flagged = $candidate.closest("[data-indexing]");
    if ($flagged.length > 0 && !isIndexed(0, $flagged.get(0))) {
      return;
    }

    // Skip elements related to a fragment that has already been reported.
    const overlapsVisited =
      $candidate.parents("[data-marker]").length > 0 ||
      $candidate.find("[data-marker]").length > 0;
    if (overlapsVisited) {
      return;
    }

    cb($candidate);
    $candidate.attr("data-marker", "");
  };

  for (const tag of ["p", "li", "dt"]) {
    $(tag).each(visit);
  }

  $("[data-marker]").removeAttr("data-marker");
};
/**
 * Returns the normalized caption text of a figure: the text of the first
 * child of its figcaption when the figcaption has child elements, the
 * text of the whole figcaption otherwise.
 *
 * @param $e cheerio wrapper to search for a figcaption
 * @returns {string} normalized caption text
 */
const getFigureCaption = $e => {
  const $caption = $e.find("figcaption");
  const hasChildren = $caption.children().length > 0;
  const $source = hasChildren ? $caption.children().eq(0) : $caption;
  return normalize($source.text());
};
/**
 * Descriptors of heading-like elements collected as indexable fragments.
 * Each descriptor provides:
 *  - selector: selector of the elements to collect,
 *  - type: value of the fragment's "type" property,
 *  - class($e): extra class name for the fragment, or null,
 *  - text($e): the fragment's raw text.
 */
const headingExtractors = [
  {
    selector: "article",
    type: "heading",
    class() {
      return "section";
    },
    text($e) {
      return $e.children(":header").text();
    }
  },
  {
    selector: "section[id]",
    type: "heading",
    class() {
      return "section";
    },
    text($e) {
      return $e.children(":header").text();
    }
  },
  {
    selector: "code[id]",
    type: "code",
    class() {
      return "api";
    },
    text($e) {
      return $e.text();
    }
  },
  {
    selector: "figure[id]",
    type: "figure",
    // Images take precedence over code listings; anything else is a
    // generic figure.
    class($e) {
      const contains = selector => $e.find(selector).length > 0;
      if (contains("img, picture")) {
        return "image";
      }
      return contains("pre") ? "example" : "figure";
    },
    text: getFigureCaption
  },
  {
    selector: ".warning, .info",
    type: "heading",
    class() {
      return null;
    },
    text($e) {
      return $e.find("strong").eq(0).text();
    }
  },
  {
    selector: "dt[id]",
    type: "heading",
    class() {
      return null;
    },
    text($e) {
      return $e.text();
    }
  }
];
/**
 * Trims the provided text and collapses every run of whitespace
 * (spaces, tabs, line breaks) into a single space.
 *
 * @param {string} t text to normalize
 * @returns {string} normalized text
 */
const normalize = t => {
  // \s already matches line breaks, so no separate \n alternative
  // (or capturing group) is needed.
  return t.trim().replace(/\s+/g, " ");
};
/**
 * Collects the fragments of the document to be fed to a search index.
 *
 * Returns an empty array when indexingAllowed() rejects the article
 * element. Otherwise the result contains, in this order:
 *  - one fragment per element matched by each of headingExtractors,
 *    excluding elements rejected by indexingAllowed(),
 *  - one "paragraph" fragment per element visited by
 *    forEachFullTextFragment().
 *
 * Each fragment has the text, type, id, parents and class properties.
 *
 * @param $ cheerio document to read
 * @returns {Array<Object>} collected fragments
 */
const collectIndexableFragments = $ => {
  const isIndexed = indexingAllowed($);
  if (!isIndexed(0, $("article"))) {
    return [];
  }

  // Normalized texts of the headings / strong elements that are children
  // of the enclosing sections, article and warning / info boxes, in
  // reversed order of how cheerio returned them.
  const extractParents = ($e, includeCaption) => {
    const $containers = $e.parents("section, article, .warning, .info");
    const headings = $containers
      .children(":header, strong")
      .map((i, heading) => normalize($(heading).text()))
      .get()
      .reverse();

    // For paragraphs inside figure caption,
    // add figure heading to the list of parents.
    const $figure = $e.closest("figure");
    if (includeCaption && $figure.length > 0) {
      headings.push(getFigureCaption($figure));
    }
    return headings;
  };

  const fragments = [];

  for (const extractor of headingExtractors) {
    $(extractor.selector)
      .filter(isIndexed)
      .each((i, element) => {
        const $element = $(element);
        fragments.push({
          text: normalize(extractor.text($element)),
          type: extractor.type,
          id: $element.attr("id") || "",
          parents: extractParents($element, false),
          class: removeEmpty([extractor.class($element), $element.attr("class")])
        });
      });
  }

  forEachFullTextFragment($, $fragment => {
    fragments.push({
      text: normalize($fragment.text()),
      type: "paragraph",
      id: $fragment.attr("id"),
      parents: extractParents($fragment, true),
      class: removeEmpty([$fragment.attr("class")])
    });
  });

  return fragments;
};
// Gatsby API implementation
/**
 * Creates an "Html" child node for every "File" node whose media type is
 * text/html. The new node carries the raw HTML, the directory of the file
 * and a frontmatter consisting of the file name (as id) and the normalized
 * text of the first h1 element (as title).
 *
 * Nodes of any other type or media type are ignored.
 */
const onCreateNode = async ({
  node,
  actions,
  loadNodeContent,
  createNodeId,
  createContentDigest
}) => {
  const { createNode, createParentChildLink } = actions;

  const isHtmlFile =
    node.internal.mediaType === `text/html` && node.internal.type === "File";
  if (!isHtmlFile) {
    return;
  }

  const rawHtml = await loadNodeContent(node);
  const $ = loadHtml(rawHtml);
  const title = normalize($("h1").eq(0).text());

  const htmlNode = {
    rawHtml,
    frontmatter: { id: node.name, title },
    id: createNodeId(`${node.id} >>> HTML`),
    children: [],
    parent: node.id,
    dir: node.dir,
    internal: {
      contentDigest: createContentDigest(rawHtml),
      type: "Html"
    }
  };

  createNode(htmlNode);
  createParentChildLink({ parent: node, child: htmlNode });
};
/**
 * Returns the entry cached under "<prefix>:<key>" or, when there is none,
 * produces the entry, stores it in the cache and returns it.
 *
 * Only null and undefined are treated as a cache miss, so legitimately
 * falsy entries, such as an empty string, are served from the cache
 * instead of being produced again on every call.
 *
 * @param cache object with get(key) and set(key, value) methods
 * @param {string} prefix namespace of the entry
 * @param {string} key key of the entry within the namespace
 * @param {function} produceEntry sync or async function producing the entry
 * @returns {Promise<*>} the cached or freshly produced entry
 */
const tryCache = async (cache, prefix, key, produceEntry) => {
  const cacheKey = `${prefix}:${key}`;
  const cached = await cache.get(cacheKey);
  if (cached !== undefined && cached !== null) {
    return cached;
  }

  const entry = await produceEntry();
  // Deliberately optimistic: the result of writing to the cache is not
  // awaited, the entry is returned right away.
  cache.set(cacheKey, entry);
  return entry;
};
/**
 * Adds the html, tableOfContents and indexableFragments fields to the
 * "Html" GraphQL node type. Returns undefined for any other type.
 *
 * All resolvers go through tryCache(), keyed by the field name and the
 * content digest of the node.
 *
 * Plugin options read here:
 *  - variables: passed to transformers, the SVG inliner and the final
 *    variable replacement,
 *  - transformers: functions applied to the freshly loaded document,
 *  - finalizers: functions applied right before rendering,
 *  - imageQuality: passed to the image processor, 90 by default.
 */
const setFieldsOnGraphQLNodeType = (
  { type, getNodesByType, reporter, cache, pathPrefix, createContentDigest },
  { variables, transformers, finalizers, imageQuality = 90 }
) => {
  if (type.name !== "Html") {
    return;
  }

  // Applies user-provided functions one after another; each one receives
  // the document returned by its predecessor and a fresh options object.
  const runTransformers = (fns, $, dir) => {
    let doc = $;
    if (!fns) {
      return doc;
    }
    for (let idx = 0; idx < fns.length; idx += 1) {
      doc = fns[idx](doc, {
        dir,
        variables,
        reporter,
        loadEmbeddedContent
      });
    }
    return doc;
  };

  const codeHighlighter = new CodeHighlighter();
  const imageProcessor = new ImageProcessor({
    getNodesByType,
    pathPrefix,
    imageQuality,
    reporter,
    cache
  });
  const svgInliner = new SvgInliner({ getNodesByType, variables, reporter });

  // Full rendering pipeline. The order of the steps is significant.
  // NOTE(review): the original comment here referred to a
  // fixClosingTagsInHighlightedCode() function, which is not defined
  // in this file.
  const renderNodeHtml = async node => {
    const { dir } = node;
    let $ = loadHtml(node.rawHtml);
    $ = runTransformers(transformers, $, dir);
    $ = await svgInliner.transform($, dir);
    $ = await imageProcessor.transform($, dir);
    $ = rewriteInternalLinks($);
    $ = addSectionAnchors($);
    $ = embedCode($, dir, variables, reporter);
    $ = codeHighlighter.transform($);
    $ = addIdsForIndexableFragments($);
    $ = runTransformers(finalizers, $, dir);
    const rendered = renderHtml($);
    return replaceVariables(rendered, createMapReplacer(variables));
  };

  // The table of contents is built from the raw, untransformed HTML.
  const buildToc = node => createToc(loadHtml(node.rawHtml));

  // Fragments are collected after user transformers and id generation.
  const buildFragments = node => {
    const transformed = runTransformers(
      transformers,
      loadHtml(node.rawHtml),
      node.dir
    );
    return collectIndexableFragments(addIdsForIndexableFragments(transformed));
  };

  return {
    html: {
      type: "String",
      resolve: async node => {
        const digest = node.internal.contentDigest;
        return tryCache(cache, "html", digest, async () => renderNodeHtml(node));
      }
    },
    tableOfContents: {
      type: GraphQLJSON,
      resolve: async node => {
        const digest = node.internal.contentDigest;
        return tryCache(cache, "toc", digest, () => buildToc(node));
      }
    },
    indexableFragments: {
      type: GraphQLJSON,
      resolve: async node => {
        const digest = node.internal.contentDigest;
        return tryCache(cache, "indexableFragments", digest, () =>
          buildFragments(node)
        );
      }
    }
  };
};
/**
 * Validates the "variables" plugin option before bootstrap. Any exception
 * thrown by validateVariables() is handed over to reporter.panic().
 */
const onPreBootstrap = ({ reporter }, { variables }) => {
  try {
    validateVariables(variables);
  } catch (validationError) {
    reporter.panic(validationError);
  }
};
exports.onPreBootstrap = onPreBootstrap;
exports.onCreateNode = onCreateNode;
exports.setFieldsOnGraphQLNodeType = setFieldsOnGraphQLNodeType;