UNPKG

excerpt-html

Version:

Get the Excerpt from a markdown file (like in jekyll or *smith)

github.com/martinheidegger/excerpt-html

martinheidegger/excerpt-html

101 lines (93 loc) • 3.65 kB

JavaScript

'use strict' /** * Extracted from https://github.com/simbo/metalsmith-better-excerpts * (published under MIT license) */ const cheerio = require('cheerio') const unescapeHTML = require('he').unescape const stripTags = require('striptags') const truncate = require('lodash.truncate') /** * retrieve excerpt from file object by extracting contents until a 'more' tag * @param {string} html file object * @param {RegExp} regExp 'more' tag regexp * @return {string} excerpt string or undefined */ function getExcerptByMoreTag (html, regExp) { html = cheerio.load('<root>' + html + '</root>')('root').html() const match = html.search(regExp) if (match > -1) { const excerpt = html.slice(0, Buffer.byteLength(html.slice(0, match))) return unescapeHTML(excerpt) } } /** * retrieve excerpt from file object by extracting the first p's contents * @param {string} html file object * @return {string} excerpt string */ function getExcerptByFirstParagraph (html) { const $ = cheerio.load(html) const isEmpty = element => $(element).text().trim().length === 0 const p = $('p').filter( (_index, element) => !isEmpty(element) ).first() const excerpt = p.length ? p.html().trim() : html return unescapeHTML(excerpt) } /** * @param {string} excerpt Already extracted excerpt * @param {Object} options stripping options * @param {number} [options.pruneLength] * @param {string} [options.pruneSeparator] * @param {string} [options.pruneString] * @return {string} The striped and pruned excerpt */ function stripTagsFromExcerpt (excerpt, options) { excerpt = stripTags(excerpt) excerpt = excerpt.replace(/^\s+|\s+$|\s+(?=\s)/g, '') const pruneLength = typeof options.pruneLength === 'number' ? options.pruneLength : 140 if (pruneLength > 0) { excerpt = truncate(excerpt, { length: pruneLength, omission: typeof options.pruneString === 'string' ? options.pruneString : '…', separator: typeof options.pruneSeparator === 'string' ? options.pruneSeparator : ' ' }) } return excerpt } /** * Extracts the raw excerpt (without stripped tags) from the html * * @param {string} html Html string to look for the excerpt * @param {RegExp} [moreRegExp=/\s*/i] RegExp used to look for the end of the excerpt * @return If found, the excerpt from the more tag, else the excerpt contained in the first <p></p> */ function getRawExcerpt (html, moreRegExp) { if (!moreRegExp) { moreRegExp = /\s*/i } return getExcerptByMoreTag(html, moreRegExp) || getExcerptByFirstParagraph(html) } /** * Parses the excerpt for a given html string. * * @param {string} html Html code to parse for the excerpt. * @param {Object} [options] Options for parsing. * @param {RegExp} [options.moreRegExp=/\s*/i] Regexp to look for the end of the excerpt. If this is not found * @param {boolean} [options.stripTags=true] Strip the tags from the html code when getting the excerpt. * @param {number} [options.pruneLength=140] Maximum size of the excerpt (only functional if stripTags=true) * @param {string} [options.pruneSeparator=' '] Character to look for when truncating a text * @param {string} [options.pruneString='…'] String to be attached if pruning needs to happen * @returns {string} The excerpt found in the given html code. */ module.exports = function excerptHtml (html, options) { if (!options) { options = {} } const rawExcerpt = getRawExcerpt(html, options.moreRegExp) if (options.stripTags === false) { return rawExcerpt } return stripTagsFromExcerpt(rawExcerpt, options) }