UNPKG

hyperpubee

Version:

Self-publishing over the decentralised internet

420 lines (336 loc) 11.7 kB
const PubeeBuilder = require('./pubee-builder')
const {
  LineCreator,
  PoemCreator,
  WorkCreator,
  TitleCreator,
  ContentCreator,
  VerseCreator,
  EmbeddingCreator,
  ChapterCreator,
  ChapterTitleCreator,
  ParagraphCreator,
  ProseCreator,
  CollectionCreator,
  LinkCreator
} = require('./part-creation')
const {
  HYPERHASH_REGEX_STR,
  KEYNAME_REGEX_STR,
  HYPERHASH_REGEX,
  METADATA_AUTHOR
} = require('../core/constants')
const { Embedding } = require('../core/embedding')
const { NotAnIntegerError, EmbeddingValidationError } = require('../exceptions')
const { getIndexedKeyFromComponents } = require('../core/utils')
const { Link } = require('../core/link')

const NEWLINE_REGEX = /\r\n|\r|\n/g

// Note that the matching of the patch is extremely unstrict
// Any errors there are expected to be detected when validating the embedding
// Note: will fail on patches containing the string ']]'
const PATCH_REGEX_STR = String.raw`\[\[.*?\]\]`

// DEVNOTE: '(?:' is a non-capturing group
// (not captured since we don't care about the sub-parts of a location)
const LOCATION_REGEX_STR = String.raw`(?:${KEYNAME_REGEX_STR}/)+\d+`

// An embedding looks like <hash/location> or <hash/location[[patch]]>
// Captures: 1) the hash, 2) the location, 3) the optional patch
const EMBEDDING_REGEX = new RegExp(
  String.raw`<(${HYPERHASH_REGEX_STR})/(${LOCATION_REGEX_STR})(${PATCH_REGEX_STR})?>`,
  'g'
)

// A link is a full line of the form [hash] or [hash/location]
const LINK_REGEX = new RegExp(
  String.raw`^\[(?<hash>${HYPERHASH_REGEX_STR})(/(?<location>${LOCATION_REGEX_STR}))?\]$`
)

// Split point just before a line starting with '##'
// (String.prototype.split matches globally even without the 'g' flag)
const CHAPTER_START_REGEX = /\n(?=##)/

const WorkTypeEnum = Object.freeze({
  POETRY: Symbol('HyperPubee poetry'),
  PROSE: Symbol('HyperPubee prose'),
  COLLECTION: Symbol('HyperPubee collection')
})

/**
 * Trims the text as a whole and blanks out whitespace-only lines,
 * while leaving the interior whitespace of non-blank lines untouched.
 *
 * All newline flavours (\r\n, \r, \n) are normalised to '\n'.
 *
 * @param {string} text - The raw work text
 * @returns {string} The cleaned text, joined with '\n'
 */
function trimAndCleanText (text) {
  // FIX: String.prototype.trim takes no arguments -- the regex previously
  // passed here was silently ignored, so this is behaviourally identical
  text = text.trim()
  const lines = text.split(NEWLINE_REGEX)
  const newLines = []
  for (const line of lines) {
    const trimmedLine = line.trim()
    // DEVNOTE: only trim whitespace of lines which are completely blank
    // (containing only whitespace)
    if (trimmedLine === '') {
      newLines.push(trimmedLine)
    } else {
      newLines.push(line)
    }
  }
  return newLines.join('\n')
}

/**
 * Extracts an optional '# <title>' header from the work text.
 *
 * If a title is found, a TitleCreator is appended to `creators`
 * (mutated in place) and the remaining body is returned.
 *
 * Note: Assumes the text is clean
 * (as after running cleanNewlinesAndTrimWhiteSpace)
 *
 * @param {string} workTxt - The cleaned work text
 * @param {Array} creators - Accumulator for part creators (mutated)
 * @returns {string} The text without the title header
 * @throws {Error} When the title is not followed by exactly one blank line
 */
function parseTitle (workTxt, creators) {
  let title
  let nonTitleText = workTxt

  if (workTxt[0] === '#') {
    // Title found
    const endOfFirstLineI = workTxt.indexOf('\n')
    title = workTxt.slice(1, endOfFirstLineI).trim()

    // Throws an error when no or more than 1 blank line is found
    // after the perceived title, to avoid unexpected behaviour
    if (
      workTxt[endOfFirstLineI + 1] !== '\n' ||
      workTxt[endOfFirstLineI + 2] === '\n'
    ) {
      throw new Error(
        `Expected exactly one blank line after the title (extracted title: '${title}')`
      )
    }
    nonTitleText = workTxt.slice(endOfFirstLineI + 2)
  }

  if (title !== undefined) {
    const titleContent = new ContentCreator(title)
    const titleCreator = new TitleCreator([titleContent])
    creators.push(titleCreator)
  }

  return nonTitleText
}

/**
 * Converts a raw 'key/key/.../index' location string to the indexed-key
 * format used internally by hyperpubee.
 *
 * @param {string} location - Slash-separated location ending in an integer
 * @returns {*} The indexed key (as produced by getIndexedKeyFromComponents)
 * @throws {Error} When the final component is not an integer
 */
function getLocationInHyperpubeeFormat (location) {
  const resParts = location.split('/')
  try {
    return getIndexedKeyFromComponents(resParts)
  } catch (e) {
    if (e instanceof NotAnIntegerError) {
      throw new Error(
        `Invalid location: expected an integer at the end (${location})`
      )
    } else {
      throw e
    }
  }
}

/**
 * Parses the raw '[[...]]' patch of an embedding as JSON.
 *
 * @param {string|undefined} rawPatch - The raw patch text (may be undefined)
 * @returns {Array} The parsed patch, or [] when no patch was given
 * @throws {EmbeddingValidationError} When the patch is not valid JSON
 */
function getPatchInHyperpubeeFormat (rawPatch) {
  let patch = []
  if (rawPatch !== undefined) {
    try {
      patch = JSON.parse(rawPatch)
    } catch (e) {
      throw new EmbeddingValidationError(`Patch has invalid structure: '${rawPatch}'`)
    }
  }
  return patch
}

/**
 * Parses a base text unit (a line in poetry, a paragraph in prose) into
 * an array of ContentCreator / EmbeddingCreator parts.
 *
 * @param {string} unit - The text unit to parse
 * @returns {Array} The sub-creators making up the unit
 */
function parseTextUnit (unit) {
  // There are 3 groups per matched embedding:
  // 1) the hash
  // 2) the location within the hash
  // 3) the (optional) patch
  // Note that the split method splices in the matched groups.
  // The strategy for processing embeddings will be to detect elements
  // of the split line which are hypercore hashes (the first matched group)
  const unitSplitByEmbeddings = unit.split(EMBEDDING_REGEX)

  const subCreators = []
  let i = 0
  while (i < unitSplitByEmbeddings.length) {
    const unitPart = unitSplitByEmbeddings[i]
    if (unitPart.match(HYPERHASH_REGEX)) {
      // This is an embedding hash, and the next 2 elements the relative
      // location within the hash and the patch (which can be undefined)
      // (Note that we ignore the extremely unlikely case that just before or after
      // an actual embedding there were 64+ consecutive characters
      // accidentally matching the hyperhash-detection regex)
      const rawLoc = unitSplitByEmbeddings[i + 1]
      const referencedLocation = getLocationInHyperpubeeFormat(rawLoc)

      const rawPatch = unitSplitByEmbeddings[i + 2]
      const patch = getPatchInHyperpubeeFormat(rawPatch)

      const embedding = new Embedding({
        referencedHash: unitPart,
        referencedLocation,
        patch
      })
      subCreators.push(new EmbeddingCreator(embedding))
      i += 3 // Skip the referenced location and patch groups
    } else {
      // Normal (non-embedding) content
      if (unitPart.length > 0) {
        subCreators.push(new ContentCreator(unitPart))
      }
      i += 1
    }
  }

  return subCreators
}

/**
 * Parses one verse (a newline-separated group of poetry lines).
 *
 * @param {string} verseAsText - The verse text
 * @returns {VerseCreator} The creator for the parsed verse
 */
function parseVerse (verseAsText) {
  const lineCreators = []

  const lines = verseAsText.split('\n')
  for (const line of lines) {
    // FIX: parseTextUnit takes a single argument; the stray second
    // argument previously passed here was silently ignored
    const subCreators = parseTextUnit(line)
    const lineCreator = new LineCreator(subCreators)
    lineCreators.push(lineCreator)
  }

  const verseCreator = new VerseCreator(lineCreators)
  return verseCreator
}

/**
 * Splits the work body into chapter bodies (each starting with '##').
 *
 * When the text contains no chapters, a single artificial, untitled
 * chapter is prepended so downstream parsing is uniform.
 *
 * @param {string} text - The work body
 * @returns {string[]} The chapter bodies
 * @throws {Error} When exactly one explicit chapter is used, or when text
 *   precedes the first chapter marker
 */
function splitInChapterBodies (text) {
  const chapterBodies = text.split(CHAPTER_START_REGEX)

  if (chapterBodies.length === 1) {
    // Insert an artificial chapter when no chapters found
    // (note: poems with one chapter are not assumed to exist)
    if (chapterBodies[0].slice(0, 2) === '##') {
      throw new Error(
        `Text with only 1 chapter--do not use chapters then\n${chapterBodies}`
      )
    }
    chapterBodies[0] = ['##\n\n', chapterBodies[0]].join('')
  } else if (chapterBodies[0].slice(0, 2) !== '##') {
    throw new Error(
      'If a text consists of chapters, it should not contain text before the first chapter ("##")'
    )
  }

  return chapterBodies
}

/**
 * Extracts the (optional) chapter title from a chapter body.
 *
 * If a non-empty title is found, a ChapterTitleCreator is appended to
 * `chapterCreators` (mutated in place).
 *
 * @param {string} chapterBody - A chapter body starting with '##'
 * @param {Array} chapterCreators - Accumulator for creators (mutated)
 * @returns {string} The chapter body without its title line
 * @throws {Error} When the chapter start is not followed by exactly one
 *   blank line
 */
function parseChapterTitle (chapterBody, chapterCreators) {
  const splitChapterBody = chapterBody.split('\n')
  const chapterTitle = splitChapterBody[0].slice(2).trim()
  if (chapterTitle !== '') {
    // Empty string => no title
    const content = new ContentCreator(chapterTitle)
    const chapterTitleCreator = new ChapterTitleCreator([content])
    chapterCreators.push(chapterTitleCreator)
  }

  // Throws an error when no or more than 1 blank line is found
  // after the perceived title, to avoid unexpected behaviour
  // FIX: elements produced by split('\n') never contain '\n', so the
  // previous comparison (splitChapterBody[2] === '\n') could never be
  // true and the more-than-one-blank-line case went undetected;
  // a blank line appears as the empty string ''
  if (splitChapterBody[1] !== '' || splitChapterBody[2] === '') {
    throw new Error(
      `Expected exactly one blank line after the start of a new chapter\n${chapterBody}`
    )
  }

  const remainingChapterBody = splitChapterBody.slice(2).join('\n')
  return remainingChapterBody
}

/**
 * Parses a single chapter into its sub-creators (title plus either
 * verses for poetry or paragraphs for prose).
 *
 * @param {string} chapterBody - A chapter body starting with '##'
 * @param {boolean} isPoem - Whether the work is poetry
 * @returns {Array} The chapter's sub-creators
 */
function parseChapter (chapterBody, isPoem) {
  const subCreators = []
  const remainingChapterBody = parseChapterTitle(chapterBody, subCreators)

  // The trim removes any potential dangling new lines after the last
  // verse of the chapter
  const constituents = remainingChapterBody.trim().split('\n\n')
  if (isPoem) {
    for (const verse of constituents) {
      const verseCreator = parseVerse(verse)
      subCreators.push(verseCreator)
    }
  } else {
    // Prose
    for (const paragraph of constituents) {
      const paragraphComponents = parseTextUnit(paragraph)
      subCreators.push(new ParagraphCreator(paragraphComponents))
    }
  }

  return subCreators
}

/**
 * Parses all chapter bodies into arrays of sub-creators.
 *
 * @param {string[]} chapterBodies - The chapter bodies
 * @param {boolean} isPoem - Whether the work is poetry
 * @returns {Array[]} One array of sub-creators per chapter
 * @throws {Error} When a chapter body does not start with '##'
 *   (would indicate a logical bug in the splitting code)
 */
function parseChapters (chapterBodies, isPoem) {
  const parsedChapters = []
  for (const chapterBody of chapterBodies) {
    if (chapterBody.slice(0, 2) !== '##') {
      throw new Error(
        [
          "Logical bug in parse code: chapter expected to start with '##'",
          chapterBody
        ].join('--Chapter body (starting at next line): \n')
      )
    }
    const chapterSubCreators = parseChapter(chapterBody, isPoem)
    parsedChapters.push(chapterSubCreators)
  }
  return parsedChapters
}

/**
 * Wraps the parsed chapters in a PoemCreator or ProseCreator.
 *
 * A single (artificial) chapter is unwrapped, so works without explicit
 * chapters contain their verses/paragraphs directly.
 *
 * @param {Array[]} parsedChapters - One array of sub-creators per chapter
 * @param {boolean} isPoem - Whether the work is poetry
 * @returns {PoemCreator|ProseCreator} The creator for the work's body
 */
function makePoemOrProseCreator (parsedChapters, isPoem) {
  let components = []
  if (parsedChapters.length === 1) {
    // 1 chapter => work with the bare verses
    components = [...parsedChapters[0]]
  } else {
    for (const parsedChapter of parsedChapters) {
      const chapterCreator = new ChapterCreator(parsedChapter)
      components.push(chapterCreator)
    }
  }

  if (isPoem) {
    return new PoemCreator(components)
  } else {
    return new ProseCreator(components)
  }
}

/**
 * Parses the body of a poetry or prose work and appends the resulting
 * work-type creator to `allCreators` (mutated in place).
 *
 * @param {object} params
 * @param {string} params.body - The work body (title already stripped)
 * @param {boolean} params.isPoem - Whether the work is poetry
 * @param {Array} params.allCreators - Accumulator for creators (mutated)
 */
function parseWorkBody ({ body, isPoem, allCreators }) {
  const chapterBodies = splitInChapterBodies(body)
  const parsedChapters = parseChapters(chapterBodies, isPoem)
  const workTypeCreator = makePoemOrProseCreator(parsedChapters, isPoem)
  allCreators.push(workTypeCreator)
}

/**
 * Builds the pubee for a parsed work.
 *
 * @param {object} params
 * @param {*} params.bee - The hyperbee to build on
 * @param {WorkCreator} params.workCreator - The fully-parsed work creator
 * @param {object} params.metadata - The work's metadata
 * @returns {Promise<*>} The built pubee
 */
async function createWork ({ bee, workCreator, metadata }) {
  const pubeeBuilder = new PubeeBuilder(bee)
  await workCreator.addToPubeeBuilder(pubeeBuilder)
  await pubeeBuilder.addMetadata(metadata)

  const pubee = await pubeeBuilder.build()
  return pubee
}

/**
 * Reads and strips the leading work-type marker from the text.
 *
 * @param {string} text - The raw work text
 * @returns {[Symbol, string]} The work type and the remaining text
 * @throws {Error} When no valid work-type marker is present
 */
function extractWorkType (text) {
  let cleanedText = text.trim()

  let workType
  if (cleanedText.slice(0, 7) === '[prose]') {
    workType = WorkTypeEnum.PROSE
    cleanedText = cleanedText.slice(7)
  } else if (cleanedText.slice(0, 8) === '[poetry]') {
    workType = WorkTypeEnum.POETRY
    cleanedText = cleanedText.slice(8)
  } else if (cleanedText.slice(0, 12) === '[collection]') {
    workType = WorkTypeEnum.COLLECTION
    cleanedText = cleanedText.slice(12)
  } else {
    throw new Error(
      "A text should start with either '[poetry]', '[prose]' or '[collection]' to indicate its type"
    )
  }

  return [workType, cleanedText]
}

/**
 * Creates the metadata object for a work.
 *
 * @param {string|null|undefined} author - The author, if any
 * @returns {object} The metadata (empty when no author given)
 */
function createMetadata (author) {
  const metadata = {}
  // != null deliberately matches both null and undefined
  if (author != null) {
    metadata[METADATA_AUTHOR] = author
  }
  return metadata
}

/**
 * Parses a single collection line of the form [hash] or [hash/location].
 *
 * @param {string} line - The line to parse
 * @returns {Link} The parsed link
 * @throws {Error} When the line is not a valid link
 */
function parseLineWithLink (line) {
  const cleanedLine = line.trim()
  const match = cleanedLine.match(LINK_REGEX)
  if (match === null) {
    throw new Error(`Invalid link: '${cleanedLine}'`)
  }

  let location
  if (match.groups.location !== undefined) {
    location = getLocationInHyperpubeeFormat(match.groups.location)
  }
  return new Link({ linkedHash: match.groups.hash, linkedLocation: location })
}

/**
 * Parses the body of a collection (one link per line) and appends the
 * resulting CollectionCreator to `allCreators` (mutated in place).
 *
 * @param {string} body - The collection body
 * @param {Array} allCreators - Accumulator for creators (mutated)
 */
function parseCollectionBody (body, allCreators) {
  const lines = body.split(NEWLINE_REGEX)
  const linkCreators = []
  for (const line of lines) {
    const link = parseLineWithLink(line)
    const linkCreator = new LinkCreator(link)
    linkCreators.push(linkCreator)
  }
  const collectionCreator = new CollectionCreator(linkCreators)
  allCreators.push(collectionCreator)
}

/**
 * Parses a complete work text and builds its pubee.
 *
 * The text must start with a work-type marker ('[poetry]', '[prose]' or
 * '[collection]'), optionally followed by a '# title' header and the body.
 *
 * @param {object} params
 * @param {string} params.text - The raw work text
 * @param {*} params.bee - The hyperbee to build on
 * @param {string} [params.author] - The work's author
 * @returns {Promise<*>} The built pubee
 */
async function parseWork ({ text, bee, author }) {
  const metadata = createMetadata(author)

  const [workType, workText] = extractWorkType(text)
  const cleanedText = trimAndCleanText(workText)

  const allCreators = []
  const body = parseTitle(cleanedText, allCreators)

  if (workType === WorkTypeEnum.COLLECTION) {
    parseCollectionBody(body, allCreators)
  } else {
    const isPoem = workType === WorkTypeEnum.POETRY
    parseWorkBody({ body, isPoem, allCreators })
  }

  const workCreator = new WorkCreator(allCreators)
  // FIX: await the async build so rejections surface here rather than in
  // the caller's unwrapped promise (behaviour for callers is unchanged)
  const pubee = await createWork({ bee, workCreator, metadata })
  return pubee
}

module.exports = { parseWork }