UNPKG

shamela

Version:

Library to interact with the Maktabah Shamela v4 APIs

137 lines (136 loc) 5.73 kB
import { d as NormalizeTitleSpanOptions } from "./types-CeDA67OZ.js";

//#region src/content.d.ts
/**
 * A single line of parsed page content, as returned by {@link parseContentRobust}.
 */
type Line = {
  /** Optional identifier associated with the line; absent when the line has none. */
  id?: string;
  /** The text of the line. */
  text: string;
};
/**
 * Normalizes line endings to Unix-style (`\n`).
 *
 * Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
 * for consistent pattern matching across platforms.
 *
 * @param content - Raw content with potentially mixed line endings
 * @returns Content with all line endings normalized to `\n`
 */
declare const normalizeLineEndings: (content: string) => string;
/**
 * Parses Shamela HTML content into structured lines while preserving headings.
 *
 * @param content - The raw HTML markup representing a page
 * @returns An array of {@link Line} objects containing text and optional IDs
 */
declare const parseContentRobust: (content: string) => Line[];
/**
 * Sanitises page content by applying regex replacement rules.
 *
 * NOTE(review): `rules` is a string-to-string record; presumably each key is a
 * pattern and each value its replacement - confirm against the implementation.
 * `DEFAULT_MAPPING_RULES` is not declared in this file.
 *
 * @param text - The text to clean
 * @param rules - Optional custom replacements, defaults to {@link DEFAULT_MAPPING_RULES}
 * @returns The sanitised content
 */
declare const mapPageCharacterContent: (text: string, rules?: Record<string, string>) => string;
/**
 * Splits a page body from its trailing footnotes using a marker string.
 *
 * NOTE(review): the default value of `footnoteMarker` and the result when the
 * marker is not present are not visible in this declaration - check the
 * implementation before relying on either.
 *
 * @param content - Combined body and footnote text
 * @param footnoteMarker - Optional marker indicating the start of footnotes
 * @returns A read-only tuple `[body, footnotes]`: the page body followed by the footnote section
 */
declare const splitPageBodyFromFooter: (content: string, footnoteMarker?: string) => readonly [string, string];
/**
 * Removes Arabic numeral page markers enclosed in tortoise-shell brackets (`⦗ ⦘`).
 *
 * Replaces the marker along with up to two preceding whitespace characters
 * (space or carriage return) and up to one following whitespace character
 * with a single space.
 *
 * @param text - Text potentially containing page markers
 * @returns The text with numeric markers replaced by a single space
 */
declare const removeArabicNumericPageMarkers: (text: string) => string;
/**
 * Removes anchor and hadeeth tags from the content while preserving spans.
 *
 * @param content - HTML string containing various tags
 * @returns The content with only span tags retained
 */
declare const removeTagsExceptSpan: (content: string) => string;
/**
 * Normalizes Shamela HTML for CSS styling:
 * - Converts `<hadeeth-N>` to `<span class="hadeeth">`
 * - Converts `</hadeeth>` or standalone `<hadeeth>` to `</span>`
 *
 * @param html - Shamela HTML that may contain hadeeth tags
 * @returns The HTML with hadeeth tags rewritten as spans
 */
declare const normalizeHtml: (html: string) => string;
/**
 * Strips all HTML tags from content, keeping only text.
 *
 * @param html - HTML content
 * @returns Plain text content
 */
declare const stripHtmlTags: (html: string) => string;
/**
 * Moves content that appears after a line break but before a title span into the span.
 *
 * This handles cases where text at the start of a line (such as chapter numbers like "١ -")
 * should logically be part of the following title but was placed outside the span in the HTML.
 *
 * @example
 * ```typescript
 * // Input: "\rباب الأول<span data-type="title">العنوان</span>"
 * // Output: "\r<span data-type="title">باب الأول العنوان</span>"
 * ```
 *
 * @param html - HTML content with potential pre-title text
 * @returns HTML with pre-title text moved inside title spans
 */
declare const moveContentAfterLineBreakIntoSpan: (html: string) => string;
/**
 * Converts Shamela HTML to Markdown format for easier pattern matching.
 *
 * Transformations:
 * - `<span data-type="title">text</span>` → `## text`
 * - `<a href="inr://...">text</a>` → `text` (strip narrator links)
 * - All other HTML tags → stripped
 *
 * Note: Content typically already has proper line breaks before title spans,
 * so no extra newlines are added around the `##` header.
 * Line ending normalization is handled by `segmentPages` (not declared in this file).
 *
 * @param html - HTML content from Shamela
 * @returns Markdown-formatted content
 */
declare const htmlToMarkdown: (html: string) => string;
/**
 * Normalizes consecutive Shamela-style title spans.
 *
 * Shamela exports sometimes contain adjacent title spans like:
 * `<span data-type="title">باب الميم</span><span data-type="title">من اسمه محمد</span>`
 *
 * If you naively convert each title span into a markdown heading, you can end up with:
 * `## باب الميم ## من اسمه محمد` (two headings on one line).
 *
 * This helper rewrites the HTML so downstream HTML→Markdown conversion can stay simple and consistent.
 *
 * @param html - HTML content that may contain adjacent title spans
 * @param options - Title span normalization options; required here, whereas
 *   {@link convertContentToMarkdown} accepts them optionally
 * @returns The rewritten HTML
 */
declare const normalizeTitleSpans: (html: string, options: NormalizeTitleSpanOptions) => string;
/**
 * Converts Shamela HTML content to Markdown format using a standardized pipeline.
 *
 * This is a convenience function that applies the recommended sequence of transformations:
 * 1. Normalizes consecutive title spans (default: splitLines strategy)
 * 2. Moves pre-title text into spans
 * 3. Converts to Markdown format
 *
 * @example
 * ```typescript
 * const html = '<span data-type="title">Chapter</span><span data-type="title">One</span>';
 * const markdown = convertContentToMarkdown(html);
 * // => "## Chapter\n## One"
 * ```
 *
 * @param content - Raw HTML content from Shamela
 * @param options - Optional configuration for title span normalization
 * @returns Markdown-formatted content
 */
declare const convertContentToMarkdown: (content: string, options?: NormalizeTitleSpanOptions) => string;
//#endregion
export { Line, convertContentToMarkdown, htmlToMarkdown, mapPageCharacterContent, moveContentAfterLineBreakIntoSpan, normalizeHtml, normalizeLineEndings, normalizeTitleSpans, parseContentRobust, removeArabicNumericPageMarkers, removeTagsExceptSpan, splitPageBodyFromFooter, stripHtmlTags };
//# sourceMappingURL=content.d.ts.map