/*
 * Package: shamela
 * Version: (not captured)
 * Library to interact with the Maktabah Shamela v4 APIs
 * File listing: 137 lines (136 loc) • 5.73 kB, TypeScript
 */
import { d as NormalizeTitleSpanOptions } from "./types-CeDA67OZ.js";
//#region src/content.d.ts
/**
* A single line of parsed page content.
*
* `text` is always present; `id` is optional and may be omitted.
* NOTE(review): what `id` identifies is not visible from this declaration —
* confirm against the implementation.
*/
type Line = {
/** Optional identifier associated with the line. */
id?: string;
/** Text content of the line. */
text: string;
};
/**
* Normalizes line endings to Unix-style (`\n`).
*
* Converts Windows (`\r\n`) and old Mac (`\r`) line endings to Unix style
* for consistent pattern matching across platforms.
*
* @example
* ```typescript
* normalizeLineEndings("a\r\nb\rc");
* // => "a\nb\nc"
* ```
*
* @param content - Raw content with potentially mixed line endings
* @returns Content with all line endings normalized to `\n`
*/
declare const normalizeLineEndings: (content: string) => string;
/**
* Parses Shamela HTML content into structured lines while preserving headings.
*
* Every returned {@link Line} carries `text`; its `id` is optional and may be omitted.
* NOTE(review): which lines receive an `id`, and how headings are represented
* in the output, is not visible from this declaration — confirm against the implementation.
*
* @param content - The raw HTML markup representing a page
* @returns An array of {@link Line} objects containing text and optional IDs
*/
declare const parseContentRobust: (content: string) => Line[];
/**
* Sanitizes page content by applying regex replacement rules.
*
* @param text - The text to clean
* @param rules - Optional custom replacements; when omitted, `DEFAULT_MAPPING_RULES` is used.
*   NOTE(review): `DEFAULT_MAPPING_RULES` is neither declared nor exported in this file,
*   so it is referenced as plain code rather than through a `{@link}` tag that cannot resolve.
*   Presumably each key is a regex pattern and each value its replacement string —
*   confirm against the implementation.
* @returns The sanitized content
*/
declare const mapPageCharacterContent: (text: string, rules?: Record<string, string>) => string;
/**
* Splits a page body from its trailing footnotes using a marker string.
*
* @param content - Combined body and footnote text
* @param footnoteMarker - Optional marker indicating the start of footnotes.
*   NOTE(review): the default used when this argument is omitted is not visible
*   from this declaration — confirm against the implementation.
* @returns A read-only two-element tuple `[body, footnotes]`: the page body followed
*   by the footnote section.
*   NOTE(review): the result when the marker does not occur in `content` is not
*   documented here — presumably the footnote element is empty; confirm.
*/
declare const splitPageBodyFromFooter: (content: string, footnoteMarker?: string) => readonly [string, string];
/**
* Removes Arabic numeral page markers enclosed in tortoise shell brackets
* `⦗` (U+2997) and `⦘` (U+2998).
*
* Replaces the marker along with up to two preceding whitespace characters
* (space or carriage return) and up to one following whitespace character
* with a single space.
*
* NOTE(review): "Arabic numeral" is ambiguous — presumably this means
* Arabic-Indic digits (`٠`–`٩`) rather than ASCII `0`–`9`; confirm against
* the implementation's pattern.
*
* @param text - Text potentially containing page markers
* @returns The text with numeric markers replaced by a single space
*/
declare const removeArabicNumericPageMarkers: (text: string) => string;
/**
* Removes anchor and hadeeth tags from the content while preserving spans.
*
* NOTE(review): the summary names only anchor and hadeeth tags, while the
* return description says only span tags are retained. These agree only if the
* input never contains other tag kinds — confirm whether other tags are
* stripped or left in place.
*
* @param content - HTML string containing various tags
* @returns The content with only span tags retained
*/
declare const removeTagsExceptSpan: (content: string) => string;
/**
* Normalizes Shamela HTML for CSS styling:
* - Converts `<hadeeth-N>` to `<span class="hadeeth">`
* - Converts `</hadeeth>` or standalone `<hadeeth>` to `</span>`
*
* NOTE(review): `N` in `<hadeeth-N>` is presumably a numeric suffix — confirm
* against the implementation's pattern.
*
* @param html - Shamela HTML markup that may contain hadeeth tags
* @returns The markup with hadeeth tags rewritten as span tags
*/
declare const normalizeHtml: (html: string) => string;
/**
* Strips all HTML tags from content, keeping only text.
*
* NOTE(review): whether HTML entities (e.g. `&amp;`) are decoded is not visible
* from this declaration — presumably they are left untouched; confirm.
*
* @param html - HTML content
* @returns Plain text content
*/
declare const stripHtmlTags: (html: string) => string;
/**
* Moves content that appears after a line break but before a title span into the span.
*
* This handles cases where text at the start of a line (such as chapter numbers like "١ -")
* should logically be part of the following title but was placed outside the span in the HTML.
*
* In the example below the line break is a carriage return (`\r`), and the moved text is
* joined to the existing title text with a single space.
* NOTE(review): whether `\n` is also treated as a line break is not visible from this
* declaration — confirm against the implementation.
*
* @example
* ```typescript
* // Input: "\rباب الأول<span data-type="title">العنوان</span>"
* // Output: "\r<span data-type="title">باب الأول العنوان</span>"
* ```
*
* @param html - HTML content with potential pre-title text
* @returns HTML with pre-title text moved inside title spans
*/
declare const moveContentAfterLineBreakIntoSpan: (html: string) => string;
/**
* Converts Shamela HTML to Markdown format for easier pattern matching.
*
* Transformations:
* - `<span data-type="title">text</span>` → `## text`
* - `<a href="inr://...">text</a>` → `text` (strip narrator links)
* - All other HTML tags → stripped
*
* Note: Content typically already has proper line breaks before title spans,
* so no extra newlines are added around the `##` header.
* Line endings are not normalized here; that step is handled by `segmentPages`.
* NOTE(review): `segmentPages` is not declared in this file — confirm where it
* lives and that callers outside that path normalize line endings themselves.
*
* @param html - HTML content from Shamela
* @returns Markdown-formatted content
*/
declare const htmlToMarkdown: (html: string) => string;
/**
* Normalizes consecutive Shamela-style title spans.
*
* Shamela exports sometimes contain adjacent title spans like:
* `<span data-type="title">باب الميم</span><span data-type="title">من اسمه محمد</span>`
*
* If you naively convert each title span into a markdown heading, you can end up with:
* `## باب الميم ## من اسمه محمد` (two headings on one line).
*
* This helper rewrites the HTML so downstream HTML→Markdown conversion can stay simple and consistent.
*
* @param html - HTML content that may contain adjacent title spans
* @param options - Normalization settings. Required in this signature, unlike the
*   optional `options` of `convertContentToMarkdown`.
*   NOTE(review): the fields of `NormalizeTitleSpanOptions` are declared in another
*   module and are not visible here — see that type for the available strategies.
* @returns The HTML with consecutive title spans normalized
*/
declare const normalizeTitleSpans: (html: string, options: NormalizeTitleSpanOptions) => string;
/**
* Converts Shamela HTML content to Markdown format using a standardized pipeline.
*
* This is a convenience function that applies the recommended sequence of transformations:
* 1. Normalizes consecutive title spans (default: splitLines strategy)
* 2. Moves pre-title text into spans
* 3. Converts to Markdown format
*
* NOTE(review): these steps presumably correspond to `normalizeTitleSpans`,
* `moveContentAfterLineBreakIntoSpan` and `htmlToMarkdown` declared in this file —
* confirm against the implementation.
*
* @example
* ```typescript
* const html = '<span data-type="title">Chapter</span><span data-type="title">One</span>';
* const markdown = convertContentToMarkdown(html);
* // => "## Chapter\n## One"
* ```
*
* @param content - Raw HTML content from Shamela
* @param options - Optional configuration for title span normalization; may be omitted,
*   in which case the default strategy noted in step 1 applies
* @returns Markdown-formatted content
*/
declare const convertContentToMarkdown: (content: string, options?: NormalizeTitleSpanOptions) => string;
//#endregion
export { Line, convertContentToMarkdown, htmlToMarkdown, mapPageCharacterContent, moveContentAfterLineBreakIntoSpan, normalizeHtml, normalizeLineEndings, normalizeTitleSpans, parseContentRobust, removeArabicNumericPageMarkers, removeTagsExceptSpan, splitPageBodyFromFooter, stripHtmlTags };
//# sourceMappingURL=content.d.ts.map