UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

github.com/kepano/defuddle

kepano/defuddle

293 lines • 10.7 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.codeBlockRules = void 0; const utils_1 = require("../utils"); // Language patterns const HIGHLIGHTER_PATTERNS = [ /^language-(\w+)$/, // language-javascript /^lang-(\w+)$/, // lang-javascript /^(\w+)-code$/, // javascript-code /^code-(\w+)$/, // code-javascript /^syntax-(\w+)$/, // syntax-javascript /^code-snippet__(\w+)$/, // code-snippet__javascript /^highlight-(\w+)$/, // highlight-javascript /^(\w+)-snippet$/, // javascript-snippet // fallback /(?:^|\s)(?:language|lang|brush|syntax)-(\w+)(?:\s|$)/i ]; // Languages to detect in code blocks const CODE_LANGUAGES = new Set([ 'abap', 'actionscript', 'ada', 'adoc', 'agda', 'antlr4', 'applescript', 'arduino', 'armasm', 'asciidoc', 'aspnet', 'atom', 'bash', 'batch', 'c', 'clojure', 'cmake', 'cobol', 'coffeescript', 'cpp', 'c++', 'crystal', 'csharp', 'cs', 'dart', 'django', 'dockerfile', 'dotnet', 'elixir', 'elm', 'erlang', 'fortran', 'fsharp', 'gdscript', 'gitignore', 'glsl', 'golang', 'gradle', 'graphql', 'groovy', 'haskell', 'hs', 'haxe', 'hlsl', 'html', 'idris', 'java', 'javascript', 'js', 'jsx', 'jsdoc', 'json', 'jsonp', 'julia', 'kotlin', 'latex', 'lisp', 'elisp', 'livescript', 'lua', 'makefile', 'markdown', 'md', 'markup', 'masm', 'mathml', 'matlab', 'mongodb', 'mysql', 'nasm', 'nginx', 'nim', 'nix', 'objc', 'ocaml', 'pascal', 'perl', 'php', 'postgresql', 'powershell', 'prolog', 'puppet', 'python', 'regex', 'rss', 'ruby', 'rb', 'rust', 'scala', 'scheme', 'shell', 'sh', 'solidity', 'sparql', 'sql', 'ssml', 'svg', 'swift', 'tcl', 'terraform', 'tex', 'toml', 'typescript', 'ts', 'tsx', 'unrealscript', 'verilog', 'vhdl', 'webassembly', 'wasm', 'xml', 'yaml', 'yml', 'zig' ]); // Convert code blocks with different syntax highlighters and line numbers // to a standard <pre> and <code> element with a language attribute exports.codeBlockRules = [ { selector: [ // Basic code blocks 'pre', // Common syntax highlighter containers 'div[class*="prismjs"]', '.syntaxhighlighter', '.highlight', '.highlight-source', '.wp-block-syntaxhighlighter-code', '.wp-block-code', 'div[class*="language-"]' ].join(', '), element: 'pre', transform: (el, doc) => { // Helper function to check if an element has specific properties const hasHTMLElementProps = (el) => { return 'classList' in el && 'getAttribute' in el && 'querySelector' in el; }; if (!hasHTMLElementProps(el)) return el; const getCodeLanguage = (element) => { // Check data-lang attribute first const dataLang = element.getAttribute('data-lang') || element.getAttribute('data-language'); if (dataLang) { return dataLang.toLowerCase(); } // Check class names for patterns and supported languages const classNames = Array.from(element.classList || []); // Check for syntax highlighter specific format if (element.classList?.contains('syntaxhighlighter')) { const langClass = classNames.find(c => !['syntaxhighlighter', 'nogutter'].includes(c)); if (langClass && CODE_LANGUAGES.has(langClass.toLowerCase())) { return langClass.toLowerCase(); } } // Check patterns for (const className of classNames) { for (const pattern of HIGHLIGHTER_PATTERNS) { const match = className.toLowerCase().match(pattern); if (match && match[1] && CODE_LANGUAGES.has(match[1].toLowerCase())) { return match[1].toLowerCase(); } } } // If all else fails, check for bare language names for (const className of classNames) { if (CODE_LANGUAGES.has(className.toLowerCase())) { return className.toLowerCase(); } } return ''; }; // Try to get the language from the element and its ancestors let language = ''; let currentElement = el; while (currentElement && !language) { language = getCodeLanguage(currentElement); // Also check for code elements within the current element const codeEl = currentElement.querySelector('code'); if (!language && codeEl) { language = getCodeLanguage(codeEl); } currentElement = currentElement.parentElement; } // Extract content from WordPress syntax highlighter const extractWordPressContent = (element) => { // Handle WordPress syntax highlighter table format const codeContainer = element.querySelector('.syntaxhighlighter table .code .container'); if (codeContainer) { return Array.from(codeContainer.children) .map(line => { const codeParts = Array.from(line.querySelectorAll('code')) .map(code => { let text = code.textContent || ''; if (code.classList?.contains('spaces')) { text = ' '.repeat(text.length); } return text; }) .join(''); return codeParts || line.textContent || ''; }) .join('\n'); } // Handle WordPress syntax highlighter non-table format const codeLines = element.querySelectorAll('.code .line'); if (codeLines.length > 0) { return Array.from(codeLines) .map(line => { const codeParts = Array.from(line.querySelectorAll('code')) .map(code => code.textContent || '') .join(''); return codeParts || line.textContent || ''; }) .join('\n'); } return ''; }; // Recursively extract text content while preserving structure const extractStructuredText = (element) => { if ((0, utils_1.isTextNode)(element)) { return element.textContent || ''; } let text = ''; if ((0, utils_1.isElement)(element)) { // Handle explicit line breaks if (element.tagName === 'BR') { return '\n'; } // Handle common line-based code formats // This covers various syntax highlighter implementations that use // divs or spans to represent individual lines if (element.matches('div[class*="line"], span[class*="line"], .ec-line, [data-line-number], [data-line]')) { // Try to find the actual code content in common structures: // 1. A dedicated code container const codeContainer = element.querySelector('.code, .content, [class*="code-"], [class*="content-"]'); if (codeContainer) { return (codeContainer.textContent || '') + '\n'; } // 2. Line number is in a separate element const lineNumber = element.querySelector('.line-number, .gutter, [class*="line-number"], [class*="gutter"]'); if (lineNumber) { const withoutLineNum = Array.from(element.childNodes) .filter(node => !lineNumber.contains(node)) .map(node => extractStructuredText(node)) .join(''); return withoutLineNum + '\n'; } // 3. Fallback to the entire line content return element.textContent + '\n'; } element.childNodes.forEach(child => { text += extractStructuredText(child); }); } return text; }; // Extract content based on element type let codeContent = ''; if (el.matches('.syntaxhighlighter, .wp-block-syntaxhighlighter-code')) { codeContent = extractWordPressContent(el); } // If no content extracted from WordPress format, use structured text extraction if (!codeContent) { codeContent = extractStructuredText(el); } // Clean up the content codeContent = codeContent .replace(/^\s+|\s+$/g, '') // Trim start/end whitespace .replace(/\t/g, ' ') // Convert tabs to spaces .replace(/\n{3,}/g, '\n\n') // Normalize multiple newlines .replace(/\u00a0/g, ' ') // Replace non-breaking spaces .replace(/^\n+/, '') // Remove extra newlines at start .replace(/\n+$/, ''); // Remove extra newlines at end // Create new pre element const newPre = doc.createElement('pre'); // Create code element const code = doc.createElement('code'); if (language) { code.setAttribute('data-lang', language); code.setAttribute('class', `language-${language}`); } code.textContent = codeContent; newPre.appendChild(code); return newPre; } } ]; //# sourceMappingURL=code.js.map