confluence-exporter-tool
Version:
Convert full Confluence Storage Format XHTML to GitHub Markdown for Foam
160 lines (133 loc) • 7.38 kB
JavaScript
/**
 * Converts Confluence Storage Format XHTML to GitHub Flavored Markdown.
 *
 * The conversion is a fixed sequence of regular-expression passes over the
 * input string; the order of the passes matters because later passes operate
 * on the text produced by earlier ones. Any tag that no pass recognises is
 * removed near the end (only `<ins>` is kept, as it is used for underline).
 *
 * @param {string} xml - Confluence storage-format markup.
 * @returns {string} The Markdown rendering, trimmed of surrounding whitespace.
 */
export function convertXMLToMarkdown(xml) {
  let output = xml;

  // Fenced code blocks are parked behind placeholders until every other pass
  // has run. Code-macro bodies are raw CDATA, so the formatting passes, the
  // generic tag stripper and the entity decoder must not touch them
  // (otherwise e.g. `#include <stdio.h>` loses `<stdio.h>`).
  const codeBlocks = [];
  const stashCodeBlock = (block) => {
    codeBlocks.push(block);
    return `\u0000CODE${codeBlocks.length - 1}\u0000`;
  };

  // Extracts the trimmed CDATA payload of a macro's plain-text body, or ''.
  const plainTextBody = /<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>/;
  const readPlainTextBody = (inner) => {
    const bodyMatch = inner.match(plainTextBody);
    return bodyMatch ? bodyMatch[1].trim() : '';
  };

  // --- Macros ---
  output = output.replace(
    /<ac:structured-macro ac:name="code">([\s\S]*?)<\/ac:structured-macro>/g,
    (match, inner) => {
      const langMatch = inner.match(/<ac:parameter ac:name="language">([\s\S]*?)<\/ac:parameter>/);
      const lang = langMatch ? langMatch[1] : '';
      const code = readPlainTextBody(inner);
      return stashCodeBlock(`\`\`\`${lang}\n${code}\n\`\`\``);
    },
  );
  output = output.replace(
    /<ac:structured-macro ac:name="(note|info|warning|tip)">([\s\S]*?)<\/ac:structured-macro>/g,
    (match, macro, inner) => `> **${macro.toUpperCase()}**: ${readPlainTextBody(inner)}`,
  );
  output = output.replace(
    /<ac:structured-macro ac:name="expand">([\s\S]*?)<\/ac:structured-macro>/g,
    (match, inner) => `> **Expand:** ${readPlainTextBody(inner)}`,
  );
  // Every macro not handled above is replaced by a visible marker.
  output = output.replace(
    /<ac:structured-macro ac:name="([^"]+)">([\s\S]*?)<\/ac:structured-macro>/g,
    (match, macro) => `> **[Unsupported macro: ${macro}]**`,
  );

  // --- Images (attachment and external URL; tolerant of whitespace/newlines) ---
  // Must run before the bare-attachment pass below, which would otherwise
  // turn the inner <ri:attachment> into a plain link.
  output = output.replace(
    /<ac:image>\s*<ri:attachment ri:filename="([^"]+)"\s*\/?>\s*<\/ac:image>/g,
    (match, filename) => `![${filename}](./${filename})`,
  );
  output = output.replace(
    /<ac:image>\s*<ri:url ri:value="([^"]+)"\s*\/?>\s*<\/ac:image>/g,
    (match, url) => `![](${url})`,
  );

  // --- Page links become wiki links ---
  output = output.replace(
    /<ac:link>\s*<ri:page ri:content-title="([^"]+)"\s*\/?>[\s\S]*?<\/ac:link>/g,
    (match, title) => `[[${title}]]`,
  );

  // --- Task lists ---
  output = output.replace(/<ac:task>([\s\S]*?)<\/ac:task>/g, (match, inner) => {
    const status = /<ac:task-status>complete<\/ac:task-status>/.test(inner) ? 'x' : ' ';
    const bodyMatch = inner.match(/<ac:task-body>([\s\S]*?)<\/ac:task-body>/);
    const body = bodyMatch ? bodyMatch[1].trim() : '';
    return `- [${status}] ${body}`;
  });

  // --- Emojis ---
  output = output.replace(/<ac:emoticon ac:name="([^"]+)"\s*\/>/g, (match, name) => `:${name}:`);

  // --- Placeholders ---
  output = output.replace(
    /<ac:placeholder>([\s\S]*?)<\/ac:placeholder>/g,
    (match, text) => `_Instruction: ${text.trim()}_`,
  );

  // --- Text formatting ---
  output = output.replace(/<(strong|b)>([\s\S]*?)<\/\1>/g, (m, tag, text) => `**${text}**`);
  output = output.replace(/<(em|i)>([\s\S]*?)<\/\1>/g, (m, tag, text) => `*${text}*`);
  // Markdown has no underline; <ins> is emitted and survives the tag stripper.
  output = output.replace(/<u\b[^>]*>([\s\S]*?)<\/u>/g, (m, text) => `<ins>${text}</ins>`);
  output = output.replace(/<(code|tt)>([\s\S]*?)<\/\1>/g, (m, tag, text) => `\`${text}\``);
  output = output.replace(
    /<span style="[^"]*line-through[^"]*">([\s\S]*?)<\/span>/g,
    (m, text) => `~~${text}~~`,
  );
  // Two trailing spaces are required for a Markdown hard line break.
  output = output.replace(/<br\s*\/?>(\n)?/g, '  \n');
  output = output.replace(/<hr\s*\/?>(\n)?/g, '\n---\n');
  output = output.replace(/<sup>([\s\S]*?)<\/sup>/g, (m, text) => `^${text}^`);
  output = output.replace(/<sub>([\s\S]*?)<\/sub>/g, (m, text) => `~${text}~`);
  output = output.replace(/<(small|big)>([\s\S]*?)<\/\1>/g, (m, tag, text) => text);
  output = output.replace(/<blockquote>([\s\S]*?)<\/blockquote>/g, (m, text) => `> ${text.trim()}`);

  // --- Headings ---
  for (let level = 6; level >= 1; level--) {
    const heading = new RegExp(`<h${level}>([\\s\\S]*?)<\\/h${level}>`, 'g');
    output = output.replace(heading, (m, text) => `${'#'.repeat(level)} ${text}`);
  }

  // --- Paragraphs ---
  output = output.replace(/<p>([\s\S]*?)<\/p>/g, (m, text) => `${text}\n`);

  // --- Ordered and unordered lists (ordered items are numbered sequentially) ---
  output = output.replace(/<(ul|ol)>([\s\S]*?)<\/\1>/g, (m, tag, inner) => {
    const isOrdered = tag === 'ol';
    let position = 1;
    return inner.replace(/<li>([\s\S]*?)<\/li>/g, (item, text) =>
      (isOrdered ? `${position++}. ${text}\n` : `- ${text}\n`));
  });

  // --- Tables ---
  output = output.replace(/<table>([\s\S]*?)<\/table>/g, (m, inner) => {
    const rows = inner.match(/<tr>[\s\S]*?<\/tr>/g) || [];
    let markdown = '';
    rows.forEach((row, rowIndex) => {
      const cells = row.match(/<(th|td)>([\s\S]*?)<\/\1>/g) || [];
      const line = cells
        .map((cell) => `| ${cell.replace(/<\/?(th|td)>/g, '').trim()} `)
        .join('');
      markdown += `${line}|\n`;
      // The first row is always treated as the header row.
      if (rowIndex === 0) {
        markdown += `| ${'--- |'.repeat(cells.length)}\n`;
      }
    });
    return markdown;
  });

  // --- Layouts: wrappers are dropped, their content is kept ---
  output = output.replace(
    /<ac:layout[\s\S]*?>|<\/ac:layout>|<ac:layout-section[\s\S]*?>|<\/ac:layout-section>|<ac:layout-cell[\s\S]*?>|<\/ac:layout-cell>/g,
    '',
  );

  // --- Template variables ---
  output = output.replace(/<at:var at:name="([^"]+)"\s*\/>/g, (m, name) => `{{${name}}}`);

  // --- Bare attachments (not wrapped in <ac:image>) ---
  output = output.replace(
    /<ri:attachment ri:filename="([^"]+)"\s*\/>/g,
    (m, filename) => `[${filename}](./${filename})`,
  );

  // --- Preformatted text ---
  output = output.replace(/<pre>([\s\S]*?)<\/pre>/g, (m, text) => {
    const clean = text.replace(/^\s+|\s+$/g, ''); // strip leading/trailing whitespace
    return `\`\`\`\n${clean}\n\`\`\``;
  });

  // --- External links ---
  output = output.replace(
    /<a\s+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/g,
    (m, href, text) => `[${text}](${href})`,
  );

  // --- Remove any remaining tags except <ins> and </ins> ---
  output = output.replace(/<(?!\/?ins\b)[^>]+>/g, '');

  // --- HTML entities ---
  // `&amp;` is decoded last so that an escaped entity such as `&amp;lt;`
  // becomes the literal text `&lt;` instead of being decoded twice.
  output = output
    .replace(/&quot;/g, '"')
    .replace(/&#39;|&apos;/g, "'")
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&nbsp;/g, '\u00A0')
    .replace(/&amp;/g, '&');

  // --- Restore the parked code blocks verbatim ---
  output = output.replace(/\u0000CODE(\d+)\u0000/g, (match, index) => {
    const block = codeBlocks[Number(index)];
    // Leave text that merely looks like a placeholder untouched.
    return block === undefined ? match : block;
  });

  return output.trim();
}
// Default export: an object exposing the converter under its own name.
export default { convertXMLToMarkdown };