@adobe/spacecat-shared-html-analyzer
Version:
Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content
135 lines (119 loc) • 5.61 kB
JavaScript
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
/**
* Content analysis and metrics calculation
* Provides comprehensive analysis of HTML content differences
*/
import { stripTagsToText } from './html-filter.js';
import { tokenize } from './tokenizer.js';
import { generateDiffReport } from './diff-engine.js';
import { hashDJB2, pct } from './utils.js';
/**
 * Text-only comparison between the initial and the final HTML of a page.
 *
 * Both documents are reduced to text with `stripTagsToText`; lengths, a
 * retention ratio, word- and line-level diff reports and hashes are then
 * derived from the extracted text.
 *
 * @param {string} initHtml - Initial HTML content (what crawlers see)
 * @param {string} finHtml - Final HTML content (what users see)
 * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer
 *   elements; forwarded unchanged to `stripTagsToText`
 * @returns {Promise<Object>} Object with `initialText`, `finalText`,
 *   `initialTextLength`, `finalTextLength`, `textRetention` (initial length
 *   divided by final length, or 0 when the final text is empty),
 *   `textRetentionPercent` (`pct(textRetention)`), `wordDiff`, `lineDiff`,
 *   `initialTextHash` and `finalTextHash`
 */
export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter = true) {
  // stripTagsToText may return a plain string (browser) or a Promise (Node.js).
  // Promise.all accepts either and subscribes to both results at once, so a
  // rejection of the second extraction can never surface as an unhandled
  // rejection while the first one is still being awaited.
  const [initText, finText] = await Promise.all([
    stripTagsToText(initHtml, ignoreNavFooter),
    stripTagsToText(finHtml, ignoreNavFooter),
  ]);

  const initTextLength = initText.length;
  const finTextLength = finText.length;

  // Guard against division by zero when the final document has no text.
  const textRetention = finTextLength > 0 ? initTextLength / finTextLength : 0;

  const wordDiff = generateDiffReport(initText, finText, 'word');
  const lineDiff = generateDiffReport(initText, finText, 'line');

  return {
    initialText: initText,
    finalText: finText,
    initialTextLength: initTextLength,
    finalTextLength: finTextLength,
    textRetention,
    textRetentionPercent: pct(textRetention),
    wordDiff,
    lineDiff,
    initialTextHash: hashDJB2(initText),
    finalTextHash: hashDJB2(finText),
  };
}
/**
 * Calculate basic word-count statistics from an HTML comparison.
 *
 * Both documents are reduced to text with `stripTagsToText` and split with
 * `tokenize(text, 'word')`; every metric is derived from the two token counts.
 *
 * @param {string} originalHTML - Initial HTML content
 * @param {string} currentHTML - Final HTML content
 * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer
 *   elements; forwarded unchanged to `stripTagsToText`
 * @returns {Promise<Object>} Object with:
 *   - `wordCountBefore` / `wordCountAfter`: token counts of the original and
 *     current text
 *   - `wordDiff`: absolute difference between the two counts
 *   - `contentIncreaseRatio`: current count divided by original count, rounded
 *     to 2 decimal places; when the original has no words it is the current
 *     count, or 1 if the current text has no words either
 *   - `citationReadability`: original count as a percentage of the current
 *     count, capped at 100 and rounded to an integer; 100 when the current
 *     text has no words
 */
export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter = true) {
  // stripTagsToText may return a plain string (browser) or a Promise (Node.js).
  // Promise.all accepts either and subscribes to both results at once, so a
  // rejection of the second extraction can never surface as an unhandled
  // rejection while the first one is still being awaited.
  const [originalText, currentText] = await Promise.all([
    stripTagsToText(originalHTML, ignoreNavFooter),
    stripTagsToText(currentHTML, ignoreNavFooter),
  ]);

  // Calculate word counts using consistent tokenization
  const wordCountBefore = tokenize(originalText, 'word').length;
  const wordCountAfter = tokenize(currentText, 'word').length;
  const wordDiff = Math.abs(wordCountAfter - wordCountBefore);

  // Content increase ratio (how many times the word count grew). With an
  // empty original a real ratio is undefined, so fall back to the current
  // count, or to 1 ("no change") when both sides are empty.
  let contentIncreaseRatio;
  if (wordCountBefore > 0) {
    contentIncreaseRatio = wordCountAfter / wordCountBefore;
  } else {
    contentIncreaseRatio = wordCountAfter > 0 ? wordCountAfter : 1;
  }

  // Citation readability: original word count as a percentage of the current
  // word count, capped at 100.
  const citationReadability = wordCountAfter > 0
    ? Math.min(100, (wordCountBefore / wordCountAfter) * 100)
    : 100;

  return {
    wordCountBefore,
    wordCountAfter,
    wordDiff,
    contentIncreaseRatio: Math.round(contentIncreaseRatio * 100) / 100, // Round to 2 decimal places
    citationReadability: Math.round(citationReadability),
  };
}
/**
 * Build the per-scenario report from one `calculateStats` result.
 * Fields are copied explicitly so the report shape stays fixed.
 *
 * @param {Object} stats - Result of `calculateStats`
 * @returns {Object} The stats plus `contentGain` (ratio rounded to 1 decimal
 *   place with an `x` suffix) and `missingWords` (same value as `wordDiff`)
 */
function toScenarioReport(stats) {
  return {
    wordCountBefore: stats.wordCountBefore,
    wordCountAfter: stats.wordCountAfter,
    wordDiff: stats.wordDiff,
    contentIncreaseRatio: stats.contentIncreaseRatio,
    citationReadability: stats.citationReadability,
    contentGain: `${Math.round(stats.contentIncreaseRatio * 10) / 10}x`,
    missingWords: stats.wordDiff,
  };
}

/**
 * Calculate stats for both nav/footer scenarios
 *
 * Runs `calculateStats` once with navigation/footer elements ignored and once
 * with them included, and returns one report per scenario.
 *
 * @param {string} originalHTML - Initial HTML content
 * @param {string} currentHTML - Final HTML content
 * @returns {Promise<Object>} `{ withNavFooterIgnored, withoutNavFooterIgnored }`,
 *   each holding `wordCountBefore`, `wordCountAfter`, `wordDiff`,
 *   `contentIncreaseRatio`, `citationReadability`, `contentGain` and
 *   `missingWords`
 */
export async function calculateBothScenarioStats(originalHTML, currentHTML) {
  // The two scenarios are independent, so compute them concurrently instead
  // of awaiting one before starting the other.
  const [statsIgnored, statsNotIgnored] = await Promise.all([
    calculateStats(originalHTML, currentHTML, true),
    calculateStats(originalHTML, currentHTML, false),
  ]);

  return {
    withNavFooterIgnored: toScenarioReport(statsIgnored),
    withoutNavFooterIgnored: toScenarioReport(statsNotIgnored),
  };
}