html-content-processor
Version:
A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
200 lines (199 loc) • 7.55 kB
JavaScript
;
/**
* 百度首页和搜索引擎页面专用过滤器插件
* 专门处理搜索引擎首页的大量样式代码、搜索建议等噪音内容
*/
// CommonJS interop prologue: tag the exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare both public export names as undefined; the real values are
// assigned later in this file.
exports.createBaiduConfig = exports.baiduFilterPlugin = void 0;
/**
 * CSS selectors for Baidu-specific page elements that the plugin treats as
 * noise and tries to remove.
 *
 * NOTE(review): `[class*="ad"]` and `[id*="ad"]` are substring matches, so
 * they also match names that merely contain "ad" (for example the
 * `head_wrapper` id listed below). Confirm that this breadth is intended.
 */
const BAIDU_SPECIFIC_SELECTORS = [
    // Search-suggestion widgets
    '.bdsug',
    '.bdsug-new',
    '.sam_search',
    '.sam_search_rec',
    '#head_wrapper .bdsug',
    '#main-wrapper .bdsug-new',

    // Baidu-specific UI chrome
    '#prefpanel',
    '#mMenu',
    '#s_lm_wrap',
    '#head_wrapper',
    '.bdpfmenu',
    '.usermenu',
    '.briiconsbg',

    // Hidden elements and ads
    '[style*="display:none"]',
    '[style*="visibility:hidden"]',
    '[class*="ad"]',
    '[id*="ad"]',
    '.bd_bear_home',

    // Baidu-specific styling classes
    '.s-ps-sug',
    '.s-ps-islite',
    '.s-skin-hasbg',
    '.new_input_superman',
    '.wrapper_new'
];
/**
 * CSS selectors for elements considered valuable. Matching elements are
 * tagged with `data-keep="true"` by the plugin.
 */
const KEEP_VALUABLE_SELECTORS = [
    // Navigation links
    '#nv a',
    '#lk a',

    // Page description metadata
    'meta[name="description"]',

    // Main content regions
    '#wrapper',
    '#s_wrap',

    // Absolute (http/https) links
    'a[href^="http"]'
];
exports.baiduFilterPlugin = {
name: 'baidu-filter',
description: '百度首页和搜索引擎页面专用过滤器',
apply(context) {
const { document, options, logger } = context;
let removedCount = 0;
let preservedCount = 0;
logger === null || logger === void 0 ? void 0 : logger.info('🔍 应用百度专用过滤器...');
// 1. 移除百度特有的噪音元素
BAIDU_SPECIFIC_SELECTORS.forEach(selector => {
try {
const elements = document.querySelectorAll(selector);
elements.forEach((element) => {
// 检查是否包含有价值的内容
if (!hasValuableContent(element)) {
element.remove();
removedCount++;
}
});
}
catch (error) {
logger === null || logger === void 0 ? void 0 : logger.debug(`选择器 ${selector} 处理失败:`, error);
}
});
// 2. 移除大段的样式代码
const styleElements = document.querySelectorAll('style');
styleElements.forEach((style) => {
if (style.textContent && style.textContent.length > 1000) {
style.remove();
removedCount++;
}
});
// 3. 清理内联样式过多的元素
const elementsWithStyle = document.querySelectorAll('[style]');
elementsWithStyle.forEach((element) => {
const styleAttr = element.getAttribute('style');
if (styleAttr && styleAttr.length > 100) {
// 保留基本的显示属性,移除复杂样式
const basicStyle = styleAttr.match(/(display|visibility):[^;]+/g);
if (basicStyle) {
element.setAttribute('style', basicStyle.join(';'));
}
else {
element.removeAttribute('style');
}
}
});
// 4. 保留有价值的导航链接
KEEP_VALUABLE_SELECTORS.forEach(selector => {
try {
const elements = document.querySelectorAll(selector);
elements.forEach((element) => {
// 标记为重要内容,避免被其他过滤器移除
element.setAttribute('data-keep', 'true');
preservedCount++;
});
}
catch (error) {
logger === null || logger === void 0 ? void 0 : logger.debug(`保留选择器 ${selector} 处理失败:`, error);
}
});
// 5. 处理百度搜索框区域
processBaiduSearchForm(document);
// 6. 清理空的容器元素
removeEmptyContainers(document);
logger === null || logger === void 0 ? void 0 : logger.info(`✅ 百度过滤器完成: 移除了 ${removedCount} 个元素,保留了 ${preservedCount} 个有价值元素`);
}
};
/**
 * Decide whether an element carries content worth keeping.
 *
 * If the element contains any `a[href]` descendants, the verdict depends on
 * the links alone: true when at least one href starts with "http" or "/".
 * Otherwise the trimmed text decides: it must be longer than 10 and shorter
 * than 200 characters and must not look like CSS (no "{", "px" or "color:").
 *
 * @param {Element} element - Element to inspect.
 * @returns {boolean} True when the element looks valuable.
 */
function hasValuableContent(element) {
    const anchors = Array.from(element.querySelectorAll('a[href]'));
    if (anchors.length > 0) {
        // Links present: only they are consulted, the text is not inspected.
        for (const anchor of anchors) {
            const target = anchor.getAttribute('href');
            if (target && (target.startsWith('http') || target.startsWith('/'))) {
                return true;
            }
        }
        return false;
    }

    const rawText = element.textContent;
    const text = (rawText === null || rawText === undefined ? '' : rawText.trim()) || '';
    const hasReasonableLength = text.length > 10 && text.length < 200;
    if (!hasReasonableLength) {
        return false;
    }

    // Reject text that looks like leaked stylesheet code.
    const cssMarkers = ['{', 'px', 'color:'];
    const looksLikeCss = cssMarkers.some(marker => text.includes(marker));
    return !looksLikeCss;
}
/**
 * Simplify the Baidu search form area.
 *
 * When the `#form` element exists and contains the `wd` query input or a
 * submit input, the whole form is replaced in its parent by a plain `<div>`
 * holding a short placeholder text. Nothing happens when the form is missing
 * or contains neither control.
 *
 * @param {Document} document - Document to modify in place.
 * @returns {void}
 */
function processBaiduSearchForm(document) {
    const searchForm = document.querySelector('#form');
    if (!searchForm) {
        return;
    }

    const queryInput = searchForm.querySelector('input[name="wd"]');
    const submitButton = searchForm.querySelector('input[type="submit"]');
    if (!queryInput && !submitButton) {
        return;
    }

    // Swap the full form (including suggestion machinery) for a placeholder.
    const placeholder = document.createElement('div');
    placeholder.textContent = '搜索框区域';

    const parent = searchForm.parentNode;
    if (parent !== null && parent !== undefined) {
        parent.replaceChild(placeholder, searchForm);
    }
}
/**
 * Remove empty container elements (`div`, `span`, `section`, `article`,
 * `aside`) from the document.
 *
 * A container is removed when its trimmed text is empty, it has no element
 * children, and it is not tagged with `data-keep`.
 *
 * Candidates are visited in reverse document order, i.e. descendants before
 * their ancestors. A container that becomes empty only because its empty
 * children were just removed is therefore cleaned up in the same pass instead
 * of being left behind as an empty shell.
 *
 * @param {Document} document - Document to modify in place.
 * @returns {void}
 */
function removeEmptyContainers(document) {
    const containerSelector = 'div, span, section, article, aside';
    // Snapshot the matches so that removals do not disturb the iteration.
    const candidates = Array.from(document.querySelectorAll(containerSelector));

    for (let index = candidates.length - 1; index >= 0; index--) {
        const element = candidates[index];
        const text = (element.textContent || '').trim();
        const hasChildren = element.children.length > 0;
        const hasKeepMark = element.hasAttribute('data-keep');
        if (!text && !hasChildren && !hasKeepMark) {
            element.remove();
        }
    }
}
/**
 * Build a processor configuration tailored to the Baidu home page.
 *
 * A fresh object (with fresh arrays) is returned on every call. The filter
 * section registers `baiduFilterPlugin`; the converter section ignores
 * images, keeps links, disables citations and uses the 'github' format.
 *
 * @returns {{filter: object, converter: object}} The configuration object.
 */
function createBaiduConfig() {
    // Elements to strip outright.
    const removeElements = [
        'style[index]',
        'script',
        'noscript',
        'meta[http-equiv]',
        'link[rel="dns-prefetch"]',
        'link[rel="stylesheet"]'
    ];

    // Elements to always keep.
    const keepElements = [
        'a[href]',
        'h1', 'h2', 'h3',
        'div[data-keep]'
    ];

    const filter = {
        threshold: 15,
        strategy: 'dynamic',
        ratio: 0.7,
        removeElements,
        keepElements,
        plugins: [exports.baiduFilterPlugin]
    };

    const converter = {
        ignoreImages: true,
        ignoreLinks: false,
        citations: false,
        format: 'github'
    };

    return { filter, converter };
}
// Publish the config factory as a named CommonJS export.
exports.createBaiduConfig = createBaiduConfig;