article-writer-cn
Version:
AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作
130 lines • 3.08 kB
JavaScript
/**
* 爬虫工具函数
*/
/**
 * Heuristically decide whether a URL looks like it belongs to a documentation site.
 *
 * The URL is lower-cased and searched for a fixed list of host fragments and
 * path segments; it is never parsed, so a marker matches anywhere in the string.
 *
 * @param {string} url - URL to inspect.
 * @returns {boolean} `true` if at least one marker occurs in the URL.
 */
export function isDocumentationSite(url) {
    const haystack = url.toLowerCase();
    const markers = [
        'docs.',
        '/docs/',
        '/guide/',
        '/api/',
        '/documentation/',
        '/reference/',
        'readthedocs.io',
        'gitbook.io',
        '/manual/',
        '/tutorial/',
    ];
    for (const marker of markers) {
        if (haystack.includes(marker)) {
            return true;
        }
    }
    return false;
}
/**
 * Turn an arbitrary string into a dash-separated file name.
 *
 * The characters `< > : " / \ | ? *` and every whitespace run become `-`,
 * consecutive dashes are collapsed, leading/trailing dashes are removed and
 * the result is capped at 200 UTF-16 code units.
 *
 * @param {string} filename - Raw name (e.g. a page title or URL path).
 * @returns {string} Sanitized name; may be empty if nothing usable remains.
 */
export function sanitizeFilename(filename) {
    const MAX_LENGTH = 200;
    const cleaned = filename
        .replace(/[<>:"/\\|?*]/g, '-')
        .replace(/\s+/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-+/, '');
    let result = cleaned;
    if (cleaned.length > MAX_LENGTH) {
        result = cleaned.substring(0, MAX_LENGTH);
        // Cutting by code units can split a surrogate pair; drop the orphaned
        // high surrogate instead of emitting an invalid character.
        const lastUnit = result.charCodeAt(result.length - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            result = result.slice(0, -1);
        }
    }
    // Trailing dashes are stripped after truncation so that a cut landing on a
    // separator does not leave a dangling '-'.
    return result.replace(/-+$/, '');
}
/**
 * Estimate how long a crawl will take.
 *
 * Each page is assumed to cost a flat 1000 ms of request time plus the
 * configured rate-limit pause, and pages are assumed to be spread evenly over
 * the concurrent workers.
 *
 * @param {number} pageCount - Number of pages to crawl.
 * @param {number} [rateLimit=500] - Pause added per page, in milliseconds.
 * @param {number} [concurrency=5] - Number of pages processed in parallel.
 * @returns {number} Estimated total duration in milliseconds (not rounded).
 */
export function estimateDuration(pageCount, rateLimit = 500, concurrency = 5) {
    const REQUEST_MS = 1000;
    const perPageMs = REQUEST_MS + rateLimit;
    const pagesPerWorker = pageCount / concurrency;
    return pagesPerWorker * perPageMs;
}
/**
 * Format a byte count as a human-readable string using 1024-based units.
 *
 * Zero is rendered as `'0 B'`; every other finite value is rendered with two
 * decimals and the largest unit that keeps the number at or above 1, capped at
 * the last known unit. Fractions below one byte stay in `B`, and negative
 * values keep their sign.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} Formatted size, e.g. `'1.50 KB'`.
 */
export function formatBytes(bytes) {
    if (bytes === 0)
        return '0 B';
    // NaN / ±Infinity cannot be scaled; report them as-is with the base unit.
    if (!Number.isFinite(bytes))
        return `${bytes} B`;
    const base = 1024;
    const units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB'];
    // Repeated division (instead of a log ratio) is exact for powers of two and
    // keeps the unit index inside the bounds of `units`.
    let scaled = Math.abs(bytes);
    let unitIndex = 0;
    while (scaled >= base && unitIndex < units.length - 1) {
        scaled /= base;
        unitIndex += 1;
    }
    const signed = bytes < 0 ? -scaled : scaled;
    return `${signed.toFixed(2)} ${units[unitIndex]}`;
}
/**
 * Pause asynchronously for the given duration.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
export function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
/**
 * Run an async operation, retrying with exponential back-off when it fails.
 *
 * After each failure the function logs a notice, waits `delay` milliseconds
 * and tries again; the wait doubles after every attempt. Once the retry budget
 * is used up the most recent error is rethrown unchanged.
 *
 * @param {() => (Promise<*>|*)} fn - Operation to execute; called with no arguments.
 * @param {number} [retries=3] - Retries allowed after the first attempt. Any
 *   value that is not greater than zero (0, negatives, NaN) means "no retries".
 * @param {number} [delay=1000] - Wait before the first retry, in milliseconds.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error from the final failed attempt.
 */
export async function retry(fn, retries = 3, delay = 1000) {
    let remaining = retries;
    let wait = delay;
    for (;;) {
        try {
            return await fn();
        }
        catch (error) {
            // `> 0` rather than `=== 0`: a negative, fractional or NaN budget
            // must terminate instead of counting down past zero forever.
            if (!(remaining > 0))
                throw error;
            console.log(`⚠️ 失败,${wait}ms 后重试... (剩余 ${remaining} 次)`);
            await sleep(wait);
            remaining -= 1;
            wait *= 2;
        }
    }
}
/**
 * Report whether a string can be parsed as an absolute URL.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} `true` if the `URL` constructor accepts the value, `false` if it throws.
 */
export function isValidUrl(url) {
    let parsable = true;
    try {
        // Constructed only to find out whether parsing throws.
        new URL(url);
    }
    catch {
        parsable = false;
    }
    return parsable;
}
/**
 * Extract the host name from a URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The URL's `hostname` (no port), or an empty string when
 *   the input cannot be parsed.
 */
export function getDomain(url) {
    let host = '';
    try {
        ({ hostname: host } = new URL(url));
    }
    catch {
        // Unparsable input: fall through with the empty default.
    }
    return host;
}
/**
 * Normalize a URL so that equivalent links compare equal.
 *
 * The fragment is dropped and common tracking query parameters are removed:
 * the five UTM parameters plus `ref` and `source`. All other query parameters
 * are kept.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The normalized `href`, or the input unchanged when it
 *   cannot be parsed as a URL.
 */
export function normalizeUrl(url) {
    // utm_term / utm_content complete the standard UTM set; without them two
    // links differing only in those parameters would not normalize equally.
    const trackingParams = [
        'utm_source',
        'utm_medium',
        'utm_campaign',
        'utm_term',
        'utm_content',
        'ref',
        'source',
    ];
    try {
        const parsed = new URL(url);
        parsed.hash = '';
        for (const param of trackingParams) {
            parsed.searchParams.delete(param);
        }
        return parsed.href;
    }
    catch {
        return url;
    }
}
/**
 * Guess whether a text is predominantly Chinese or English.
 *
 * Counts characters in the range U+4E00–U+9FA5 and ASCII letters, then
 * reports whichever group is larger.
 *
 * @param {string} text - Text to classify.
 * @returns {'zh'|'en'|'unknown'} `'zh'` or `'en'` for the larger group,
 *   `'unknown'` when the counts are equal (including when both are zero).
 */
export function detectLanguage(text) {
    const tally = (pattern) => {
        const hits = text.match(pattern);
        return hits ? hits.length : 0;
    };
    const hanCount = tally(/[\u4e00-\u9fa5]/g);
    const latinCount = tally(/[a-zA-Z]/g);
    if (hanCount === latinCount) {
        return 'unknown';
    }
    return hanCount > latinCount ? 'zh' : 'en';
}
//# sourceMappingURL=utils.js.map