article-writer-cn
Version:
AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作
171 lines (144 loc) • 4.07 kB
text/typescript
/**
* PDF 文档提取器
*/
import fs from 'fs-extra';
import path from 'path';
import type { PDFExtractionOptions, PDFExtractionResult, CrawledPage } from './types.js';
/**
 * Lazily loads the optional `pdf-parse` dependency via dynamic import.
 *
 * @returns The module's default export when it has one, otherwise the module
 *   namespace object itself.
 * @throws Error with installation instructions when the package cannot be
 *   resolved, or an Error carrying the underlying reason when the package is
 *   found but fails while loading.
 */
async function loadPdfParse() {
  try {
    const pdfParse = await import('pdf-parse');
    return pdfParse.default || pdfParse;
  } catch (err: unknown) {
    // Only a resolution failure means "not installed"; anything else is a
    // genuine load error and must not be disguised as a missing package.
    const code = err instanceof Error ? (err as { code?: unknown }).code : undefined;
    if (code === 'ERR_MODULE_NOT_FOUND' || code === 'MODULE_NOT_FOUND') {
      throw new Error(
        '❌ pdf-parse 未安装。请运行: npm install pdf-parse\n' +
          ' 如果不需要 PDF 功能,可以使用网页爬虫(移除 --pdf 参数)'
      );
    }
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`❌ pdf-parse 加载失败: ${reason}`);
  }
}
/**
 * Extracts text from PDF files and adapts the result to the crawler's
 * `CrawledPage` shape so it can be stored like any other crawled page.
 */
export class PDFExtractor {
  /**
   * Reads a PDF file from disk and extracts its text, page count and metadata.
   *
   * @param pdfPath - Path of the PDF file to read.
   * @param options - Extraction options; not consulted by this method at present.
   * @returns The extracted text, the page count, and the metadata object
   *   reported by pdf-parse (an empty object when none is reported).
   * @throws If pdf-parse cannot be loaded, or the file cannot be read or parsed.
   */
  async extractFromPDF(
    pdfPath: string,
    options: PDFExtractionOptions = {}
  ): Promise<PDFExtractionResult> {
    console.log(`📄 开始提取 PDF: ${pdfPath}`);

    // pdf-parse is an optional dependency, so it is loaded on demand.
    const pdfParse = await loadPdfParse();
    const dataBuffer = await fs.readFile(pdfPath);
    const data = await pdfParse(dataBuffer);

    // NOTE(review): pdf-parse exposes document info (e.g. Title) under `info`,
    // not `metadata` — confirm whether `metadata.Title` is ever populated.
    const result: PDFExtractionResult = {
      text: data.text,
      pages: data.numpages,
      metadata: data.metadata || {}
    };

    console.log(`✅ PDF 提取完成: ${result.pages} 页`);
    return result;
  }

  /**
   * Converts a PDF extraction result into a `CrawledPage`.
   *
   * @param pdfPath - Path of the source PDF; used for the `file://` URL and as
   *   the fallback title (file name without its `.pdf` extension, matched
   *   case-insensitively).
   * @param extractionResult - Result previously produced by `extractFromPDF`.
   * @param name - Accepted for interface compatibility; not used here.
   * @returns A page whose `content` is the text re-flowed into paragraphs,
   *   whose `htmlContent` is the HTML-escaped raw text wrapped in `<pre>`, and
   *   whose `codeExamples` are heuristically detected code snippets.
   */
  convertToCrawledPage(
    pdfPath: string,
    extractionResult: PDFExtractionResult,
    name: string
  ): CrawledPage {
    // Strip the extension regardless of case so "Guide.PDF" becomes "Guide".
    const extension = path.extname(pdfPath);
    const fallbackTitle = path.basename(
      pdfPath,
      extension.toLowerCase() === '.pdf' ? extension : ''
    );
    const title = extractionResult.metadata?.Title || fallbackTitle;

    // Blank lines separate paragraphs; line breaks inside a paragraph are
    // layout artifacts and are collapsed into single spaces.
    const content = extractionResult.text
      .split('\n\n')
      .filter(p => p.trim().length > 0)
      .map(p => p.trim().replace(/\n/g, ' '))
      .join('\n\n');

    // Simple heuristic detection of code snippets in the raw text.
    const codeExamples = this.extractCodeFromText(extractionResult.text);

    return {
      url: `file://${pdfPath}`,
      title,
      content,
      // The PDF text is arbitrary input: escape it so '<', '>' and '&' cannot
      // break (or inject markup into) the generated HTML.
      htmlContent: `<pre>${this.escapeHtml(extractionResult.text)}</pre>`,
      codeExamples,
      category: 'pdf-document',
      scrapedAt: new Date().toISOString(),
      links: []
    };
  }

  /**
   * Escapes the characters that are significant in HTML element content.
   */
  private escapeHtml(raw: string): string {
    return raw
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  /**
   * Heuristically extracts code snippets from plain text.
   *
   * Two patterns are tried in order: fenced blocks (```lang ... ```), then
   * runs of consecutive lines that start with a space. Snippets whose trimmed
   * length is not strictly between 10 and 5000 characters are discarded, and
   * at most the first 20 snippets are returned.
   *
   * @param text - Raw text to scan.
   * @returns Detected snippets with their language (`'text'` when unknown),
   *   trimmed content and line count.
   */
  private extractCodeFromText(text: string): Array<{
    language: string;
    content: string;
    lineCount: number;
  }> {
    const codeExamples: Array<{
      language: string;
      content: string;
      lineCount: number;
    }> = [];

    // Each pattern declares how its own capture groups map to language and
    // content. Sharing one mapping made indented blocks report their whole
    // content as the language, because their only group is match[1].
    const detectors: Array<{
      pattern: RegExp;
      read: (match: RegExpExecArray) => { language: string; raw: string };
    }> = [
      {
        // Fenced code block with an optional language tag.
        pattern: /```(\w+)?\n([\s\S]+?)```/g,
        read: match => ({ language: match[1] || 'text', raw: match[2] || '' })
      },
      {
        // Indented code block.
        // NOTE(review): the pattern requires only a single leading space —
        // confirm whether a four-space indent was intended.
        pattern: /((?:^ .+$\n?)+)/gm,
        read: match => ({ language: 'text', raw: match[1] || '' })
      }
    ];

    for (const { pattern, read } of detectors) {
      let match: RegExpExecArray | null;
      while ((match = pattern.exec(text)) !== null) {
        const { language, raw } = read(match);
        const content = raw.trim();
        if (content.length > 10 && content.length < 5000) {
          codeExamples.push({
            language,
            content,
            lineCount: content.split('\n').length
          });
        }
      }
    }

    return codeExamples.slice(0, 20);
  }

  /**
   * Persists an extracted page under `outputDir`.
   *
   * Writes three files, creating directories as needed:
   * `pages/001-pdf-content.json` (the page), `summary.json` (a one-page
   * summary) and `<name>.md` (title heading followed by the page content).
   *
   * @param outputDir - Directory that receives the output files.
   * @param name - Name recorded in the summary and used as the Markdown file name.
   * @param page - Page to persist.
   */
  async saveExtraction(
    outputDir: string,
    name: string,
    page: CrawledPage
  ): Promise<void> {
    await fs.ensureDir(outputDir);

    // Page as JSON.
    const pagesDir = path.join(outputDir, 'pages');
    await fs.ensureDir(pagesDir);
    await fs.writeJSON(
      path.join(pagesDir, '001-pdf-content.json'),
      page,
      { spaces: 2 }
    );

    // Summary describing the single extracted document.
    const summary = {
      name,
      baseUrl: page.url,
      totalPages: 1,
      categories: { 'pdf-document': 1 },
      crawledAt: page.scrapedAt
    };
    await fs.writeJSON(
      path.join(outputDir, 'summary.json'),
      summary,
      { spaces: 2 }
    );

    // Markdown rendition.
    const mdPath = path.join(outputDir, `${name}.md`);
    await fs.writeFile(mdPath, `# ${page.title}\n\n${page.content}`);

    console.log(`💾 PDF 提取结果已保存: ${outputDir}`);
  }
}