polish-cli

AI-powered file organization for Obsidian with automatic markdown conversion

FileProcessor.js — 395 lines, 17.4 kB
import * as fs from 'fs/promises';
import * as path from 'path';
import * as os from 'os';
import { FileType } from '../types/index.js';
import { MarkdownGenerator } from './MarkdownGenerator.js';
import { ContentExtractor } from './ContentExtractor.js';
import { sanitizeFilename, getDatePath } from '../utils/formatting.js';

export class FileProcessor {
    config;
    claudeService;
    markdownGenerator;
    contentExtractor;

    constructor(config, claudeService) {
        this.config = config;
        this.claudeService = claudeService;
        this.markdownGenerator = new MarkdownGenerator();
        this.contentExtractor = new ContentExtractor();
    }

    async processFiles(files, options) {
        const results = {
            processed: [],
            failed: [],
            summary: {
                total: files.length,
                successful: 0,
                failed: 0,
                duration: 0,
            },
        };
        const startTime = Date.now();
        for (let i = 0; i < files.length; i++) {
            const file = files[i];
            if (options.onProgress) {
                options.onProgress(i + 1, files.length, file);
            }
            try {
                const processed = await this.processFile(file, options);
                results.processed.push(processed);
                results.summary.successful++;
            }
            catch (error) {
                results.failed.push({
                    file,
                    error: error instanceof Error ? error.message : 'Unknown error',
                });
                results.summary.failed++;
            }
        }
        results.summary.duration = Date.now() - startTime;
        return results;
    }

    async processFile(file, options) {
        // Handle archives with recursive processing
        if (file.type === FileType.Archive) {
            return await this.processArchiveRecursively(file, options);
        }
        // Extract content
        const content = await this.contentExtractor.extract(file);
        // Get tag suggestions
        const tagSuggestions = await this.claudeService.suggestTags(file, content || undefined);
        const tags = tagSuggestions
            .sort((a, b) => b.confidence - a.confidence)
            .slice(0, this.config.tagging.maxTags)
            .map(t => t.tag);
        // Get category suggestion
        const existingFolders = await this.getExistingVaultFolders();
        const categorySuggestion = await this.claudeService.suggestCategory(file, existingFolders);
        // Determine paths
        const vaultCategory = this.mapCategoryToVaultFolder(categorySuggestion.category);
        const markdownPath = path.join(this.config.vault.path, vaultCategory, sanitizeFilename(path.parse(file.name).name + '.md'));
        const originalNewPath = this.getOriginalFilePath(file, categorySuggestion.category);
        // Generate markdown content
        const frontmatter = {
            title: path.parse(file.name).name,
            originalFile: `[[file://${originalNewPath}]]`,
            sourceLocation: file.path, // Keep reference to source for debugging
            fileType: file.extension,
            created: file.createdAt.toISOString(),
            processed: new Date().toISOString(),
            tags,
        };
        const markdownContent = this.markdownGenerator.generate(file, content, frontmatter);
        // Execute file operations (unless dry run)
        if (!options.dryRun) {
            await this.ensureDirectory(path.dirname(markdownPath));
            await this.ensureDirectory(path.dirname(originalNewPath));
            await fs.writeFile(markdownPath, markdownContent);
            if (options.copy) {
                await fs.copyFile(file.path, originalNewPath);
            }
            else {
                await fs.rename(file.path, originalNewPath);
            }
        }
        return {
            original: file,
            markdownPath,
            originalNewPath,
            content: markdownContent,
            frontmatter,
            tags,
            category: categorySuggestion.category,
        };
    }

    async getExistingVaultFolders() {
        try {
            const entries = await fs.readdir(this.config.vault.path, { withFileTypes: true });
            return entries
                .filter(entry => entry.isDirectory() && !entry.name.startsWith('.'))
                .map(entry => entry.name);
        }
        catch {
            return [];
        }
    }

    mapCategoryToVaultFolder(category) {
        const typeMap = {
            document: this.config.vault.structure.documents,
            image: this.config.vault.structure.media,
            code: this.config.vault.structure.code,
            media: this.config.vault.structure.media,
        };
        return typeMap[category.toLowerCase()] || this.config.vault.structure.references;
    }

    getOriginalFilePath(file, category) {
        let basePath = this.config.originals.path;
        if (this.config.originals.createYearFolders) {
            basePath = path.join(basePath, file.modifiedAt.getFullYear().toString());
        }
        if (this.config.originals.organizationStyle === 'type-based') {
            basePath = path.join(basePath, category);
        }
        else if (this.config.originals.organizationStyle === 'date-based') {
            basePath = path.join(basePath, getDatePath());
        }
        return path.join(basePath, file.name);
    }

    async ensureDirectory(dirPath) {
        await fs.mkdir(dirPath, { recursive: true });
    }

    async processArchiveRecursively(file, options) {
        const tempExtractDir = await fs.mkdtemp(path.join(os.tmpdir(), 'polish-archive-'));
        try {
            console.log(`📦 Expanding archive: ${file.name}`);
            // Extract archive contents
            const extractedFiles = await this.extractArchive(file, tempExtractDir);
            if (extractedFiles.length === 0) {
                console.log(`⚠️ Archive ${file.name} appears to be empty or could not be extracted`);
                return await this.processFileAsNormal(file, options);
            }
            console.log(`📁 Found ${extractedFiles.length} files in archive`);
            // Process each extracted file recursively
            const childResults = [];
            for (const extractedFile of extractedFiles) {
                try {
                    // Create FileInfo for the extracted file
                    const stat = await fs.stat(extractedFile);
                    const fileInfo = {
                        path: extractedFile,
                        name: path.basename(extractedFile),
                        extension: path.extname(extractedFile).slice(1).toLowerCase(),
                        size: stat.size,
                        createdAt: stat.birthtime,
                        modifiedAt: stat.mtime,
                        type: this.determineFileType(path.extname(extractedFile).slice(1).toLowerCase()),
                    };
                    // Recursively process the extracted file
                    const processed = await this.processFile(fileInfo, options);
                    childResults.push(processed);
                }
                catch (error) {
                    console.warn(`Failed to process extracted file ${extractedFile}:`, error);
                }
            }
            // Create summary markdown for the archive
            const archiveContent = this.generateArchiveSummary(file, childResults);
            const tags = await this.generateArchiveTags(file, childResults);
            // Determine paths for the archive summary
            const categorySuggestion = await this.claudeService.suggestCategory(file, await this.getExistingVaultFolders());
            const vaultCategory = this.mapCategoryToVaultFolder(categorySuggestion.category);
            const markdownPath = path.join(this.config.vault.path, vaultCategory, sanitizeFilename(path.parse(file.name).name + '_archive.md'));
            const originalNewPath = this.getOriginalFilePath(file, categorySuggestion.category);
            // Generate frontmatter for archive summary
            const frontmatter = {
                title: `Archive: ${path.parse(file.name).name}`,
                originalFile: `[[file://${originalNewPath}]]`,
                sourceLocation: file.path,
                fileType: file.extension,
                archiveType: 'expanded',
                extractedFiles: childResults.length,
                created: file.createdAt.toISOString(),
                processed: new Date().toISOString(),
                tags,
            };
            const markdownContent = this.markdownGenerator.generate(file, archiveContent, frontmatter);
            // Execute file operations (unless dry run)
            if (!options.dryRun) {
                await this.ensureDirectory(path.dirname(markdownPath));
                await this.ensureDirectory(path.dirname(originalNewPath));
                await fs.writeFile(markdownPath, markdownContent);
                if (options.copy) {
                    await fs.copyFile(file.path, originalNewPath);
                }
                else {
                    await fs.rename(file.path, originalNewPath);
                }
            }
            return {
                original: file,
                markdownPath,
                originalNewPath,
                content: markdownContent,
                frontmatter,
                tags,
                category: categorySuggestion.category,
            };
        }
        finally {
            // Always clean up the temporary extraction directory
            try {
                await fs.rm(tempExtractDir, { recursive: true, force: true });
            }
            catch (error) {
                console.warn(`Failed to clean up temporary directory ${tempExtractDir}:`, error);
            }
        }
    }

    async extractArchive(file, outputDir) {
        const extractedFiles = [];
        try {
            if (file.extension.toLowerCase() === 'zip') {
                try {
                    // Dynamically import unzipper to handle cases where it's not installed
                    const unzipperModule = await import('unzipper');
                    const unzipper = unzipperModule.default || unzipperModule;
                    const archive = await unzipper.Open.file(file.path);
                    for (const entry of archive.files) {
                        if (!entry.type || entry.type === 'File') {
                            // Security check: prevent path traversal
                            const safePath = path.join(outputDir, entry.path.replace(/^\/+/, '').replace(/\.\./g, ''));
                            // Ensure the directory exists
                            await fs.mkdir(path.dirname(safePath), { recursive: true });
                            // Extract the file
                            const buffer = await entry.buffer();
                            await fs.writeFile(safePath, buffer);
                            extractedFiles.push(safePath);
                        }
                    }
                }
                catch (importError) {
                    console.warn(`Archive extraction requires 'unzipper' package. Install with: npm install unzipper`);
                    console.warn(`Skipping extraction of ${file.name}`);
                }
            }
            else {
                // For other archive types, we could add support for tar, rar, etc.
                console.warn(`Archive type ${file.extension} not yet supported for extraction`);
            }
        }
        catch (error) {
            console.warn(`Failed to extract archive ${file.name}:`, error);
        }
        return extractedFiles;
    }

    async processFileAsNormal(file, options) {
        // This is the original processFile logic for non-archives
        const content = await this.contentExtractor.extract(file);
        const tagSuggestions = await this.claudeService.suggestTags(file, content || undefined);
        const tags = tagSuggestions
            .sort((a, b) => b.confidence - a.confidence)
            .slice(0, this.config.tagging.maxTags)
            .map(t => t.tag);
        const existingFolders = await this.getExistingVaultFolders();
        const categorySuggestion = await this.claudeService.suggestCategory(file, existingFolders);
        const vaultCategory = this.mapCategoryToVaultFolder(categorySuggestion.category);
        const markdownPath = path.join(this.config.vault.path, vaultCategory, sanitizeFilename(path.parse(file.name).name + '.md'));
        const originalNewPath = this.getOriginalFilePath(file, categorySuggestion.category);
        const frontmatter = {
            title: path.parse(file.name).name,
            originalFile: `[[file://${originalNewPath}]]`,
            sourceLocation: file.path,
            fileType: file.extension,
            created: file.createdAt.toISOString(),
            processed: new Date().toISOString(),
            tags,
        };
        const markdownContent = this.markdownGenerator.generate(file, content, frontmatter);
        if (!options.dryRun) {
            await this.ensureDirectory(path.dirname(markdownPath));
            await this.ensureDirectory(path.dirname(originalNewPath));
            await fs.writeFile(markdownPath, markdownContent);
            if (options.copy) {
                await fs.copyFile(file.path, originalNewPath);
            }
            else {
                await fs.rename(file.path, originalNewPath);
            }
        }
        return {
            original: file,
            markdownPath,
            originalNewPath,
            content: markdownContent,
            frontmatter,
            tags,
            category: categorySuggestion.category,
        };
    }

    generateArchiveSummary(archiveFile, extractedFiles) {
        const summary = [
            `# Archive: ${archiveFile.name}`,
            '',
            `This archive contained ${extractedFiles.length} files that were extracted and processed:`,
            '',
        ];
        // Group by category
        const byCategory = extractedFiles.reduce((groups, file) => {
            const category = file.category || 'unknown';
            if (!groups[category])
                groups[category] = [];
            groups[category].push(file);
            return groups;
        }, {});
        Object.entries(byCategory).forEach(([category, files]) => {
            summary.push(`## ${category.charAt(0).toUpperCase() + category.slice(1)} Files (${files.length})`);
            summary.push('');
            files.forEach(file => {
                const linkName = path.parse(file.original.name).name;
                const relativePath = path.relative(this.config.vault.path, file.markdownPath);
                summary.push(`- [[${relativePath.replace(/\.md$/, '')}|${linkName}]]`);
            });
            summary.push('');
        });
        return summary.join('\n');
    }

    async generateArchiveTags(archiveFile, extractedFiles) {
        const tags = new Set();
        // Add archive-specific tags
        tags.add('type/archive');
        tags.add(`format/${archiveFile.extension}`);
        tags.add('source/expanded');
        // Add date tag
        const year = archiveFile.modifiedAt.getFullYear();
        const month = String(archiveFile.modifiedAt.getMonth() + 1).padStart(2, '0');
        tags.add(`date/${year}/${month}`);
        // Aggregate tags from extracted files
        const childTags = new Set();
        extractedFiles.forEach(file => {
            file.tags.forEach(tag => {
                if (!tag.startsWith('type/') && !tag.startsWith('format/')) {
                    childTags.add(tag);
                }
            });
        });
        // Add most common child tags
        const tagCounts = Array.from(childTags).map(tag => ({
            tag,
            count: extractedFiles.filter(file => file.tags.includes(tag)).length
        }));
        tagCounts
            .sort((a, b) => b.count - a.count)
            .slice(0, 5)
            .forEach(({ tag }) => tags.add(`contains/${tag.split('/').pop()}`));
        return Array.from(tags).slice(0, this.config.tagging.maxTags);
    }

    determineFileType(extension) {
        const ext = extension.toLowerCase();
        // Documents
        if (['pdf', 'doc', 'docx', 'txt', 'md', 'rtf', 'odt'].includes(ext)) {
            return FileType.Document;
        }
        // Images
        if (['jpg', 'jpeg', 'png', 'gif', 'bmp', 'svg', 'webp'].includes(ext)) {
            return FileType.Image;
        }
        // Code
        if (['js', 'ts', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'rb', 'php'].includes(ext)) {
            return FileType.Code;
        }
        // Data
        if (['json', 'csv', 'xml', 'yaml', 'yml'].includes(ext)) {
            return FileType.Data;
        }
        // Media
        if (['mp3', 'wav', 'mp4', 'avi', 'mov', 'mkv'].includes(ext)) {
            return FileType.Media;
        }
        // Archives
        if (['zip', 'rar', 'tar', 'gz', '7z'].includes(ext)) {
            return FileType.Archive;
        }
        return FileType.Unknown;
    }
}
//# sourceMappingURL=FileProcessor.js.map
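
For orientation, a minimal usage sketch follows. It is pieced together from the code above rather than from the package's documentation: the import path is a guess at the compiled layout, the `config` fields (`vault`, `originals`, `tagging`) mirror what `FileProcessor` reads, and the `claudeService` object is a stand-in exposing only the two methods the class calls (`suggestTags` and `suggestCategory`).

// Illustrative sketch only — import path, config shape, and FileInfo fields are
// inferred from the class above, not taken from polish-cli's documentation.
import { FileProcessor } from 'polish-cli/dist/core/FileProcessor.js'; // hypothetical path

const config = {
  vault: {
    path: '/Users/me/Vault',
    structure: { documents: 'Documents', media: 'Media', code: 'Code', references: 'References' },
  },
  originals: {
    path: '/Users/me/Vault/_originals',
    createYearFolders: true,
    organizationStyle: 'type-based', // or 'date-based'
  },
  tagging: { maxTags: 8 },
};

// Stand-in for the package's Claude-backed service; FileProcessor only calls
// suggestTags() and suggestCategory().
const claudeService = {
  async suggestTags(file, _content) {
    return [{ tag: `format/${file.extension}`, confidence: 1 }];
  },
  async suggestCategory(_file, _existingFolders) {
    return { category: 'document' };
  },
};

// A FileInfo-shaped object as processFile() expects it (field names taken from
// processArchiveRecursively(), which builds the same shape from fs.stat()).
const files = [{
  path: '/Users/me/Downloads/report.pdf',
  name: 'report.pdf',
  extension: 'pdf',
  size: 12345,
  createdAt: new Date(),
  modifiedAt: new Date(),
  type: 'document', // the real code compares against the package's FileType enum
}];

const processor = new FileProcessor(config, claudeService);
const results = await processor.processFiles(files, {
  dryRun: true, // report what would happen without writing or moving anything
  copy: false,  // move originals instead of copying them
  onProgress: (done, total, file) => console.log(`${done}/${total} ${file.name}`),
});
console.log(results.summary);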