openai-code
Version:
An unofficial proxy layer that lets you use Anthropic Claude Code with any OpenAI API backend.
233 lines (219 loc) • 8.06 kB
JavaScript
import { storeFileName } from './vectordb.mjs'
import { statSync, readFileSync } from 'node:fs'
/**
 * File-name endings (and a few complete file names) that are excluded from
 * embedding.
 *
 * Every string appears exactly once; earlier revisions listed several archive,
 * executable, certificate and backup endings twice.
 *
 * NOTE(review): the matching logic is not part of this block — presumably
 * callers test each entry with `String.prototype.endsWith`; confirm at the
 * call site before relying on that.
 *
 * @type {string[]}
 */
export const nonEmbeddableFileEndings = [
  // --- Generic binary, system and scratch files ---
  '.lock', // lock files (e.g., yarn.lock)
  '.exe', // Windows executable files
  '.dll', // dynamic link library files
  '.bin', // binary files
  '.env', // environment files
  '.zip', // zip archives
  '.tar', // tarball files
  '.gz', // gzip compressed files
  '.bz2', // bzip2 compressed files
  '.xz', // XZ compressed files
  '.7z', // 7-Zip archives
  '.rar', // RAR archives
  '.iso', // disk image files
  '.img', // disk image files
  '.sys', // system files
  '.tmp', // temporary files
  '.log', // log files
  '.dat', // data files
  '.db', // database files
  '.bak', // backup files
  '.swp', // swap files
  '.swo', // swap files

  // --- Compiled objects and libraries ---
  '.class', // Java class files
  '.o', // object files
  '.a', // archive files
  '.so', // shared object files
  '.dylib', // dynamic library files on macOS
  '.lib', // library files
  '.pdb', // program database files

  // --- Installers, disk images and packages ---
  '.msi', // Windows installer package
  '.cab', // cabinet files
  '.vmdk', // virtual machine disk files
  '.vdi', // virtual disk image files
  '.vhd', // virtual hard disk files
  '.vhdx', // virtual hard disk files
  '.qcow2', // QEMU Copy-On-Write version 2 files
  '.dmg', // macOS disk image files
  '.pkg', // package files
  '.deb', // Debian package files
  '.rpm', // Red Hat package manager files

  // --- Further compressed / archive formats ---
  '.z', // Z compressed files
  '.tgz', // tarball gzip compressed files
  '.tbz2', // tarball bzip2 compressed files
  '.txz', // tarball XZ compressed files
  '.lz', // Lzip compressed files
  '.lzma', // LZMA compressed files
  '.lzo', // LZO compressed files
  '.war', // Web application archive files
  '.ear', // Enterprise application archive files
  '.jar', // Java archive files
  '.apk', // Android package files
  '.ipa', // iOS application archive files
  '.app', // macOS application package

  // --- Compiled script artifacts ---
  '.pyc', // Python compiled files
  '.pyo', // Python optimized files
  '.pyd', // Python dynamic module files
  '.rbc', // Ruby compiled files
  '.rbo', // Ruby optimized files
  '.beam', // Erlang compiled files
  '.elc', // Emacs Lisp compiled files
  '.scpt', // AppleScript compiled files
  '.scptd', // AppleScript compiled script bundle

  // --- Other Windows / DOS executables ---
  '.com', // DOS command files
  '.scr', // Windows screen saver files
  '.cpl', // Windows control panel files
  '.msc', // Microsoft Management Console files
  '.gadget', // Windows gadget files

  // --- Generated or non-source assets ---
  '.svg', // Scalable Vector Graphics files
  'package-lock.json', // npm lockfile; listed by full name because it does not end in '.lock'
  '.map', // source map files

  // --- Certificates, keys and signatures ---
  '.pem', // certificate files
  '.key', // key files
  '.csr', // certificate signing request files
  '.crt', // certificate files
  '.cer', // certificate files
  '.pfx', // PKCS #12 files
  '.p12', // PKCS #12 files
  '.p7b', // PKCS #7 files
  '.p7r', // PKCS #7 files
  '.jks', // Java KeyStore files
  '.keystore', // Java KeyStore files
  '.der', // DER encoded files
  '.crl', // certificate revocation list files
  '.sst', // Microsoft Serialized Certificate Store files
  '.rsa', // RSA key files
  '.dsa', // DSA key files
  '.ec', // EC key files
  '.pub', // public key files
  '.sig', // signature files
  '.pgp', // PGP files
  '.asc', // ASCII armored files

  // --- Leftover copies and minified bundles ---
  '.old', // old files
  '.orig', // original files
  '.min.js', // minified JavaScript files
  '.min.css', // minified CSS files

  // --- Tool ignore files and OS metadata ---
  '.npmignore', // npm ignore files
  '.gitignore', // Git ignore files
  '.dockerignore', // Docker ignore files
  '.eslintignore', // ESLint ignore files
  '.prettierignore', // Prettier ignore files
  '.DS_Store', // macOS directory attribute files
  'Thumbs.db', // Windows thumbnail cache files

  // --- Files maintained by this tool ---
  'CLAUDE.md', // CLAUDE metadata files
  'CLAUDE_RULES.md', // CLAUDE rules files
  'CLAUDE_STATE.json', // CLAUDE state files
  storeFileName, // the vector database file itself (name imported from ./vectordb.mjs)
];
/**
 * Directory names that are skipped.
 *
 * NOTE(review): how the names are compared (exact match vs. path segment) is
 * decided by the consumer, which is not visible in this block.
 *
 * @type {string[]}
 */
export const skipDirectoryNames = [
  // Installed dependencies and data folders
  'node_modules', 'data',

  // Version-control metadata, the Claude ignore entry, keys and certificates
  '.git', '.claudeignore', '.svn', '.ssh', '.cert', '.certs', '.hg',

  // Editor / IDE settings
  '.vscode', '.idea', '.vs',

  // Code hosting and CI configuration
  '.github', '.gitlab', '.circleci', '.travis', '.appveyor',

  // Container, VM, infrastructure and editor-test tooling
  '.docker', '.vagrant', '.terraform', '.vscode-test',

  // Build products and language environments
  'build', 'dist', 'venv', 'target', 'bin', 'obj',

  // Logs, scratch space, caches and generated reports
  'logs', 'tmp', 'temp', 'cache', 'out', 'coverage',
];
/**
 * Decide whether a character counts as printable text.
 *
 * Only the first code point of `ch` is examined. Accepted:
 *   - tab, line feed and carriage return;
 *   - the printable ASCII range U+0020–U+007E;
 *   - every code point from U+00A0 upward, except lone surrogates and the
 *     Specials block U+FFF0–U+FFFF.
 *
 * Everything this function accepted before (ASCII plus the CJK, Hiragana,
 * Katakana and Hangul ranges) is still accepted; other scripts such as
 * accented Latin, Cyrillic or Greek, and characters outside the BMP, are now
 * accepted as well instead of being reported as non-printable.
 *
 * @param {string} ch - String whose first code point is classified.
 * @returns {boolean} `true` if the code point is printable, `false` otherwise
 *   (including for an empty string).
 */
export function isPrintable(ch) {
  // Read a whole code point so a surrogate pair is judged as one character.
  const code = ch.codePointAt(0);

  // Empty string: nothing to classify.
  if (code === undefined) {
    return false;
  }

  // Whitespace control characters that are normal in text files.
  if (code === 0x09 || code === 0x0a || code === 0x0d) {
    return true;
  }

  // Remaining C0 control characters.
  if (code < 0x20) {
    return false;
  }

  // Printable ASCII: letters, digits, space and punctuation.
  if (code <= 0x7e) {
    return true;
  }

  // DEL (U+007F) and the C1 control range (U+0080–U+009F).
  if (code < 0xa0) {
    return false;
  }

  // A lone surrogate is half of a pair, not a character.
  if (code >= 0xd800 && code <= 0xdfff) {
    return false;
  }

  // Specials block; contains U+FFFD, the replacement character that decoders
  // substitute for bytes that are not valid in the expected encoding.
  if (code >= 0xfff0 && code <= 0xffff) {
    return false;
  }

  return true;
}
/**
 * Heuristically decide whether the file at `filePath` is a text file that is
 * small enough to embed.
 *
 * The file is rejected when any of the following holds:
 *   - the path is not a regular file (directory, FIFO, device, ...);
 *   - it is larger than 128 KiB;
 *   - it contains a NUL byte;
 *   - it consists of fewer than two lines;
 *   - fewer than 90% of the characters in its first 1024 UTF-16 code units
 *     are printable according to `isPrintable`.
 *
 * Errors raised while inspecting or reading the file are logged with
 * `console.error` and reported as "not embeddable".
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {boolean} `true` if the file passes every check, otherwise `false`.
 */
export function isEmbeddableTextFile(filePath) {
  const maxFileBytes = 128 * 1024;
  const sampleLength = 1024;
  const minPrintableRatio = 0.9;

  try {
    const stats = statSync(filePath);

    // Directories cannot be read as files, and FIFOs/devices report a size
    // that says nothing about how much data a read would yield — bail out
    // before touching readFileSync.
    if (!stats.isFile()) return false;

    if (stats.size > maxFileBytes) return false;

    const buffer = readFileSync(filePath);

    // Quick binary check: a NUL byte almost never occurs in text.
    if (buffer.includes(0)) return false;

    // Decode as UTF-8.
    const text = buffer.toString('utf8');

    // Single-line content is not considered embeddable.
    if (text.split(/\r?\n/).length < 2) return false;

    // Only the head of the file is sampled.
    const sample = text.slice(0, sampleLength);

    // Iterate by code point so a surrogate pair is classified once, as a
    // whole, rather than as two separate halves.
    let totalCount = 0;
    let printableCount = 0;
    for (const ch of sample) {
      totalCount++;
      if (isPrintable(ch)) {
        printableCount++;
      }
    }

    const ratio = printableCount / totalCount;
    if (ratio < minPrintableRatio) return false;
    return true;
  } catch (err) {
    console.error('Error reading file:', err);
    return false;
  }
}