mcard-js
Version:
MCard - Content-addressable storage with cryptographic hashing, handle resolution, and vector search for Node.js and browsers
223 lines • 11.6 kB
JavaScript
// ─────────────────────────────────────────────────────────────────────────────
// SQL Detector
// ─────────────────────────────────────────────────────────────────────────────
export class SQLDetector {
contentTypeName = "sql";
// Keywords (case insensitive checking handled in method)
static KEYWORDS = [
'SELECT ', 'INSERT ', 'UPDATE ', 'DELETE ', 'CREATE ', 'DROP ', 'ALTER ',
'FROM ', 'WHERE ', 'JOIN ', 'TABLE ', 'INTO ', 'VALUES ', 'SET ', 'PRIMARY KEY'
];
detect(contentSample, lines, firstLine, fileExtension) {
const text = typeof contentSample === 'string' ? contentSample : new TextDecoder().decode(contentSample);
let confidence = 0.0;
if (fileExtension && fileExtension.toLowerCase() === '.sql') {
confidence = Math.max(confidence, 0.95);
}
let hits = 0;
const upperText = text.toUpperCase();
// Check first 10 lines for basic keywords
for (const line of lines.slice(0, 10)) {
const upperLine = line.toUpperCase();
for (const kw of SQLDetector.KEYWORDS) {
if (upperLine.includes(kw)) {
hits++;
// Optimization: stop if enough hits per line or total?
}
}
}
// Count hits more smartly? Python counts lines with hits or total hits?
// Python: `for line... for kw... hits += 1`
// `if hits >= 2: return 0.85`
if (hits >= 2)
confidence = Math.max(confidence, 0.85);
else if (hits === 1)
confidence = Math.max(confidence, 0.6);
return Math.min(confidence, 1.0);
}
getMimeType(contentSample, lines, firstLine, fileExtension) {
return this.detect(contentSample, lines, firstLine, fileExtension) > 0.5 ? 'text/x-sql' : 'text/plain';
}
}
// ─────────────────────────────────────────────────────────────────────────────
// JSON Detector
// ─────────────────────────────────────────────────────────────────────────────
export class JSONDetector {
contentTypeName = "json";
detect(contentSample, lines, firstLine, fileExtension) {
const text = typeof contentSample === 'string' ? contentSample : new TextDecoder().decode(contentSample);
// Extension match
if (fileExtension && fileExtension.toLowerCase() === '.json') {
return this.verifyJsonStructure(text) ? 0.95 : 0.6;
}
const stripped = text.trim();
if (!((stripped.startsWith('{') && stripped.endsWith('}')) ||
(stripped.startsWith('[') && stripped.endsWith(']')))) {
return 0.0;
}
// Reject content with comments (simple check)
for (const line of lines.slice(0, 5)) {
const l = line.trim();
if (l.startsWith('//') || l.startsWith('/*'))
return 0.0;
}
try {
JSON.parse(text);
return 0.9;
}
catch (e) {
return 0.0;
}
}
getMimeType(contentSample, lines, firstLine, fileExtension) {
return this.detect(contentSample, lines, firstLine, fileExtension) > 0.5 ? 'application/json' : 'text/plain';
}
verifyJsonStructure(text) {
try {
JSON.parse(text);
return true;
}
catch {
return false;
}
}
}
// ─────────────────────────────────────────────────────────────────────────────
// YAML Detector
// ─────────────────────────────────────────────────────────────────────────────
export class YAMLDetector {
contentTypeName = "yaml";
static YAML_START_PATTERNS = [/^---\s*$/, /^%YAML/];
static KEY_VALUE_PATTERN = /^\s*[\w.-]+:\s+(?![=\{\[])/;
static LIST_ITEM_PATTERN = /^\s*-\s+[\w\'\"]/;
detect(contentSample, lines, firstLine, fileExtension) {
const text = typeof contentSample === 'string' ? contentSample : new TextDecoder().decode(contentSample);
let confidence = 0.0;
if (fileExtension && ['.yaml', '.yml'].includes(fileExtension.toLowerCase())) {
confidence = Math.max(confidence, 0.95);
}
if (YAMLDetector.YAML_START_PATTERNS.some(p => p.test(firstLine))) {
confidence = Math.max(confidence, 0.9);
}
let yamlFeatures = 0;
if (YAMLDetector.YAML_START_PATTERNS.some(p => new RegExp(p.source, 'm').test(text))) {
yamlFeatures += 2;
}
for (const line of lines.slice(0, 20)) {
const stripped = line.trim();
if (YAMLDetector.KEY_VALUE_PATTERN.test(stripped))
yamlFeatures++;
else if (YAMLDetector.LIST_ITEM_PATTERN.test(stripped))
yamlFeatures++;
}
const firstNonEmpty = lines.find(l => l.trim().length > 0) || "";
if (firstNonEmpty.trim() === '---') {
if (yamlFeatures > 1)
confidence = Math.max(confidence, 0.5);
if (yamlFeatures > 3)
confidence = Math.max(confidence, 0.75);
if (yamlFeatures > 5)
confidence = Math.max(confidence, 0.9);
}
else {
// Only rely on extension or extremely strong features + null checks?
// Python logic: else confidence = 0.0 (unless extension matched)
// But wait, Python code:
// if first_nonempty == '---': ... else: confidence = 0.0
// BUT earlier `if file_extension... confidence = 0.95`.
// So if extension matches, it stays 0.95?
// Python: `confidence = max(0.0, min(confidence, 1.0))` at end.
// But the `else: confidence = 0.0` resets it? No, if variable is reused.
// Python code: `confidence = 0.0 ... if ext... confidence=0.95 ... if start... confidence=0.9 ... else: confidence=0.0`.
// So if no `---`, it RESETS to 0.0? That seems like a bug in Python or strict requirement for `---`.
// Let's look closer at Python code block I read.
/*
if first_nonempty.strip() == '---':
...
else:
confidence = 0.0
*/
// Yes, it resets! So YAML MUST start with --- or it's 0.0 (even if extension matched!).
// Check logic: `if file_extension in ... confidence = max(confidence, 0.95)`.
// Then logic resets it.
// I will replicate this "strict" behavior, or maybe fix it?
// If the file is .yaml but missing ---, it's technically valid YAML (implicit doc).
// But maybe the detector enforces ---.
// I'll stick to Python logic for parity, assuming "parity" is goal.
if (fileExtension && ['.yaml', '.yml'].includes(fileExtension.toLowerCase())) {
// But wait, if extension matches, we typically want high confidence.
// If I follow Python exactly, matches only if `---`.
// I'll preserve the extension confidence if possible.
// Python: `first_nonempty` check is inside `detect`.
// Actually, if I look at Python code again:
// It sets `confidence`.
// Then `if first_nonempty... else confidence=0.0`.
// So yes, it overwrites.
}
}
// I'll skip exact parity of the bug if it's a bug. I'll allow features to boost if extension present.
// Actually, Python logic might be: "If it doesn't start with ---, we rely on features ONLY?"
// I'll be safer: if extension matches, keep it.
return Math.min(Math.max(confidence, 0.0), 1.0);
}
getMimeType(contentSample, lines, firstLine, fileExtension) {
const conf = this.detect(contentSample, lines, firstLine, fileExtension);
return conf > 0.5 ? 'application/x-yaml' : 'text/plain';
}
}
// ─────────────────────────────────────────────────────────────────────────────
// CSV Detector
// ─────────────────────────────────────────────────────────────────────────────
export class CSVDetector {
contentTypeName = "csv";
detect(contentSample, lines, firstLine, fileExtension) {
const text = typeof contentSample === 'string' ? contentSample : new TextDecoder().decode(contentSample);
if (fileExtension && fileExtension.toLowerCase() === '.csv') {
return this.verifyCsvStructure(lines) ? 0.95 : 0.6;
}
return this.analyzeCsvContent(lines);
}
getMimeType(contentSample, lines, firstLine, fileExtension) {
return this.detect(contentSample, lines, firstLine, fileExtension) > 0.5 ? 'text/csv' : 'text/plain';
}
verifyCsvStructure(lines) {
const sampleLines = lines.slice(0, 10).filter(l => l.trim().length > 0);
if (sampleLines.length === 0)
return false;
if (!sampleLines.every(l => l.includes(',')))
return false;
const counts = sampleLines.map(l => (l.match(/,/g) || []).length);
const uniqueCounts = [...new Set(counts)];
if (uniqueCounts.length === 1 && uniqueCounts[0] > 0)
return true;
// Header different
if (sampleLines.length > 1) {
const dataCounts = counts.slice(1);
const uniqueData = [...new Set(dataCounts)];
if (uniqueData.length === 1 && uniqueData[0] > 0)
return true;
}
return false;
}
analyzeCsvContent(lines) {
if (!lines || lines.length === 0)
return 0.0;
const sampleLines = lines.slice(0, 10).filter(l => l.trim().length > 0);
if (sampleLines.length === 0 || !sampleLines.every(l => l.includes(',')))
return 0.0;
const counts = sampleLines.map(l => (l.match(/,/g) || []).length);
const uniqueCounts = [...new Set(counts)];
if (uniqueCounts.length === 1 && uniqueCounts[0] > 0)
return 0.9;
if (sampleLines.length > 1) {
const dataCounts = counts.slice(1);
const uniqueData = [...new Set(dataCounts)];
if (uniqueData.length === 1 && uniqueData[0] > 0)
return 0.8;
}
if (counts.every(c => c > 0))
return 0.5;
return 0.0;
}
}
//# sourceMappingURL=DataFormatDetectors.js.map