UNPKG

linguist-js

Version:

Analyse languages used in a folder. Powered by GitHub Linguist, although it doesn't need to be installed.

github.com/Nixinova/Linguist

Nixinova/Linguist

485 lines (484 loc) • 25.3 kB

JavaScript

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; const fs_1 = __importDefault(require("fs")); const path_1 = __importDefault(require("path")); const js_yaml_1 = __importDefault(require("js-yaml")); const ignore_1 = __importDefault(require("ignore")); const common_path_prefix_1 = __importDefault(require("common-path-prefix")); const binary_extensions_1 = __importDefault(require("binary-extensions")); const isbinaryfile_1 = require("isbinaryfile"); const walk_tree_1 = __importDefault(require("./helpers/walk-tree")); const load_data_1 = __importStar(require("./helpers/load-data")); const read_file_1 = __importDefault(require("./helpers/read-file")); const parse_gitattributes_1 = __importDefault(require("./helpers/parse-gitattributes")); const convert_pcre_1 = __importDefault(require("./helpers/convert-pcre")); const norm_path_1 = require("./helpers/norm-path"); async function analyse(rawPaths, opts = {}) { var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o, _p, _q, _r, _s, _t; var _u, _v; const useRawContent = opts.fileContent !== undefined; const input = [rawPaths !== null && rawPaths !== void 0 ? rawPaths : []].flat(); const manualFileContent = [(_a = opts.fileContent) !== null && _a !== void 0 ? _a : []].flat(); // Normalise input option arguments opts = { calculateLines: (_b = opts.calculateLines) !== null && _b !== void 0 ? _b : true, // default to true if unset checkIgnored: !opts.quick, checkDetected: !opts.quick, checkAttributes: !opts.quick, checkHeuristics: !opts.quick, checkShebang: !opts.quick, checkModeline: !opts.quick, ...opts, }; // Load data from github-linguist web repo const langData = await (0, load_data_1.default)('languages.yml', opts.offline).then(js_yaml_1.default.load); const vendorData = await (0, load_data_1.default)('vendor.yml', opts.offline).then(js_yaml_1.default.load); const docData = await (0, load_data_1.default)('documentation.yml', opts.offline).then(js_yaml_1.default.load); const heuristicsData = await (0, load_data_1.default)('heuristics.yml', opts.offline).then(js_yaml_1.default.load); const generatedData = await (0, load_data_1.default)('generated.rb', opts.offline).then(load_data_1.parseGeneratedDataFile); const vendorPaths = [...vendorData, ...docData, ...generatedData]; // Setup main variables const fileAssociations = {}; const extensions = {}; const globOverrides = {}; const results = { files: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, results: {}, alternatives: {} }, languages: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, results: {} }, unknown: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, extensions: {}, filenames: {} }, }; // Set a common root path so that vendor paths do not incorrectly match parent folders const resolvedInput = input.map(path => (0, norm_path_1.normPath)(path_1.default.resolve(path))); const commonRoot = (input.length > 1 ? (0, common_path_prefix_1.default)(resolvedInput) : resolvedInput[0]).replace(/\/?$/, ''); const relPath = (file) => useRawContent ? file : (0, norm_path_1.normPath)(path_1.default.relative(commonRoot, file)); const unRelPath = (file) => useRawContent ? file : (0, norm_path_1.normPath)(path_1.default.resolve(commonRoot, file)); // Other helper functions const fileMatchesGlobs = (file, ...globs) => (0, ignore_1.default)().add(globs).ignores(relPath(file)); const filterOutIgnored = (files, ignored) => ignored.filter(files.map(relPath)).map(unRelPath); //*PREPARE FILES AND DATA*// // Prepare list of ignored files const ignored = (0, ignore_1.default)(); ignored.add('.git/'); ignored.add((_c = opts.ignoredFiles) !== null && _c !== void 0 ? _c : []); const regexIgnores = opts.keepVendored ? [] : vendorPaths.map(path => RegExp(path, 'i')); // Load file paths and folders let files; if (useRawContent) { // Uses raw file content files = input; } else { // Uses directory on disc const data = (0, walk_tree_1.default)({ init: true, commonRoot, folderRoots: resolvedInput, folders: resolvedInput, ignored }); files = data.files; } // Fetch and normalise gitattributes data of all subfolders and save to metadata const manualAttributes = {}; // Maps file globs to gitattribute boolean flags const getFlaggedGlobs = (attr, val) => { return Object.entries(manualAttributes).filter(([, attrs]) => attrs[attr] === val).map(([glob,]) => glob); }; const findAttrsForPath = (filePath) => { const resultAttrs = {}; for (const glob in manualAttributes) { if ((0, ignore_1.default)().add(glob).ignores(relPath(filePath))) { const matchingAttrs = manualAttributes[glob]; for (const [attr, val] of Object.entries(matchingAttrs)) { if (val !== null) resultAttrs[attr] = val; } } } if (!JSON.stringify(resultAttrs)) { return null; } return resultAttrs; }; if (!useRawContent && opts.checkAttributes) { const nestedAttrFiles = files.filter(file => file.endsWith('.gitattributes')); for (const attrFile of nestedAttrFiles) { const relAttrFile = relPath(attrFile); const relAttrFolder = path_1.default.dirname(relAttrFile); const contents = await (0, read_file_1.default)(attrFile); const parsed = (0, parse_gitattributes_1.default)(contents, relAttrFolder); for (const { glob, attrs } of parsed) { manualAttributes[glob] = attrs; } } } // Remove files that are linguist-ignored via regex by default unless explicitly unignored in gitattributes const filesToIgnore = []; for (const file of files) { const relFile = relPath(file); const isRegexIgnored = regexIgnores.some(pattern => pattern.test(relFile)); if (!isRegexIgnored) { // Checking overrides is moot if file is not even marked as ignored by default continue; } const fileAttrs = findAttrsForPath(file); if ((fileAttrs === null || fileAttrs === void 0 ? void 0 : fileAttrs.generated) === false || (fileAttrs === null || fileAttrs === void 0 ? void 0 : fileAttrs.vendored) === false) { // File is explicitly marked as *not* to be ignored // do nothing } else { filesToIgnore.push(file); } } files = files.filter(file => !filesToIgnore.includes(file)); // Apply vendor file path matches and filter out vendored files if (!opts.keepVendored) { // Get data of files that have been manually marked with metadata const vendorTrueGlobs = [...getFlaggedGlobs('vendored', true), ...getFlaggedGlobs('generated', true), ...getFlaggedGlobs('documentation', true)]; const vendorFalseGlobs = [...getFlaggedGlobs('vendored', false), ...getFlaggedGlobs('generated', false), ...getFlaggedGlobs('documentation', false)]; // Set up glob ignore object to use for expanding globs to match files const vendorTrueIgnore = (0, ignore_1.default)().add(vendorTrueGlobs); const vendorFalseIgnore = (0, ignore_1.default)().add(vendorFalseGlobs); // Remove all files marked as vendored by default const excludedFiles = files.filter(file => vendorPaths.some(pathPtn => RegExp(pathPtn, 'i').test(relPath(file)))); files = files.filter(file => !excludedFiles.includes(file)); // Re-add removed files that are overridden manually in gitattributes const overriddenExcludedFiles = excludedFiles.filter(file => vendorFalseIgnore.ignores(relPath(file))); files.push(...overriddenExcludedFiles); // Remove files explicitly marked as vendored in gitattributes files = files.filter(file => !vendorTrueIgnore.ignores(relPath(file))); } // Filter out binary files if (!opts.keepBinary) { // Filter out files that are binary by default files = files.filter(file => !binary_extensions_1.default.some(ext => file.endsWith('.' + ext))); // Filter out manually specified binary files const binaryIgnored = (0, ignore_1.default)().add(getFlaggedGlobs('binary', true)); files = filterOutIgnored(files, binaryIgnored); // Re-add files manually marked not as binary const binaryUnignored = (0, ignore_1.default)().add(getFlaggedGlobs('binary', false)); const unignoredList = filterOutIgnored(files, binaryUnignored); files.push(...unignoredList); } // Ignore specific languages for (const lang of (_d = opts.ignoredLanguages) !== null && _d !== void 0 ? _d : []) { for (const key in langData) { if (lang.toLowerCase() === key.toLowerCase()) { delete langData[key]; break; } } } // Establish language overrides taken from gitattributes const forcedLangs = Object.entries(manualAttributes).filter(([, attrs]) => attrs.language); for (const [globPath, attrs] of forcedLangs) { let forcedLang = attrs.language; if (!forcedLang) continue; // If specified language is an alias, associate it with its full name if (!langData[forcedLang]) { const overrideLang = Object.entries(langData).find(entry => { var _a; return (_a = entry[1].aliases) === null || _a === void 0 ? void 0 : _a.includes(forcedLang.toLowerCase()); }); if (overrideLang) { forcedLang = overrideLang[0]; } } globOverrides[globPath] = forcedLang; } //*PARSE LANGUAGES*// const addResult = (file, result) => { if (!fileAssociations[file]) { fileAssociations[file] = []; extensions[file] = ''; } // Set parent to result group if it is present // Is nullish if either `opts.childLanguages` is set or if there is no group const finalResult = !opts.childLanguages && result && langData[result] && langData[result].group || result; if (!fileAssociations[file].includes(finalResult)) { fileAssociations[file].push(finalResult); } extensions[file] = path_1.default.extname(file).toLowerCase(); }; const definiteness = {}; const fromShebang = {}; fileLoop: for (const file of files) { // Check manual override for (const globMatch in globOverrides) { if (!fileMatchesGlobs(file, globMatch)) continue; // If the given file matches the glob, apply the override to the file const forcedLang = globOverrides[globMatch]; addResult(file, forcedLang); definiteness[file] = true; continue fileLoop; // no need to check other heuristics, the classified language has been found } // Check first line for readability let firstLine; if (useRawContent) { firstLine = (_f = (_e = manualFileContent[files.indexOf(file)]) === null || _e === void 0 ? void 0 : _e.split('\n')[0]) !== null && _f !== void 0 ? _f : null; } else if (fs_1.default.existsSync(file) && !fs_1.default.lstatSync(file).isDirectory()) { firstLine = await (0, read_file_1.default)(file, true).catch(() => null); } else continue; // Skip if file is unreadable or blank if (firstLine === null) continue; // Check first line for explicit classification const hasShebang = opts.checkShebang && /^#!/.test(firstLine); const hasModeline = opts.checkModeline && /-\*-|(syntax|filetype|ft)\s*=/.test(firstLine); if (!opts.quick && (hasShebang || hasModeline)) { const matches = []; for (const [lang, data] of Object.entries(langData)) { const langMatcher = (lang) => `\\b${lang.toLowerCase().replace(/\W/g, '\\$&')}(?![\\w#+*]|-\*-)`; // Check for interpreter match if (opts.checkShebang && hasShebang) { const matchesInterpretor = (_g = data.interpreters) === null || _g === void 0 ? void 0 : _g.some(interpreter => firstLine.match(`\\b${interpreter}\\b`)); if (matchesInterpretor) matches.push(lang); } // Check modeline declaration if (opts.checkModeline && hasModeline) { const modelineText = firstLine.toLowerCase().replace(/^.*-\*-(.+)-\*-.*$/, '$1'); const matchesLang = modelineText.match(langMatcher(lang)); const matchesAlias = (_h = data.aliases) === null || _h === void 0 ? void 0 : _h.some(lang => modelineText.match(langMatcher(lang))); if (matchesLang || matchesAlias) matches.push(lang); } } // Add identified language(s) if (matches.length) { for (const match of matches) addResult(file, match); if (matches.length === 1) definiteness[file] = true; fromShebang[file] = true; continue; } } // Search each language let skipExts = false; // Check if filename is a match for (const lang in langData) { const matchesName = (_j = langData[lang].filenames) === null || _j === void 0 ? void 0 : _j.some(name => path_1.default.basename(file.toLowerCase()) === name.toLowerCase()); if (matchesName) { addResult(file, lang); skipExts = true; } } // Check if extension is a match const possibleExts = []; if (!skipExts) for (const lang in langData) { const extMatches = (_k = langData[lang].extensions) === null || _k === void 0 ? void 0 : _k.filter(ext => file.toLowerCase().endsWith(ext.toLowerCase())); if (extMatches === null || extMatches === void 0 ? void 0 : extMatches.length) { for (const ext of extMatches) possibleExts.push({ ext, lang }); } } // Apply more specific extension if available const isComplexExt = (ext) => /\..+\./.test(ext); const hasComplexExt = possibleExts.some(data => isComplexExt(data.ext)); for (const { ext, lang } of possibleExts) { if (hasComplexExt && !isComplexExt(ext)) continue; if (!hasComplexExt && isComplexExt(ext)) continue; addResult(file, lang); } // Fallback to null if no language matches if (!fileAssociations[file]) { addResult(file, null); } } // Narrow down file associations to the best fit for (const file in fileAssociations) { // Skip if file has explicit association if (definiteness[file]) { results.files.results[file] = fileAssociations[file][0]; continue; } // Skip binary files if (!useRawContent && !opts.keepBinary) { if (await (0, isbinaryfile_1.isBinaryFile)(file)) continue; } // Parse heuristics if applicable if (opts.checkHeuristics) for (const heuristics of heuristicsData.disambiguations) { // Make sure the extension matches the current file if (!fromShebang[file] && !heuristics.extensions.includes(extensions[file])) continue; // Load heuristic rules for (const heuristic of heuristics.rules) { // Make sure the language is not an array if (Array.isArray(heuristic.language)) { heuristic.language = heuristic.language[0]; } // Make sure the results includes this language const languageGroup = (_l = langData[heuristic.language]) === null || _l === void 0 ? void 0 : _l.group; const matchesLang = fileAssociations[file].includes(heuristic.language); const matchesParent = languageGroup && fileAssociations[file].includes(languageGroup); if (!matchesLang && !matchesParent) continue; // Normalise heuristic data const patterns = []; const normalise = (contents) => patterns.push(...[contents].flat()); if (heuristic.pattern) normalise(heuristic.pattern); if (heuristic.named_pattern) normalise(heuristicsData.named_patterns[heuristic.named_pattern]); if (heuristic.and) { for (const data of heuristic.and) { if (data.pattern) normalise(data.pattern); if (data.named_pattern) normalise(heuristicsData.named_patterns[data.named_pattern]); } } // Check file contents and apply heuristic patterns const fileContent = opts.fileContent ? manualFileContent[files.indexOf(file)] : await (0, read_file_1.default)(file).catch(() => null); // Skip if file read errors if (fileContent === null) continue; // Apply heuristics if (!patterns.length || patterns.some(pattern => (0, convert_pcre_1.default)(pattern).test(fileContent))) { results.files.results[file] = heuristic.language; break; } } } // If no heuristics, assign a language if (!results.files.results[file]) { const possibleLangs = fileAssociations[file]; // Assign first language as a default option const defaultLang = possibleLangs[0]; const alternativeLangs = possibleLangs.slice(1); results.files.results[file] = defaultLang; // List alternative languages if there are any if (alternativeLangs.length > 0) results.files.alternatives[file] = alternativeLangs; } } // Skip specified categories if ((_m = opts.categories) === null || _m === void 0 ? void 0 : _m.length) { const categories = ['data', 'markup', 'programming', 'prose']; const hiddenCategories = categories.filter(cat => !opts.categories.includes(cat)); for (const [file, lang] of Object.entries(results.files.results)) { // Skip if language is not hidden if (!hiddenCategories.some(cat => { var _a; return lang && ((_a = langData[lang]) === null || _a === void 0 ? void 0 : _a.type) === cat; })) continue; // Skip if language is forced as detectable if (opts.checkDetected) { const detectable = (0, ignore_1.default)().add(getFlaggedGlobs('detectable', true)); if (detectable.ignores(relPath(file))) continue; } // Delete result otherwise delete results.files.results[file]; if (lang) delete results.languages.results[lang]; } for (const category of hiddenCategories) { for (const [lang, { type }] of Object.entries(results.languages.results)) { if (type === category) { delete results.languages.results[lang]; } } } } // Convert paths to relative if (!useRawContent && opts.relativePaths) { const newMap = {}; for (const [file, lang] of Object.entries(results.files.results)) { let relPath = (0, norm_path_1.normPath)(path_1.default.relative(process.cwd(), file)); if (!relPath.startsWith('../')) { relPath = './' + relPath; } newMap[relPath] = lang; } results.files.results = newMap; } // Load language bytes size for (const [file, lang] of Object.entries(results.files.results)) { if (lang && !langData[lang]) continue; // Calculate file size const fileSize = (_p = (_o = manualFileContent[files.indexOf(file)]) === null || _o === void 0 ? void 0 : _o.length) !== null && _p !== void 0 ? _p : fs_1.default.statSync(file).size; // Calculate lines of code const loc = { total: 0, content: 0, code: 0 }; if (opts.calculateLines) { const fileContent = (_r = ((_q = manualFileContent[files.indexOf(file)]) !== null && _q !== void 0 ? _q : fs_1.default.readFileSync(file).toString())) !== null && _r !== void 0 ? _r : ''; const allLines = fileContent.split(/\r?\n/gm); loc.total = allLines.length; loc.content = allLines.filter(line => line.trim().length > 0).length; const codeLines = fileContent .replace(/^\s*(\/\/|# |;|--).+/gm, '') .replace(/\/\*.+\*\/|/sg, ''); loc.code = codeLines.split(/\r?\n/gm).filter(line => line.trim().length > 0).length; } // Apply to files totals results.files.bytes += fileSize; results.files.lines.total += loc.total; results.files.lines.content += loc.content; results.files.lines.code += loc.code; // Add results to 'languages' section if language match found, or 'unknown' section otherwise if (lang) { const { type } = langData[lang]; // set default if unset (_s = (_u = results.languages.results)[lang]) !== null && _s !== void 0 ? _s : (_u[lang] = { type, bytes: 0, lines: { total: 0, content: 0, code: 0 }, color: langData[lang].color }); // apply results to 'languages' section if (opts.childLanguages) { results.languages.results[lang].parent = langData[lang].group; } results.languages.results[lang].bytes += fileSize; results.languages.bytes += fileSize; results.languages.results[lang].lines.total += loc.total; results.languages.results[lang].lines.content += loc.content; results.languages.results[lang].lines.code += loc.code; results.languages.lines.total += loc.total; results.languages.lines.content += loc.content; results.languages.lines.code += loc.code; } else { const ext = path_1.default.extname(file); const unknownType = ext ? 'extensions' : 'filenames'; const name = ext || path_1.default.basename(file); // apply results to 'unknown' section (_t = (_v = results.unknown[unknownType])[name]) !== null && _t !== void 0 ? _t : (_v[name] = 0); results.unknown[unknownType][name] += fileSize; results.unknown.bytes += fileSize; results.unknown.lines.total += loc.total; results.unknown.lines.content += loc.content; results.unknown.lines.code += loc.code; } } // Set lines output to NaN when line calculation is disabled if (opts.calculateLines === false) { results.files.lines = { total: NaN, content: NaN, code: NaN }; } // Set counts results.files.count = Object.keys(results.files.results).length; results.languages.count = Object.keys(results.languages.results).length; results.unknown.count = Object.keys({ ...results.unknown.extensions, ...results.unknown.filenames }).length; // Return return results; } module.exports = analyse;