linguist-js
Version:
Analyse languages used in a folder. Powered by GitHub Linguist, although it doesn't need to be installed.
485 lines (484 loc) • 25.3 kB
JavaScript
"use strict";
/**
 * TypeScript interop helper: re-exports property `key` of `source` on `target`
 * (optionally under the name `alias`).
 *
 * When property descriptors are available, the source descriptor is reused only
 * if it is an accessor on an `__esModule` object, or a data property that is
 * neither writable nor configurable; in every other case an enumerable getter
 * that reads `source[key]` at access time is installed instead. Without
 * `Object.create` the value is simply copied.
 *
 * `var` is kept (rather than `const`) so that repeated helper declarations in
 * concatenated output remain legal.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: plain value copy
        return function (target, source, key, alias) {
            target[alias === undefined ? key : alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; },
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    };
})();
/**
 * TypeScript interop helper: stores `value` as the `default` export of `target`.
 *
 * With descriptor support the property is defined as enumerable (and, by the
 * `Object.defineProperty` defaults, non-writable and non-configurable);
 * otherwise it is a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (!Object.create) {
        return function (target, value) {
            target["default"] = value;
        };
    }
    return function (target, value) {
        var descriptor = { enumerable: true, value: value };
        Object.defineProperty(target, "default", descriptor);
    };
})();
/**
 * TypeScript interop helper emulating `import * as ns from '...'` for CommonJS.
 *
 * Modules flagged `__esModule` are returned untouched. Anything else is wrapped
 * in a fresh namespace object: every own enumerable key except `default` is
 * bound via `__createBinding`, and the module itself becomes the `default`
 * export via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    var alreadyEsModule = Boolean(mod && mod.__esModule);
    if (alreadyEsModule) {
        return mod;
    }
    var namespace = {};
    if (mod !== null && mod !== undefined) {
        for (var key in mod) {
            if (key === "default") {
                continue;
            }
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue; // skip inherited keys
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * TypeScript interop helper emulating `import x from '...'` for CommonJS.
 *
 * Modules flagged `__esModule` are returned as-is; any other value is wrapped
 * so that it is reachable through a `default` property.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const js_yaml_1 = __importDefault(require("js-yaml"));
const ignore_1 = __importDefault(require("ignore"));
const common_path_prefix_1 = __importDefault(require("common-path-prefix"));
const binary_extensions_1 = __importDefault(require("binary-extensions"));
const isbinaryfile_1 = require("isbinaryfile");
const walk_tree_1 = __importDefault(require("./helpers/walk-tree"));
const load_data_1 = __importStar(require("./helpers/load-data"));
const read_file_1 = __importDefault(require("./helpers/read-file"));
const parse_gitattributes_1 = __importDefault(require("./helpers/parse-gitattributes"));
const convert_pcre_1 = __importDefault(require("./helpers/convert-pcre"));
const norm_path_1 = require("./helpers/norm-path");
/**
 * Analyse which languages are used by a set of files or folders.
 *
 * The Linguist data files (`languages.yml`, `vendor.yml`, `documentation.yml`,
 * `heuristics.yml`, `generated.rb`) are loaded through the `load-data` helper,
 * the candidate files are collected and filtered (ignored / vendored / binary),
 * each remaining file is classified (gitattributes override, shebang, modeline,
 * filename, extension, heuristics), and sizes and line counts are aggregated.
 *
 * @param {string|string[]} [rawPaths] Paths to analyse. When `opts.fileContent`
 *   is given these are treated as file names paired by position with the
 *   supplied contents, and nothing is walked on disc.
 * @param {object} [opts] Options read by this function:
 *   - `fileContent` (string|string[]): raw content for each entry of `rawPaths`.
 *   - `quick` (boolean): flips the default of every `check*` option to false.
 *   - `checkIgnored`, `checkDetected`, `checkAttributes`, `checkHeuristics`,
 *     `checkShebang`, `checkModeline` (boolean): default to `!opts.quick`.
 *   - `calculateLines` (boolean): count lines; defaults to true.
 *   - `offline`: forwarded to the data loader.
 *   - `ignoredFiles` (string[]): extra ignore globs.
 *   - `ignoredLanguages` (string[]): language names to drop (case-insensitive).
 *   - `keepVendored`, `keepBinary` (boolean): disable the respective filter.
 *   - `childLanguages` (boolean): report languages themselves instead of their group.
 *   - `categories` (string[]): language types to keep.
 *   - `relativePaths` (boolean): key file results relative to `process.cwd()`.
 * @returns {Promise<object>} An object with `files`, `languages` and `unknown`
 *   sections, each carrying `count`, `bytes` and `lines` totals.
 */
async function analyse(rawPaths, opts = {}) {
    const useRawContent = opts.fileContent !== undefined;
    const input = [rawPaths != null ? rawPaths : []].flat();
    const manualFileContent = [opts.fileContent != null ? opts.fileContent : []].flat();
    // Pair every raw-content file name with its content up front. Looking the
    // content up by position in `files` is unreliable because `files` is
    // filtered further down; the first occurrence of a name wins.
    const manualContentByFile = new Map();
    if (useRawContent) {
        input.forEach((file, index) => {
            if (!manualContentByFile.has(file)) {
                manualContentByFile.set(file, manualFileContent[index]);
            }
        });
    }
    // Normalise input option arguments
    opts = {
        calculateLines: opts.calculateLines != null ? opts.calculateLines : true, // default to true if unset
        checkIgnored: !opts.quick,
        checkDetected: !opts.quick,
        checkAttributes: !opts.quick,
        checkHeuristics: !opts.quick,
        checkShebang: !opts.quick,
        checkModeline: !opts.quick,
        ...opts,
    };
    // Load data from github-linguist web repo
    const langData = await (0, load_data_1.default)('languages.yml', opts.offline).then(js_yaml_1.default.load);
    const vendorData = await (0, load_data_1.default)('vendor.yml', opts.offline).then(js_yaml_1.default.load);
    const docData = await (0, load_data_1.default)('documentation.yml', opts.offline).then(js_yaml_1.default.load);
    const heuristicsData = await (0, load_data_1.default)('heuristics.yml', opts.offline).then(js_yaml_1.default.load);
    const generatedData = await (0, load_data_1.default)('generated.rb', opts.offline).then(load_data_1.parseGeneratedDataFile);
    const vendorPaths = [...vendorData, ...docData, ...generatedData];
    // Setup main variables
    const fileAssociations = {};
    const extensions = {};
    const globOverrides = {};
    const results = {
        files: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, results: {}, alternatives: {} },
        languages: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, results: {} },
        unknown: { count: 0, bytes: 0, lines: { total: 0, content: 0, code: 0 }, extensions: {}, filenames: {} },
    };
    // Set a common root path so that vendor paths do not incorrectly match parent folders
    const resolvedInput = input.map(path => (0, norm_path_1.normPath)(path_1.default.resolve(path)));
    const commonRoot = (input.length > 1 ? (0, common_path_prefix_1.default)(resolvedInput) : resolvedInput[0]).replace(/\/?$/, '');
    const relPath = (file) => useRawContent ? file : (0, norm_path_1.normPath)(path_1.default.relative(commonRoot, file));
    const unRelPath = (file) => useRawContent ? file : (0, norm_path_1.normPath)(path_1.default.resolve(commonRoot, file));
    // Other helper functions
    const fileMatchesGlobs = (file, ...globs) => (0, ignore_1.default)().add(globs).ignores(relPath(file));
    //*PREPARE FILES AND DATA*//
    // Prepare list of ignored files
    const ignored = (0, ignore_1.default)();
    ignored.add('.git/');
    ignored.add(opts.ignoredFiles != null ? opts.ignoredFiles : []);
    const regexIgnores = opts.keepVendored ? [] : vendorPaths.map(path => RegExp(path, 'i'));
    // Load file paths and folders
    let files;
    if (useRawContent) {
        // Uses raw file content
        files = input;
    }
    else {
        // Uses directory on disc
        const data = (0, walk_tree_1.default)({ init: true, commonRoot, folderRoots: resolvedInput, folders: resolvedInput, ignored });
        files = data.files;
    }
    // Fetch and normalise gitattributes data of all subfolders and save to metadata
    const manualAttributes = {}; // Maps file globs to gitattribute boolean flags
    const getFlaggedGlobs = (attr, val) => {
        return Object.entries(manualAttributes).filter(([, attrs]) => attrs[attr] === val).map(([glob,]) => glob);
    };
    // Merges the attributes of every glob matching the file (later globs win);
    // returns null when no attribute applies
    const findAttrsForPath = (filePath) => {
        const resultAttrs = {};
        for (const glob in manualAttributes) {
            if ((0, ignore_1.default)().add(glob).ignores(relPath(filePath))) {
                const matchingAttrs = manualAttributes[glob];
                for (const [attr, val] of Object.entries(matchingAttrs)) {
                    if (val !== null)
                        resultAttrs[attr] = val;
                }
            }
        }
        // `JSON.stringify({})` is the truthy string "{}", so emptiness has to be
        // checked on the keys
        if (Object.keys(resultAttrs).length === 0) {
            return null;
        }
        return resultAttrs;
    };
    if (!useRawContent && opts.checkAttributes) {
        const nestedAttrFiles = files.filter(file => file.endsWith('.gitattributes'));
        for (const attrFile of nestedAttrFiles) {
            const relAttrFile = relPath(attrFile);
            const relAttrFolder = path_1.default.dirname(relAttrFile);
            const contents = await (0, read_file_1.default)(attrFile);
            const parsed = (0, parse_gitattributes_1.default)(contents, relAttrFolder);
            for (const { glob, attrs } of parsed) {
                manualAttributes[glob] = attrs;
            }
        }
    }
    // Remove files that are linguist-ignored via regex by default unless explicitly unignored in gitattributes
    const filesToIgnore = new Set();
    for (const file of files) {
        const relFile = relPath(file);
        const isRegexIgnored = regexIgnores.some(pattern => pattern.test(relFile));
        if (!isRegexIgnored) {
            // Checking overrides is moot if file is not even marked as ignored by default
            continue;
        }
        const fileAttrs = findAttrsForPath(file);
        // `documentation` is honoured alongside `generated`/`vendored` because the
        // default-ignore patterns include the documentation list and the override
        // step below accepts all three flags
        const isExplicitlyKept = fileAttrs !== null
            && (fileAttrs.generated === false || fileAttrs.vendored === false || fileAttrs.documentation === false);
        if (!isExplicitlyKept) {
            filesToIgnore.add(file);
        }
    }
    files = files.filter(file => !filesToIgnore.has(file));
    // Apply vendor file path matches and filter out vendored files
    if (!opts.keepVendored) {
        // Get data of files that have been manually marked with metadata
        const vendorTrueGlobs = [...getFlaggedGlobs('vendored', true), ...getFlaggedGlobs('generated', true), ...getFlaggedGlobs('documentation', true)];
        const vendorFalseGlobs = [...getFlaggedGlobs('vendored', false), ...getFlaggedGlobs('generated', false), ...getFlaggedGlobs('documentation', false)];
        // Set up glob ignore object to use for expanding globs to match files
        const vendorTrueIgnore = (0, ignore_1.default)().add(vendorTrueGlobs);
        const vendorFalseIgnore = (0, ignore_1.default)().add(vendorFalseGlobs);
        // Remove all files marked as vendored by default
        // (`regexIgnores` holds the precompiled vendor patterns in this branch)
        const excludedFiles = files.filter(file => {
            const relFile = relPath(file);
            return regexIgnores.some(pattern => pattern.test(relFile));
        });
        const excludedSet = new Set(excludedFiles);
        files = files.filter(file => !excludedSet.has(file));
        // Re-add removed files that are overridden manually in gitattributes
        const overriddenExcludedFiles = excludedFiles.filter(file => vendorFalseIgnore.ignores(relPath(file)));
        files.push(...overriddenExcludedFiles);
        // Remove files explicitly marked as vendored in gitattributes
        files = files.filter(file => !vendorTrueIgnore.ignores(relPath(file)));
    }
    // Filter out binary files
    if (!opts.keepBinary) {
        const binaryIgnored = (0, ignore_1.default)().add(getFlaggedGlobs('binary', true));
        const binaryUnignored = (0, ignore_1.default)().add(getFlaggedGlobs('binary', false));
        const keptFiles = [];
        for (const file of files) {
            const relFile = relPath(file);
            // Binary either by well-known extension or by an explicit `binary` flag
            const isFlaggedBinary = binary_extensions_1.default.some(ext => file.endsWith('.' + ext))
                || binaryIgnored.ignores(relFile);
            // A file manually marked as *not* binary survives the filter
            if (!isFlaggedBinary || binaryUnignored.ignores(relFile)) {
                // Round-trip through the relative form so the resulting paths are
                // normalised the same way as by ignore-based filtering
                keptFiles.push(unRelPath(relFile));
            }
        }
        files = keptFiles;
    }
    // Ignore specific languages
    for (const lang of (opts.ignoredLanguages != null ? opts.ignoredLanguages : [])) {
        for (const key in langData) {
            if (lang.toLowerCase() === key.toLowerCase()) {
                delete langData[key];
                break;
            }
        }
    }
    // Establish language overrides taken from gitattributes
    for (const [globPath, attrs] of Object.entries(manualAttributes)) {
        let forcedLang = attrs.language;
        if (!forcedLang)
            continue;
        // If specified language is an alias, associate it with its full name
        if (!langData[forcedLang]) {
            const overrideLang = Object.entries(langData).find(entry => {
                const aliases = entry[1].aliases;
                return aliases != null && aliases.includes(forcedLang.toLowerCase());
            });
            if (overrideLang) {
                forcedLang = overrideLang[0];
            }
        }
        globOverrides[globPath] = forcedLang;
    }
    //*PARSE LANGUAGES*//
    const addResult = (file, result) => {
        if (!fileAssociations[file]) {
            fileAssociations[file] = [];
            extensions[file] = '';
        }
        // Set parent to result group if it is present
        // Is nullish if either `opts.childLanguages` is set or if there is no group
        const finalResult = !opts.childLanguages && result && langData[result] && langData[result].group || result;
        if (!fileAssociations[file].includes(finalResult)) {
            fileAssociations[file].push(finalResult);
        }
        extensions[file] = path_1.default.extname(file).toLowerCase();
    };
    // Builds the regex source used to find a language name inside a modeline
    const langMatcher = (lang) => `\\b${lang.toLowerCase().replace(/\W/g, '\\$&')}(?![\\w#+*]|-\*-)`;
    // An extension is "complex" when it consists of several dot-separated parts
    const isComplexExt = (ext) => /\..+\./.test(ext);
    const definiteness = {};
    const fromShebang = {};
    fileLoop: for (const file of files) {
        // Check manual override
        for (const globMatch in globOverrides) {
            if (!fileMatchesGlobs(file, globMatch))
                continue;
            // If the given file matches the glob, apply the override to the file
            const forcedLang = globOverrides[globMatch];
            addResult(file, forcedLang);
            definiteness[file] = true;
            continue fileLoop; // no need to check other heuristics, the classified language has been found
        }
        // Check first line for readability
        let firstLine;
        if (useRawContent) {
            const rawContent = manualContentByFile.get(file);
            firstLine = rawContent != null ? rawContent.split('\n')[0] : null;
        }
        else if (fs_1.default.existsSync(file) && !fs_1.default.lstatSync(file).isDirectory()) {
            firstLine = await (0, read_file_1.default)(file, true).catch(() => null);
        }
        else
            continue;
        // Skip if file is unreadable or blank
        if (firstLine === null)
            continue;
        // Check first line for explicit classification
        const hasShebang = opts.checkShebang && /^#!/.test(firstLine);
        const hasModeline = opts.checkModeline && /-\*-|(syntax|filetype|ft)\s*=/.test(firstLine);
        if (!opts.quick && (hasShebang || hasModeline)) {
            const matches = [];
            // The modeline text does not depend on the language, so extract it once
            const modelineText = hasModeline ? firstLine.toLowerCase().replace(/^.*-\*-(.+)-\*-.*$/, '$1') : '';
            for (const [lang, data] of Object.entries(langData)) {
                // Check for interpreter match
                if (hasShebang) {
                    const interpreters = data.interpreters;
                    const matchesInterpreter = interpreters != null
                        && interpreters.some(interpreter => firstLine.match(`\\b${interpreter}\\b`));
                    if (matchesInterpreter)
                        matches.push(lang);
                }
                // Check modeline declaration
                if (hasModeline) {
                    const aliases = data.aliases;
                    const matchesLang = modelineText.match(langMatcher(lang));
                    const matchesAlias = aliases != null
                        && aliases.some(alias => modelineText.match(langMatcher(alias)));
                    if (matchesLang || matchesAlias)
                        matches.push(lang);
                }
            }
            // Add identified language(s)
            if (matches.length) {
                for (const match of matches)
                    addResult(file, match);
                if (matches.length === 1)
                    definiteness[file] = true;
                fromShebang[file] = true;
                continue;
            }
        }
        // Search each language
        const lowerFile = file.toLowerCase();
        const lowerBasename = path_1.default.basename(lowerFile);
        let skipExts = false;
        // Check if filename is a match
        for (const lang in langData) {
            const filenames = langData[lang].filenames;
            const matchesName = filenames != null && filenames.some(name => lowerBasename === name.toLowerCase());
            if (matchesName) {
                addResult(file, lang);
                skipExts = true;
            }
        }
        // Check if extension is a match
        const possibleExts = [];
        if (!skipExts) {
            for (const lang in langData) {
                const langExts = langData[lang].extensions;
                if (langExts == null)
                    continue;
                for (const ext of langExts) {
                    if (lowerFile.endsWith(ext.toLowerCase()))
                        possibleExts.push({ ext, lang });
                }
            }
        }
        // Apply more specific extension if available
        const hasComplexExt = possibleExts.some(data => isComplexExt(data.ext));
        for (const { ext, lang } of possibleExts) {
            // Only keep candidates of the same specificity as the best one found
            if (hasComplexExt !== isComplexExt(ext))
                continue;
            addResult(file, lang);
        }
        // Fallback to null if no language matches
        if (!fileAssociations[file]) {
            addResult(file, null);
        }
    }
    // Narrow down file associations to the best fit
    for (const file in fileAssociations) {
        // Skip if file has explicit association
        if (definiteness[file]) {
            results.files.results[file] = fileAssociations[file][0];
            continue;
        }
        // Skip binary files
        if (!useRawContent && !opts.keepBinary) {
            if (await (0, isbinaryfile_1.isBinaryFile)(file))
                continue;
        }
        // Content for heuristics is fetched lazily and at most once per file
        let contentLoaded = false;
        let cachedContent = null;
        const getHeuristicContent = async () => {
            if (!contentLoaded) {
                cachedContent = useRawContent
                    ? manualContentByFile.get(file)
                    : await (0, read_file_1.default)(file).catch(() => null);
                contentLoaded = true;
            }
            return cachedContent;
        };
        // Parse heuristics if applicable
        if (opts.checkHeuristics) {
            for (const heuristics of heuristicsData.disambiguations) {
                // Make sure the extension matches the current file
                if (!fromShebang[file] && !heuristics.extensions.includes(extensions[file]))
                    continue;
                // Load heuristic rules
                for (const heuristic of heuristics.rules) {
                    // Make sure the language is not an array
                    const heuristicLang = Array.isArray(heuristic.language) ? heuristic.language[0] : heuristic.language;
                    // Make sure the results includes this language
                    const heuristicLangData = langData[heuristicLang];
                    const languageGroup = heuristicLangData != null ? heuristicLangData.group : undefined;
                    const matchesLang = fileAssociations[file].includes(heuristicLang);
                    const matchesParent = languageGroup && fileAssociations[file].includes(languageGroup);
                    if (!matchesLang && !matchesParent)
                        continue;
                    // Normalise heuristic data
                    const patterns = [];
                    const normalise = (contents) => patterns.push(...[contents].flat());
                    if (heuristic.pattern)
                        normalise(heuristic.pattern);
                    if (heuristic.named_pattern)
                        normalise(heuristicsData.named_patterns[heuristic.named_pattern]);
                    if (heuristic.and) {
                        for (const data of heuristic.and) {
                            if (data.pattern)
                                normalise(data.pattern);
                            if (data.named_pattern)
                                normalise(heuristicsData.named_patterns[data.named_pattern]);
                        }
                    }
                    // Check file contents and apply heuristic patterns
                    const fileContent = await getHeuristicContent();
                    // Skip if file read errors
                    if (fileContent === null)
                        continue;
                    // Apply heuristics
                    if (!patterns.length || patterns.some(pattern => (0, convert_pcre_1.default)(pattern).test(fileContent))) {
                        results.files.results[file] = heuristicLang;
                        break;
                    }
                }
            }
        }
        // If no heuristics, assign a language
        if (!results.files.results[file]) {
            const possibleLangs = fileAssociations[file];
            // Assign first language as a default option
            const defaultLang = possibleLangs[0];
            const alternativeLangs = possibleLangs.slice(1);
            results.files.results[file] = defaultLang;
            // List alternative languages if there are any
            if (alternativeLangs.length > 0)
                results.files.alternatives[file] = alternativeLangs;
        }
    }
    // Skip specified categories
    if (opts.categories != null && opts.categories.length) {
        const categories = ['data', 'markup', 'programming', 'prose'];
        const hiddenCategories = categories.filter(cat => !opts.categories.includes(cat));
        // Files force-marked as detectable stay visible; build the matcher once
        const detectable = opts.checkDetected
            ? (0, ignore_1.default)().add(getFlaggedGlobs('detectable', true))
            : null;
        for (const [file, lang] of Object.entries(results.files.results)) {
            // Skip if language is not hidden
            const langType = lang && langData[lang] != null ? langData[lang].type : undefined;
            if (!lang || !hiddenCategories.includes(langType))
                continue;
            // Skip if language is forced as detectable
            if (detectable !== null && detectable.ignores(relPath(file)))
                continue;
            // Delete result otherwise
            delete results.files.results[file];
            delete results.languages.results[lang];
        }
        for (const category of hiddenCategories) {
            for (const [lang, { type }] of Object.entries(results.languages.results)) {
                if (type === category) {
                    delete results.languages.results[lang];
                }
            }
        }
    }
    // Convert paths to relative
    if (!useRawContent && opts.relativePaths) {
        const relativeResults = {};
        for (const [file, lang] of Object.entries(results.files.results)) {
            let relFile = (0, norm_path_1.normPath)(path_1.default.relative(process.cwd(), file));
            if (!relFile.startsWith('../')) {
                relFile = './' + relFile;
            }
            relativeResults[relFile] = lang;
        }
        results.files.results = relativeResults;
    }
    // Load language bytes size
    for (const [file, lang] of Object.entries(results.files.results)) {
        if (lang && !langData[lang])
            continue;
        const rawContent = manualContentByFile.get(file);
        // Calculate file size
        const fileSize = rawContent != null ? rawContent.length : fs_1.default.statSync(file).size;
        // Calculate lines of code
        const loc = { total: 0, content: 0, code: 0 };
        if (opts.calculateLines) {
            const fileContent = rawContent != null ? rawContent : fs_1.default.readFileSync(file).toString();
            const allLines = fileContent.split(/\r?\n/gm);
            loc.total = allLines.length;
            loc.content = allLines.filter(line => line.trim().length > 0).length;
            // Rough comment stripping: whole-line `//`, `# `, `;`, `--` comments and
            // `/* */` / `<!-- -->` spans
            const codeLines = fileContent
                .replace(/^\s*(\/\/|# |;|--).+/gm, '')
                .replace(/\/\*.+\*\/|<!--.+-->/sg, '');
            loc.code = codeLines.split(/\r?\n/gm).filter(line => line.trim().length > 0).length;
        }
        // Apply to files totals
        results.files.bytes += fileSize;
        results.files.lines.total += loc.total;
        results.files.lines.content += loc.content;
        results.files.lines.code += loc.code;
        // Add results to 'languages' section if language match found, or 'unknown' section otherwise
        if (lang) {
            const { type } = langData[lang];
            // set default if unset
            if (results.languages.results[lang] == null) {
                results.languages.results[lang] = { type, bytes: 0, lines: { total: 0, content: 0, code: 0 }, color: langData[lang].color };
            }
            const langResult = results.languages.results[lang];
            // apply results to 'languages' section
            if (opts.childLanguages) {
                langResult.parent = langData[lang].group;
            }
            langResult.bytes += fileSize;
            results.languages.bytes += fileSize;
            langResult.lines.total += loc.total;
            langResult.lines.content += loc.content;
            langResult.lines.code += loc.code;
            results.languages.lines.total += loc.total;
            results.languages.lines.content += loc.content;
            results.languages.lines.code += loc.code;
        }
        else {
            const ext = path_1.default.extname(file);
            const unknownType = ext ? 'extensions' : 'filenames';
            const name = ext || path_1.default.basename(file);
            // apply results to 'unknown' section
            const unknownBucket = results.unknown[unknownType];
            if (unknownBucket[name] == null) {
                unknownBucket[name] = 0;
            }
            unknownBucket[name] += fileSize;
            results.unknown.bytes += fileSize;
            results.unknown.lines.total += loc.total;
            results.unknown.lines.content += loc.content;
            results.unknown.lines.code += loc.code;
        }
    }
    // Set lines output to NaN when line calculation is disabled
    if (opts.calculateLines === false) {
        results.files.lines = { total: NaN, content: NaN, code: NaN };
    }
    // Set counts
    results.files.count = Object.keys(results.files.results).length;
    results.languages.count = Object.keys(results.languages.results).length;
    results.unknown.count = Object.keys({ ...results.unknown.extensions, ...results.unknown.filenames }).length;
    // Return
    return results;
}
// CommonJS export: the module's export object is the `analyse` function itself.
module.exports = analyse;