linguist-js
Version:
Analyse the programming languages used in a folder or from raw content, using the same rules that GitHub Linguist does.
471 lines (470 loc) • 22.4 kB
JavaScript
import commonPrefix from 'common-path-prefix';
import ignore from 'ignore';
import { isBinaryFile } from 'isbinaryfile';
import YAML from 'js-yaml';
import FS from 'node:fs';
import Path from 'node:path';
import loadFile, { parseGeneratedDataFile } from './program/data/loadData.js';
import { normPath } from './program/fs/normalisedPath.js';
import readFileChunk from './program/fs/readFile.js';
import walkTree from './program/fs/walkTree.js';
import parseGitattributes from './program/parsing/parseGitattributes.js';
import pcre from './program/utils/pcre.js';
const binaryData = JSON.parse(FS.readFileSync(new URL('../node_modules/binary-extensions/binary-extensions.json', import.meta.url), 'utf-8'));
async function analyse(rawInput, opts = {}) {
const inputs = {
path: typeof rawInput === 'string' ? rawInput : null,
paths: Array.isArray(rawInput) ? rawInput : null,
content: typeof rawInput === 'object' && !Array.isArray(rawInput) ? rawInput : null,
};
const inputPaths = inputs.paths ?? (inputs.path ? [inputs.path] : null);
const inputContent = inputs.content;
const useRawContent = inputContent !== null;
const input = useRawContent ? Object.keys(inputContent) : (inputPaths ?? []);
// Normalise input option arguments
opts = {
calculateLines: opts.calculateLines ?? true, // default to true if unset
checkIgnored: !opts.quick,
checkDetected: !opts.quick,
checkAttributes: !opts.quick,
checkHeuristics: !opts.quick,
checkShebang: !opts.quick,
checkModeline: !opts.quick,
...opts,
};
// Load data from github-linguist web repo
const langData = await loadFile('languages.yml', opts.offline).then(YAML.load);
const vendorData = await loadFile('vendor.yml', opts.offline).then(YAML.load);
const docData = await loadFile('documentation.yml', opts.offline).then(YAML.load);
const heuristicsData = await loadFile('heuristics.yml', opts.offline).then(YAML.load);
const generatedData = await loadFile('generated.rb', opts.offline).then(parseGeneratedDataFile);
const vendorPaths = [...vendorData, ...docData, ...generatedData];
// Setup main variables
const fileAssociations = {};
const extensions = {};
const globOverrides = {};
const results = {
files: { count: 0, bytes: 0, lines: { total: 0, content: 0 }, results: {}, alternatives: {} },
languages: { count: 0, bytes: 0, lines: { total: 0, content: 0 }, results: {} },
unknown: { count: 0, bytes: 0, lines: { total: 0, content: 0 }, extensions: {}, filenames: {} },
repository: {},
};
// Set a common root path so that vendor paths do not incorrectly match parent folders
const resolvedInput = input.map((path) => normPath(Path.resolve(path)));
const commonRoot = (input.length > 1 ? commonPrefix(resolvedInput) : resolvedInput[0]).replace(/\/?$/, '');
const relPath = (file) => (useRawContent ? file : normPath(Path.relative(commonRoot, file)));
const unRelPath = (file) => (useRawContent ? file : normPath(Path.resolve(commonRoot, file)));
// Other helper functions
const fileMatchesGlobs = (file, ...globs) => ignore().add(globs).ignores(relPath(file));
const filterOutIgnored = (files, ignored) => ignored.filter(files.map(relPath)).map(unRelPath);
//*PREPARE FILES AND DATA*//
// Prepare list of ignored files
const ignored = ignore();
ignored.add('.git/');
ignored.add(opts.ignoredFiles ?? []);
const regexIgnores = opts.keepVendored ? [] : vendorPaths.map((path) => RegExp(path, 'i'));
// Load file paths and folders
let files;
if (useRawContent) {
// Uses raw file content
files = input;
}
else {
// Uses directory on disc
const data = walkTree({ init: true, commonRoot, folderRoots: resolvedInput, folders: resolvedInput, ignored });
files = data.files;
}
// Fetch and normalise gitattributes data of all subfolders and save to metadata
const manualAttributes = {}; // Maps file globs to gitattribute boolean flags
const getFlaggedGlobs = (attr, val) => {
return Object.entries(manualAttributes)
.filter(([, attrs]) => attrs[attr] === val)
.map(([glob]) => glob);
};
const findAttrsForPath = (filePath) => {
const resultAttrs = {};
for (const glob in manualAttributes) {
if (ignore().add(glob).ignores(relPath(filePath))) {
const matchingAttrs = manualAttributes[glob];
for (const [attr, val] of Object.entries(matchingAttrs)) {
if (val !== null)
resultAttrs[attr] = val;
}
}
}
if (!JSON.stringify(resultAttrs)) {
return null;
}
return resultAttrs;
};
if (!useRawContent && opts.checkAttributes) {
const nestedAttrFiles = files.filter((file) => file.endsWith('.gitattributes'));
for (const attrFile of nestedAttrFiles) {
const relAttrFile = relPath(attrFile);
const relAttrFolder = Path.dirname(relAttrFile);
const contents = await readFileChunk(attrFile);
const parsed = parseGitattributes(contents, relAttrFolder);
for (const { glob, attrs } of parsed) {
manualAttributes[glob] = attrs;
}
}
}
// Remove files that are linguist-ignored via regex by default unless explicitly unignored in gitattributes
const filesToIgnore = [];
for (const file of files) {
const relFile = relPath(file);
const isRegexIgnored = regexIgnores.some((pattern) => pattern.test(relFile));
if (!isRegexIgnored) {
// Checking overrides is moot if file is not even marked as ignored by default
continue;
}
const fileAttrs = findAttrsForPath(file);
if (fileAttrs?.generated === false || fileAttrs?.vendored === false) {
// File is explicitly marked as *not* to be ignored
// do nothing
}
else {
filesToIgnore.push(file);
}
}
files = files.filter((file) => !filesToIgnore.includes(file));
// Apply vendor file path matches and filter out vendored files
if (!opts.keepVendored) {
// Get data of files that have been manually marked with metadata
const vendorTrueGlobs = [
...getFlaggedGlobs('vendored', true),
...getFlaggedGlobs('generated', true),
...getFlaggedGlobs('documentation', true),
];
const vendorFalseGlobs = [
...getFlaggedGlobs('vendored', false),
...getFlaggedGlobs('generated', false),
...getFlaggedGlobs('documentation', false),
];
// Set up glob ignore object to use for expanding globs to match files
const vendorTrueIgnore = ignore().add(vendorTrueGlobs);
const vendorFalseIgnore = ignore().add(vendorFalseGlobs);
// Remove all files marked as vendored by default
const excludedFiles = files.filter((file) => vendorPaths.some((pathPtn) => RegExp(pathPtn, 'i').test(relPath(file))));
files = files.filter((file) => !excludedFiles.includes(file));
// Re-add removed files that are overridden manually in gitattributes
const overriddenExcludedFiles = excludedFiles.filter((file) => vendorFalseIgnore.ignores(relPath(file)));
files.push(...overriddenExcludedFiles);
// Remove files explicitly marked as vendored in gitattributes
files = files.filter((file) => !vendorTrueIgnore.ignores(relPath(file)));
}
// Filter out binary files
if (!opts.keepBinary) {
// Filter out files that are binary by default
files = files.filter((file) => !binaryData.some((ext) => file.endsWith('.' + ext)));
// Filter out manually specified binary files
const binaryIgnored = ignore().add(getFlaggedGlobs('binary', true));
files = filterOutIgnored(files, binaryIgnored);
// Re-add files manually marked not as binary
const binaryUnignored = ignore().add(getFlaggedGlobs('binary', false));
const unignoredList = filterOutIgnored(files, binaryUnignored);
files.push(...unignoredList);
}
// Ignore specific languages
for (const lang of opts.ignoredLanguages ?? []) {
for (const key in langData) {
if (lang.toLowerCase() === key.toLowerCase()) {
delete langData[key];
break;
}
}
}
// Establish language overrides taken from gitattributes
const forcedLangs = Object.entries(manualAttributes).filter(([, attrs]) => attrs.language);
for (const [globPath, attrs] of forcedLangs) {
let forcedLang = attrs.language;
if (!forcedLang)
continue;
// If specified language is an alias, associate it with its full name
if (!langData[forcedLang]) {
const overrideLang = Object.entries(langData).find((entry) => entry[1].aliases?.includes(forcedLang.toLowerCase()));
if (overrideLang) {
forcedLang = overrideLang[0];
}
}
globOverrides[globPath] = forcedLang;
}
//*PARSE LANGUAGES*//
const addResult = (file, result) => {
if (!fileAssociations[file]) {
fileAssociations[file] = [];
extensions[file] = '';
}
// Set parent to result group if it is present
// Is nullish if either `opts.childLanguages` is set or if there is no group
const finalResult = (!opts.childLanguages && result && langData[result] && langData[result].group) || result;
if (!fileAssociations[file].includes(finalResult)) {
fileAssociations[file].push(finalResult);
}
extensions[file] = Path.extname(file).toLowerCase();
};
const definiteness = {};
const fromShebang = {};
fileLoop: for (const file of files) {
// Check manual override
for (const globMatch in globOverrides) {
if (!fileMatchesGlobs(file, globMatch))
continue;
// If the given file matches the glob, apply the override to the file
const forcedLang = globOverrides[globMatch];
addResult(file, forcedLang);
definiteness[file] = true;
continue fileLoop; // no need to check other heuristics, the classified language has been found
}
// Check first line for readability
let firstLine;
if (useRawContent) {
firstLine = inputContent[file]?.split('\n')[0] ?? null;
}
else if (FS.existsSync(file) && !FS.lstatSync(file).isDirectory()) {
firstLine = await readFileChunk(file, true).catch(() => null);
}
else
continue;
// Skip if file is unreadable or blank
if (firstLine === null)
continue;
// Check first line for explicit classification
const modelineRegex = /-\*-|(?:syntax|filetype|ft)\s*=/;
const hasShebang = opts.checkShebang && /^#!/.test(firstLine);
const hasModeline = opts.checkModeline && modelineRegex.test(firstLine);
if (!opts.quick && (hasShebang || hasModeline)) {
const matches = [];
for (const [lang, data] of Object.entries(langData)) {
const langMatcher = (lang) => `\\b${lang.toLowerCase().replace(/\W/g, '\\$&')}(?![\\w#+*]|-\*-)`;
// Check for interpreter match
if (opts.checkShebang && hasShebang) {
const matchesInterpretor = data.interpreters?.some((interpreter) => firstLine.match(`\\b${interpreter}\\b`));
if (matchesInterpretor)
matches.push(lang);
}
// Check modeline declaration
if (opts.checkModeline && hasModeline) {
const modelineText = firstLine.toLowerCase().split(modelineRegex)[1];
const matchesLang = modelineText.match(langMatcher(lang));
const matchesAlias = data.aliases?.some((lang) => modelineText.match(langMatcher(lang)));
if (matchesLang || matchesAlias)
matches.push(lang);
}
}
// Add identified language(s)
if (matches.length) {
for (const match of matches)
addResult(file, match);
if (matches.length === 1)
definiteness[file] = true;
fromShebang[file] = true;
continue;
}
}
// Search each language
let skipExts = false;
// Check if filename is a match
for (const lang in langData) {
const matchesName = langData[lang].filenames?.some((name) => Path.basename(file.toLowerCase()) === name.toLowerCase());
if (matchesName) {
addResult(file, lang);
skipExts = true;
}
}
// Check if extension is a match
const possibleExts = [];
if (!skipExts)
for (const lang in langData) {
const extMatches = langData[lang].extensions?.filter((ext) => file.toLowerCase().endsWith(ext.toLowerCase()));
if (extMatches?.length) {
for (const ext of extMatches)
possibleExts.push({ ext, lang });
}
}
// Apply more specific extension if available
const isComplexExt = (ext) => /\..+\./.test(ext);
const hasComplexExt = possibleExts.some((data) => isComplexExt(data.ext));
for (const { ext, lang } of possibleExts) {
if (hasComplexExt && !isComplexExt(ext))
continue;
if (!hasComplexExt && isComplexExt(ext))
continue;
addResult(file, lang);
}
// Fallback to null if no language matches
if (!fileAssociations[file]) {
addResult(file, null);
}
}
// Narrow down file associations to the best fit
for (const file in fileAssociations) {
// Skip if file has explicit association
if (definiteness[file]) {
results.files.results[file] = fileAssociations[file][0];
continue;
}
// Skip binary files
if (!useRawContent && !opts.keepBinary) {
if (await isBinaryFile(file))
continue;
}
// Parse heuristics if applicable
if (opts.checkHeuristics)
for (const heuristics of heuristicsData.disambiguations) {
// Make sure the extension matches the current file
if (!fromShebang[file] && !heuristics.extensions.includes(extensions[file]))
continue;
// Load heuristic rules
for (const heuristic of heuristics.rules) {
// Make sure the language is not an array
if (Array.isArray(heuristic.language)) {
heuristic.language = heuristic.language[0];
}
// Make sure the results includes this language
const languageGroup = langData[heuristic.language]?.group;
const matchesLang = fileAssociations[file].includes(heuristic.language);
const matchesParent = languageGroup && fileAssociations[file].includes(languageGroup);
if (!matchesLang && !matchesParent)
continue;
// Normalise heuristic data
const patterns = [];
const normalise = (contents) => patterns.push(...[contents].flat());
if (heuristic.pattern)
normalise(heuristic.pattern);
if (heuristic.named_pattern)
normalise(heuristicsData.named_patterns[heuristic.named_pattern]);
if (heuristic.and) {
for (const data of heuristic.and) {
if (data.pattern)
normalise(data.pattern);
if (data.named_pattern)
normalise(heuristicsData.named_patterns[data.named_pattern]);
}
}
// Check file contents and apply heuristic patterns
const fileContent = useRawContent ? inputContent[file] : await readFileChunk(file).catch(() => null);
// Skip if file read errors
if (fileContent === null)
continue;
// Apply heuristics
if (!patterns.length || patterns.some((pattern) => pcre(pattern).test(fileContent))) {
results.files.results[file] = heuristic.language;
break;
}
}
}
// If no heuristics, assign a language
if (!results.files.results[file]) {
const possibleLangs = fileAssociations[file];
// Assign first language as a default option
const defaultLang = possibleLangs[0];
const alternativeLangs = possibleLangs.slice(1);
results.files.results[file] = defaultLang;
// List alternative languages if there are any
if (alternativeLangs.length > 0)
results.files.alternatives[file] = alternativeLangs;
}
}
// Skip specified categories
if (opts.categories?.length) {
const categories = ['data', 'markup', 'programming', 'prose'];
const hiddenCategories = categories.filter((cat) => !opts.categories.includes(cat));
for (const [file, lang] of Object.entries(results.files.results)) {
// Skip if language is not hidden
if (!hiddenCategories.some((cat) => lang && langData[lang]?.type === cat))
continue;
// Skip if language is forced as detectable
if (opts.checkDetected) {
const detectable = ignore().add(getFlaggedGlobs('detectable', true));
if (detectable.ignores(relPath(file)))
continue;
}
// Delete result otherwise
delete results.files.results[file];
if (lang)
delete results.languages.results[lang];
}
for (const category of hiddenCategories) {
for (const [lang, { type }] of Object.entries(results.repository)) {
if (type === category) {
delete results.languages.results[lang];
}
}
}
}
// Convert paths to relative
if (!useRawContent && opts.relativePaths) {
const newMap = {};
for (const [file, lang] of Object.entries(results.files.results)) {
let relPath = normPath(Path.relative(process.cwd(), file));
if (!relPath.startsWith('../')) {
relPath = './' + relPath;
}
newMap[relPath] = lang;
}
results.files.results = newMap;
}
// Load language bytes size
for (const [file, lang] of Object.entries(results.files.results)) {
if (lang && !langData[lang])
continue;
// Calculate file size
const fileSize = useRawContent ? inputContent[file]?.length : FS.statSync(file).size;
// Calculate lines of code
const loc = { total: 0, content: 0 };
if (opts.calculateLines) {
const fileContent = useRawContent ? inputContent[file] : FS.readFileSync(file).toString();
const allLines = fileContent.split(/\r?\n/gm);
loc.total = allLines.length;
loc.content = allLines.filter((line) => line.trim().length > 0).length;
}
// Apply to files totals
results.files.bytes += fileSize;
results.files.lines.total += loc.total;
results.files.lines.content += loc.content;
// Add results to 'languages' section if language match found, or 'unknown' section otherwise
if (lang) {
// update language in repository if not yet present
if (!results.repository[lang]) {
const { type, color } = langData[lang];
results.repository[lang] = { type, color };
if (opts.childLanguages) {
results.repository[lang].parent = langData[lang].group;
}
}
// set default if unset
results.languages.results[lang] ??= { count: 0, bytes: 0, lines: { total: 0, content: 0 } };
// apply results to 'languages' section
results.languages.results[lang].count++;
results.languages.results[lang].bytes += fileSize;
results.languages.bytes += fileSize;
results.languages.results[lang].lines.total += loc.total;
results.languages.results[lang].lines.content += loc.content;
results.languages.lines.total += loc.total;
results.languages.lines.content += loc.content;
}
else {
const ext = Path.extname(file);
const unknownType = ext ? 'extensions' : 'filenames';
const name = ext || Path.basename(file);
// apply results to 'unknown' section
results.unknown[unknownType][name] ??= 0;
results.unknown[unknownType][name] += fileSize;
results.unknown.bytes += fileSize;
results.unknown.lines.total += loc.total;
results.unknown.lines.content += loc.content;
}
}
// Set lines output to NaN when line calculation is disabled
if (opts.calculateLines === false) {
results.files.lines = { total: NaN, content: NaN };
}
// Set counts
results.files.count = Object.keys(results.files.results).length;
results.languages.count = Object.keys(results.languages.results).length;
results.unknown.count = Object.keys({ ...results.unknown.extensions, ...results.unknown.filenames }).length;
// Return
return results;
}
export default analyse;