// lang-detector
// A library for detecting the programming language of a code snippet.
// Listing metadata: version not specified; 408 lines (381 loc), 13.2 kB; JavaScript.
/**
* The MIT License (MIT)
*
* Copyright (c) 2015 Toni Sučić
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
var _ = require('underscore');
/**
 * Table of checkers used to score a line of code, keyed by language name.
 *
 * A checker is an object with the following form:
 *   { pattern: /something/, points: 1 }
 * or, if the pattern should only be applied to lines near the top of a
 * snippet:
 *   { pattern: /something/, points: 2, nearTop: true }
 *
 * Key: Language name.
 * Value: Array of checkers.
 *
 * N.B. An array of checkers shouldn't contain more regexes than
 * necessary as it would inhibit performance.
 *
 * N.B. Patterns must NOT use the `g` (or `y`) flag. They are only ever used
 * with `RegExp.prototype.test`, where those flags make the regex remember
 * `lastIndex` between calls, so a match on one line could cause a miss on
 * the next line.
 *
 * Points scale:
 *    2 = Bonus points:   Almost unique to a given language.
 *    1 = Regular point:  Not unique to a given language.
 *   -1 = Penalty point:  Does not match a given language.
 * Rare:
 *  -50 = Bonus penalty points: Only used when two languages are mixed
 *        together, and one has a higher precedence over the other one.
 */
const languages = {
  'JavaScript': [
    // undefined keyword
    { pattern: /undefined/, points: 2 },
    // console.log('ayy lmao')
    { pattern: /console\.log( )*\(/, points: 2 },
    // Variable declaration
    { pattern: /(var|const|let)( )+\w+( )*=?/, points: 2 },
    // Array/Object declaration
    { pattern: /(('|").+('|")( )*|\w+):( )*[{\[]/, points: 2 },
    // === operator
    { pattern: /===/, points: 1 },
    // !== operator
    { pattern: /!==/, points: 1 },
    // Function definition
    { pattern: /function\*?(( )+[\$\w]+( )*\(.*\)|( )*\(.*\))/, points: 1 },
    // null keyword
    { pattern: /null/, points: 1 },
    // lambda expression
    { pattern: /\(.*\)( )*=>( )*.+/, points: 1 },
    // (else )if statement
    { pattern: /(else )?if( )+\(.+\)/, points: 1 },
    // while loop
    { pattern: /while( )+\(.+\)/, points: 1 },
    // C style variable declaration.
    { pattern: /(^|\s)(char|long|int|float|double)( )+\w+( )*=?/, points: -1 },
    // pointer
    { pattern: /(\w+)( )*\*( )*\w+/, points: -1 },
    // HTML <script> tag
    { pattern: /<(\/)?script( type=('|")text\/javascript('|"))?>/, points: -50 },
  ],
  'C': [
    // Primitive variable declaration.
    { pattern: /(char|long|int|float|double)( )+\w+( )*=?/, points: 2 },
    // malloc function call
    { pattern: /malloc\(.+\)/, points: 2 },
    // #include <whatever.h>
    { pattern: /#include (<|")\w+\.h(>|")/, points: 2, nearTop: true },
    // pointer
    { pattern: /(\w+)( )*\*( )*\w+/, points: 2 },
    // Variable declaration and/or initialisation.
    { pattern: /(\w+)( )+\w+(;|( )*=)/, points: 1 },
    // Array declaration.
    { pattern: /(\w+)( )+\w+\[.+\]/, points: 1 },
    // #define macro
    { pattern: /#define( )+.+/, points: 1 },
    // NULL constant
    { pattern: /NULL/, points: 1 },
    // void keyword
    { pattern: /void/, points: 1 },
    // (else )if statement
    { pattern: /(else )?if( )*\(.+\)/, points: 1 },
    // while loop
    { pattern: /while( )+\(.+\)/, points: 1 },
    // printf function
    { pattern: /(printf|puts)( )*\(.+\)/, points: 1 },
    // new Keyword from C++
    { pattern: /new \w+/, points: -1 },
    // Single quote multicharacter string
    { pattern: /'.{2,}'/, points: -1 },
    // JS variable declaration
    { pattern: /var( )+\w+( )*=?/, points: -1 },
  ],
  'C++': [
    // Primitive variable declaration.
    { pattern: /(char|long|int|float|double)( )+\w+( )*=?/, points: 2 },
    // #include <whatever.h>
    { pattern: /#include( )*(<|")\w+(\.h)?(>|")/, points: 2, nearTop: true },
    // using namespace something
    { pattern: /using( )+namespace( )+.+( )*;/, points: 2 },
    // template declaration
    { pattern: /template( )*<.*>/, points: 2 },
    // std
    { pattern: /std::\w+/, points: 2 },
    // cout/cin/endl
    { pattern: /(cout|cin|endl)/, points: 2 },
    // Visibility specifiers
    { pattern: /(public|protected|private):/, points: 2 },
    // nullptr
    { pattern: /nullptr/, points: 2 },
    // new Keyword
    { pattern: /new \w+(\(.*\))?/, points: 1 },
    // #define macro
    { pattern: /#define( )+.+/, points: 1 },
    // template usage
    { pattern: /\w+<\w+>/, points: 1 },
    // class keyword
    { pattern: /class( )+\w+/, points: 1 },
    // void keyword
    { pattern: /void/, points: 1 },
    // (else )if statement
    { pattern: /(else )?if( )*\(.+\)/, points: 1 },
    // while loop
    { pattern: /while( )+\(.+\)/, points: 1 },
    // Scope operator
    { pattern: /\w*::\w+/, points: 1 },
    // Single quote multicharacter string
    { pattern: /'.{2,}'/, points: -1 },
    // Java List/ArrayList
    { pattern: /(List<\w+>|ArrayList<\w*>( )*\(.*\))(( )+[\w]+|;)/, points: -1 },
  ],
  'Python': [
    // Function definition
    { pattern: /def( )+\w+\(.*\)( )*:/, points: 2 },
    // while loop
    { pattern: /while (.+):/, points: 2 },
    // from library import something
    { pattern: /from [\w\.]+ import (\w+|\*)/, points: 2 },
    // class keyword
    { pattern: /class( )*\w+(\(( )*\w+( )*\))?( )*:/, points: 2 },
    // if keyword
    { pattern: /if( )+(.+)( )*:/, points: 2 },
    // elif keyword
    { pattern: /elif( )+(.+)( )*:/, points: 2 },
    // else keyword
    { pattern: /else:/, points: 2 },
    // for loop
    { pattern: /for (\w+|\(?\w+,( )*\w+\)?) in (.+):/, points: 2 },
    // Python variable declaration.
    { pattern: /\w+( )*=( )*\w+(?!;)(\n|$)/, points: 1 },
    // import something (optionally dotted, aliased with `as`, or a comma
    // separated list). The whole line must be the import statement, so
    // `import java.util.List;` and `import x from 'y'` do not match.
    { pattern: /^\s*import( )+\w[\w\.]*(( )+as( )+\w+)?(( )*,( )*\w[\w\.]*(( )+as( )+\w+)?)*\s*(#.*)?$/, points: 1, nearTop: true },
    // print statement/function
    { pattern: /print((( )*\(.+\))|( )+.+)/, points: 1 },
    // &&/|| operators
    { pattern: /(&{2}|\|{2})/, points: -1 },
  ],
  'Java': [
    // System.out.println() etc.
    { pattern: /System\.(in|out)\.\w+/, points: 2 },
    // Class variable declarations
    { pattern: /(private|protected|public)( )*\w+( )*\w+(( )*=( )*[\w])?/, points: 2 },
    // Method
    { pattern: /(private|protected|public)( )*\w+( )*[\w]+\(.+\)/, points: 2 },
    // String class
    { pattern: /(^|\s)(String)( )+[\w]+( )*=?/, points: 2 },
    // List/ArrayList
    { pattern: /(List<\w+>|ArrayList<\w*>( )*\(.*\))(( )+[\w]+|;)/, points: 2 },
    // class keyword
    { pattern: /(public( )*)?class( )*\w+/, points: 2 },
    // Array declaration.
    { pattern: /(\w+)(\[( )*\])+( )+\w+/, points: 2 },
    // final keyword
    { pattern: /final( )*\w+/, points: 2 },
    // getter & setter
    { pattern: /\w+\.(get|set)\(.+\)/, points: 2 },
    // new Keyword (Java)
    { pattern: /new [A-Z]\w*( )*\(.+\)/, points: 2 },
    // C style variable declaration.
    { pattern: /(^|\s)(char|long|int|float|double)( )+[\w]+( )*=?/, points: 1 },
    // extends/implements keywords
    { pattern: /(extends|implements)/, points: 2, nearTop: true },
    // null keyword
    { pattern: /null/, points: 1 },
    // (else )if statement
    { pattern: /(else )?if( )*\(.+\)/, points: 1 },
    // while loop
    { pattern: /while( )+\(.+\)/, points: 1 },
    // void keyword
    { pattern: /void/, points: 1 },
    // const
    { pattern: /const( )*\w+/, points: -1 },
    // pointer
    { pattern: /(\w+)( )*\*( )*\w+/, points: -1 },
    // Single quote multicharacter string
    { pattern: /'.{2,}'/, points: -1 },
    // C style include
    { pattern: /#include( )*(<|")\w+(\.h)?(>|")/, points: -1, nearTop: true },
  ],
  'HTML': [
    // Doctype declaration
    { pattern: /<!DOCTYPE (html|HTML PUBLIC .+)>/, points: 2, nearTop: true },
    // Tags
    { pattern: /<[a-z0-9]+(( )*[\w]+=('|").+('|")( )*)?>.*<\/[a-z0-9]+>/, points: 2 },
    // Properties
    { pattern: /[a-z\-]+=("|').+("|')/, points: 2 },
    // PHP tag
    { pattern: /<\?php/, points: -50 },
  ],
  'CSS': [
    // Properties
    { pattern: /[a-z\-]+:(?!:).+;/, points: 2 },
    // <style> tag from HTML
    { pattern: /<(\/)?style>/, points: -50 },
  ],
  'Ruby': [
    // require/include
    { pattern: /(require|include)( )+'\w+(\.rb)?'/, points: 2, nearTop: true },
    // Function definition
    { pattern: /def( )+\w+( )*(\(.+\))?( )*\n/, points: 2 },
    // Instance variables
    { pattern: /@\w+/, points: 2 },
    // Boolean property
    { pattern: /\.\w+\?/, points: 2 },
    // puts (Ruby print)
    { pattern: /puts( )+("|').+("|')/, points: 2 },
    // Inheriting class
    { pattern: /class [A-Z]\w*( )*<( )*([A-Z]\w*(::)?)+/, points: 2 },
    // attr_accessor
    { pattern: /attr_accessor( )+(:\w+(,( )*)?)+/, points: 2 },
    // new
    { pattern: /\w+\.new( )+/, points: 2 },
    // elsif keyword
    { pattern: /elsif/, points: 2 },
    // do
    { pattern: /do( )*\|(\w+(,( )*\w+)?)+\|/, points: 2 },
    // for loop
    { pattern: /for (\w+|\(?\w+,( )*\w+\)?) in (.+)/, points: 1 },
    // nil keyword
    { pattern: /nil/, points: 1 },
    // Scope operator
    { pattern: /[A-Z]\w*::[A-Z]\w*/, points: 1 },
  ],
  'Go': [
    // package something
    { pattern: /package( )+[a-z]+\n/, points: 2, nearTop: true },
    // import
    { pattern: /(import( )*\(( )*\n)|(import( )+"[a-z0-9\/\.]+")/, points: 2, nearTop: true },
    // error check
    { pattern: /if.+err( )*!=( )*nil.+{/, points: 2 },
    // Go print
    { pattern: /fmt\.Print(f|ln)?\(.*\)/, points: 2 },
    // function
    { pattern: /func(( )+\w+( )*)?\(.*\).*{/, points: 2 },
    // variable initialisation
    { pattern: /\w+( )*:=( )*.+[^;\n]/, points: 2 },
    // if/else if
    { pattern: /(}( )*else( )*)?if[^\(\)]+{/, points: 2 },
    // var/const declaration
    { pattern: /(var|const)( )+\w+( )+[\w\*]+(\n|( )*=|$)/, points: 2 },
    // public access on package
    { pattern: /[a-z]+\.[A-Z]\w*/, points: 1 },
    // nil keyword
    { pattern: /nil/, points: 1 },
    // Single quote multicharacter string
    { pattern: /'.{2,}'/, points: -1 },
  ],
  'PHP': [
    // PHP tag
    { pattern: /<\?php/, points: 2 },
    // PHP style variables.
    { pattern: /\$\w+/, points: 2 },
    // use Something\Something;
    { pattern: /use( )+\w+(\\\w+)+( )*;/, points: 2, nearTop: true },
    // arrow
    { pattern: /\$\w+\->\w+/, points: 2 },
    // require/include
    { pattern: /(require|include)(_once)?( )*\(?( )*('|").+\.php('|")( )*\)?( )*;/, points: 2 },
    // echo 'something';
    { pattern: /echo( )+('|").+('|")( )*;/, points: 1 },
    // NULL constant
    { pattern: /NULL/, points: 1 },
    // new keyword
    { pattern: /new( )+((\\\w+)+|\w+)(\(.*\))?/, points: 1 },
    // Function definition
    { pattern: /function(( )+[\$\w]+\(.*\)|( )*\(.*\))/, points: 1 },
    // (else)if statement
    { pattern: /(else)?if( )+\(.+\)/, points: 1 },
    // scope operator
    { pattern: /\w+::\w+/, points: 1 },
    // === operator
    { pattern: /===/, points: 1 },
    // !== operator
    { pattern: /!==/, points: 1 },
    // C/JS style variable declaration.
    { pattern: /(^|\s)(var|char|long|int|float|double)( )+\w+( )*=?/, points: -1 },
  ],
  // Fallback entry; it has no checkers of its own.
  'Unknown': [],
};
/**
 * Sums the points of every checker whose pattern matches a line of code.
 *
 * @param {string} language - Name of the language the checkers belong to.
 *   Not used by the computation; kept so existing callers keep working.
 * @param {string} lineOfCode - The single line to test.
 * @param {Array<{pattern: RegExp, points: number}>} checkers - Checkers to
 *   apply to the line. A missing list is treated as empty.
 * @returns {number} Sum of `points` over the matching checkers (0 when
 *   nothing matches).
 */
function getPoints(language, lineOfCode, checkers) {
  return (checkers || []).reduce(function(total, checker) {
    // `test` on a global/sticky regex resumes from `lastIndex`, which is
    // left over from the previous line. Always start from the beginning so
    // the result depends only on the current line.
    checker.pattern.lastIndex = 0;
    return checker.pattern.test(lineOfCode) ? total + checker.points : total;
  }, 0);
}
/**
 * Detects the programming language of a code snippet.
 *
 * Each line of the snippet is scored against every language's checkers and
 * the language with the highest total is reported. The 'Unknown' entry is
 * given a fixed score of 1.
 *
 * @param {string} snippet - The source code to analyse.
 * @param {Object} [options]
 * @param {boolean} [options.heuristic=true] - When truthy and the snippet
 *   has more than 500 lines, only the lines near the top plus a regular
 *   sample of the remaining lines are scored.
 * @param {boolean} [options.statistics=false] - When truthy, return the
 *   per-language scores along with the detected language.
 * @returns {string|{detected: string, statistics: Object<string, number>}}
 *   The detected language name, or an object holding that name and the
 *   score of every language when `options.statistics` is truthy.
 */
function detectLang(snippet, options) {
  // Fill the defaults into a fresh object so the caller's options object is
  // never modified.
  const opts = _.defaults({}, options || {}, {
    heuristic: true,
    statistics: false,
  });

  // Normalise line endings and collapse runs of blank lines before splitting.
  let linesOfCode = snippet
    .replace(/\r\n?/g, '\n')
    .replace(/\n{2,}/g, '\n')
    .split('\n');

  // A line is "near the top" when it lies in the first tenth of the current
  // line list; for lists of at most 10 lines every line counts as near the top.
  function nearTop(index) {
    if (linesOfCode.length <= 10) {
      return true;
    }
    return index < linesOfCode.length / 10;
  }

  if (opts.heuristic && linesOfCode.length > 500) {
    // Keep the top lines and every `step`-th line of the rest.
    const step = Math.ceil(linesOfCode.length / 500);
    linesOfCode = linesOfCode.filter(function(lineOfCode, index) {
      return nearTop(index) || index % step === 0;
    });
  }

  const results = Object.keys(languages).map(function(language) {
    if (language === 'Unknown') {
      return { language: 'Unknown', points: 1 };
    }

    const checkers = languages[language];
    // Checkers flagged `nearTop` are skipped for lines further down; build
    // that reduced list once per language instead of once per line.
    const checkersAwayFromTop = checkers.filter(function(checker) {
      return !checker.nearTop;
    });

    const points = linesOfCode.reduce(function(total, lineOfCode, index) {
      const applicable = nearTop(index) ? checkers : checkersAwayFromTop;
      return total + getPoints(language, lineOfCode, applicable);
    }, 0);

    return { language: language, points: points };
  });

  const bestResult = _.max(results, function(result) {
    return result.points;
  });

  if (opts.statistics) {
    const statistics = {};
    for (const result of results) {
      statistics[result.language] = result.points;
    }
    return { detected: bestResult.language, statistics: statistics };
  }

  return bestResult.language;
}
// The module's export is the detectLang function itself.
module.exports = detectLang;