From 6f116b89e003a61e54ff42a42c76088351a727a2 Mon Sep 17 00:00:00 2001 From: "Santiago M. Mola" Date: Mon, 2 Apr 2018 14:21:11 +0200 Subject: [PATCH 1/3] Specify disambiguation rules in YAML --- lib/linguist/heuristics.rb | 548 ++++-------------------------------- lib/linguist/heuristics.yml | 394 ++++++++++++++++++++++++++ test/test_heuristics.rb | 11 +- test/test_pedantic.rb | 7 +- 4 files changed, 464 insertions(+), 496 deletions(-) create mode 100644 lib/linguist/heuristics.yml diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index b642398114..867d911ffc 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -1,3 +1,5 @@ +require 'yaml' + module Linguist # A collection of simple heuristics that can be used to better analyze languages. class Heuristics @@ -17,6 +19,7 @@ class Heuristics # Returns an Array of languages, or empty if none matched or were inconclusive. def self.call(blob, candidates) return [] if blob.symlink? + self.load() data = blob.data[0...HEURISTICS_CONSIDER_BYTES] @@ -29,33 +32,52 @@ def self.call(blob, candidates) [] # No heuristics matched end - # Internal: Define a new heuristic. - # - # exts_and_langs - String names of file extensions and languages to - # disambiguate. - # heuristic - Block which takes data as an argument and returns a Language or nil. - # - # Examples - # - # disambiguate ".pm" do |data| - # if data.include?("use strict") - # Language["Perl"] - # elsif /^[^#]+:-/.match(data) - # Language["Prolog"] - # end - # end + # Internal: Load heuristics from 'heuristics.yml'. + def self.load() + if @heuristics.any? + return + end + + data = YAML.load_file(File.expand_path("../heuristics.yml", __FILE__)) + named_patterns = data['named_patterns'].map { |k,v| [k, self.to_regex(v)] }.to_h + + data['disambiguations'].each do |disambiguation| + exts = disambiguation['extensions'] + rules = disambiguation['rules'] + rules.map! do |rule| + if !rule['pattern'].nil? 
+ rule['pattern'] = self.to_regex(rule['pattern']) + elsif !rule['negative_pattern'].nil? + pat = self.to_regex(rule['negative_pattern']) + rule['pattern'] = NegativePattern.new(pat) + elsif !rule['named_pattern'].nil? + rule['pattern'] = named_patterns[rule['named_pattern']] + end + rule + end + @heuristics << new(exts, rules) + end + end + + # Internal: Converts a string or array of strings to regexp # - def self.disambiguate(*exts_and_langs, &heuristic) - @heuristics << new(exts_and_langs, &heuristic) + # str: string or array of strings. If it is an array of strings, + # Regexp.union will be used. + def self.to_regex(str) + if str.kind_of?(Array) + Regexp.union(str.map { |s| Regexp.new(s) }) + else + Regexp.new(str) + end end # Internal: Array of defined heuristics @heuristics = [] # Internal - def initialize(exts_and_langs, &heuristic) + def initialize(exts_and_langs, rules) @exts_and_langs, @candidates = exts_and_langs.partition {|e| e =~ /\A\./} - @heuristic = heuristic + @rules = rules end # Internal: Check if this heuristic matches the candidate filenames or @@ -71,485 +93,31 @@ def matches?(filename, candidates) # Internal: Perform the heuristic def call(data) - @heuristic.call(data) - end - - # Common heuristics - CPlusPlusRegex = Regexp.union( - /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/, - /^\s*template\s*])/ - Perl5Regex = /\buse\s+(?:strict\b|v?5\.)/ - Perl6Regex = /^\s*(?:use\s+v6\b|\bmodule\b|\b(?:my\s+)?class\b)/ - - disambiguate ".as" do |data| - if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data) - Language["ActionScript"] - else - Language["AngelScript"] - end - end - - disambiguate ".asc" do |data| - if /^(----[- ]BEGIN|ssh-(rsa|dss)) /.match(data) - Language["Public Key"] - elsif /^[=-]+(\s|\n)|{{[A-Za-z]/.match(data) - Language["AsciiDoc"] - elsif 
/^(\/\/.+|((import|export)\s+)?(function|int|float|char)\s+((room|repeatedly|on|game)_)?([A-Za-z]+[A-Za-z_0-9]+)\s*[;\(])/.match(data) - Language["AGS Script"] - end - end - - disambiguate ".bb" do |data| - if /^\s*; /.match(data) || data.include?("End Function") - Language["BlitzBasic"] - elsif /^\s*(# |include|require)\b/.match(data) - Language["BitBake"] - end - end - - disambiguate ".builds" do |data| - if /^(\s*)()/.match(data) - Language["Erlang"] - elsif /(?:\/\/|("|')use strict\1|export\s+default\s|\/\*.*?\*\/)/m.match(data) - Language["JavaScript"] - end - end - - fortran_rx = /^([c*][^abd-z]| (subroutine|program|end|data)\s|\s*!)/i - - disambiguate ".f" do |data| - if /^: /.match(data) - Language["Forth"] - elsif data.include?("flowop") - Language["Filebench WML"] - elsif fortran_rx.match(data) - Language["Fortran"] - end - end - - disambiguate ".for" do |data| - if /^: /.match(data) - Language["Forth"] - elsif fortran_rx.match(data) - Language["Fortran"] - end - end - - disambiguate ".fr" do |data| - if /^(: |also |new-device|previous )/.match(data) - Language["Forth"] - elsif /^\s*(import|module|package|data|type) /.match(data) - Language["Frege"] - else - Language["Text"] - end - end - - disambiguate ".fs" do |data| - if /^(: |new-device)/.match(data) - Language["Forth"] - elsif /^\s*(#light|import|let|module|namespace|open|type)/.match(data) - Language["F#"] - elsif /^\s*(#version|precision|uniform|varying|vec[234])/.match(data) - Language["GLSL"] - elsif /#include|#pragma\s+(rs|version)|__attribute__/.match(data) - Language["Filterscript"] - end - end - - disambiguate ".gml" do |data| - if /^\s*(\<\?xml|xmlns)/i.match(data) - Language["XML"] - elsif /^\s*(graph|node)\s+\[$/i.match(data) - Language["Graph Modeling Language"] - else - Language["Game Maker Language"] - end - end - - disambiguate ".gs" do |data| - Language["Gosu"] if /^uses java\./.match(data) - end - - disambiguate ".h" do |data| - if ObjectiveCRegex.match(data) - 
Language["Objective-C"] - elsif CPlusPlusRegex.match(data) - Language["C++"] - end - end - - disambiguate ".inc" do |data| - if /^<\?(?:php)?/.match(data) - Language["PHP"] - elsif /^\s*#(declare|local|macro|while)\s/.match(data) - Language["POV-Ray SDL"] - end - end - - disambiguate ".l" do |data| - if /\(def(un|macro)\s/.match(data) - Language["Common Lisp"] - elsif /^(%[%{}]xs|<.*>)/.match(data) - Language["Lex"] - elsif /^\.[a-z][a-z](\s|$)/i.match(data) - Language["Roff"] - elsif /^\((de|class|rel|code|data|must)\s/.match(data) - Language["PicoLisp"] - end - end - - disambiguate ".ls" do |data| - if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data) - Language["LoomScript"] - else - Language["LiveScript"] - end - end - - disambiguate ".lsp", ".lisp" do |data| - if /^\s*\((defun|in-package|defpackage) /i.match(data) - Language["Common Lisp"] - elsif /^\s*\(define /.match(data) - Language["NewLisp"] - end - end - - disambiguate ".m" do |data| - if ObjectiveCRegex.match(data) - Language["Objective-C"] - elsif data.include?(":- module") - Language["Mercury"] - elsif /^: /.match(data) - Language["MUF"] - elsif /^\s*;/.match(data) - Language["M"] - elsif /\*\)$/.match(data) - Language["Mathematica"] - elsif /^\s*%/.match(data) - Language["Matlab"] - elsif /^\w+\s*:\s*module\s*{/.match(data) - Language["Limbo"] - end - end - - disambiguate ".md" do |data| - if /(^[-a-z0-9=#!\*\[|>])|<\//i.match(data) || data.empty? 
- Language["Markdown"] - elsif /^(;;|\(define_)/.match(data) - Language["GCC Machine Description"] - else - Language["Markdown"] - end - end - - disambiguate ".ml" do |data| - if /(^\s*module)|let rec |match\s+(\S+\s)+with/.match(data) - Language["OCaml"] - elsif /=> |case\s+(\S+\s)+of/.match(data) - Language["Standard ML"] - end - end - - disambiguate ".mod" do |data| - if data.include?(')\s*(\d{2}:\d{2}:\d{2},\d{3})$/.match(data) - Language["SubRip Text"] - end + def match(input) + return !@pat.match(input) end - disambiguate ".t" do |data| - if Perl5Regex.match(data) - Language["Perl"] - elsif Perl6Regex.match(data) - Language["Perl 6"] - elsif /^\s*%[ \t]+|^\s*var\s+\w+(\s*:\s*\w+)?\s*:=\s*\w+/.match(data) - Language["Turing"] - end - end - - disambiguate ".toc" do |data| - if /^## |@no-lib-strip@/.match(data) - Language["World of Warcraft Addon Data"] - elsif /^\\(contentsline|defcounter|beamer|boolfalse)/.match(data) - Language["TeX"] - end - end - - disambiguate ".ts" do |data| - if / ")) - Language["GAP"] - # Heads up - we don't usually write heuristics like this (with no regex match) - else - Language["Scilab"] - end - end - - disambiguate ".tsx" do |data| - if /^\s*(import.+(from\s+|require\()['"]react|\/\/\/\s*)' + - language: JavaScript + pattern: '(?m:\/\/|("|'')use strict\1|export\s+default\s|\/\*.*?\*\/)' +- extensions: ['.f'] + rules: + - language: Forth + pattern: '^: ' + - language: Filebench WML + pattern: 'flowop' + - language: Fortran + named_pattern: fortran +- extensions: ['.for'] + rules: + - language: Forth + pattern: '^: ' + - language: Fortran + named_pattern: fortran +- extensions: ['.fr'] + rules: + - language: Forth + pattern: '^(: |also |new-device|previous )' + - language: Frege + pattern: '^\s*(import|module|package|data|type) ' + - language: Text +- extensions: ['.fs'] + rules: + - language: Forth + pattern: '^(: |new-device)' + - language: 'F#' + pattern: '^\s*(#light|import|let|module|namespace|open|type)' + - language: GLSL + 
pattern: '^\s*(#version|precision|uniform|varying|vec[234])' + - language: Filterscript + pattern: '#include|#pragma\s+(rs|version)|__attribute__' +- extensions: ['.gml'] + rules: + - language: XML + pattern: '(?i:^\s*(\<\?xml|xmlns))' + - language: Graph Modeling Language + pattern: '(?i:^\s*(graph|node)\s+\[$)' + - language: Game Maker Language +- extensions: ['.gs'] + rules: + - language: Gosu + pattern: '^uses java\.' +- extensions: ['.h'] + rules: + - language: Objective-C + named_pattern: objectivec + - language: C++ + named_pattern: cpp +- extensions: ['.inc'] + rules: + - language: PHP + pattern: '^<\?(?:php)?' + - language: POV-Ray SDL + pattern: '^\s*#(declare|local|macro|while)\s' +- extensions: ['.l'] + rules: + - language: Common Lisp + pattern: '\(def(un|macro)\s' + - language: Lex + pattern: '^(%[%{}]xs|<.*>)' + - language: Roff + pattern: '^\.[A-Za-z]{2}(\s|$)' + - language: PicoLisp + pattern: '^\((de|class|rel|code|data|must)\s' +- extensions: ['.ls'] + rules: + - language: LoomScript + pattern: '^\s*package\s*[\w\.\/\*\s]*\s*{' + - language: LiveScript +- extensions: ['.lsp', '.lisp'] + rules: + - language: Common Lisp + pattern: '^\s*\((?i:defun|in-package|defpackage) ' + - language: NewLisp + pattern: '^\s*\(define ' +- extensions: ['.m'] + rules: + - language: Objective-C + named_pattern: objectivec + - language: Mercury + pattern: ':- module' + - language: MUF + pattern: '^: ' + - language: M + pattern: '^\s*;' + - language: Mathematica + pattern: '\*\)$' + - language: Matlab + pattern: '^\s*%' + - language: Limbo + pattern: '^\w+\s*:\s*module\s*{' +- extensions: ['.md'] + rules: + - language: Markdown + pattern: + - '(^[-A-Za-z0-9=#!\*\[|>])|<\/' + - '\A\z' + - language: GCC Machine Description + pattern: '^(;;|\(define_)' + - language: Markdown +- extensions: ['.ml'] + rules: + - language: OCaml + pattern: '(^\s*module)|let rec |match\s+(\S+\s)+with' + - language: Standard ML + pattern: '=> |case\s+(\S+\s)+of' +- extensions: ['.mod'] + 
rules: + - language: XML + pattern: ')\s*(\d{2}:\d{2}:\d{2},\d{3})$' +- extensions: ['.t'] + rules: + - language: Perl + named_pattern: perl5 + - language: Perl 6 + named_pattern: perl6 + - language: Turing + pattern: '^\s*%[ \t]+|^\s*var\s+\w+(\s*:\s*\w+)?\s*:=\s*\w+' +- extensions: ['.toc'] + rules: + - language: World of Warcraft Addon Data + pattern: '^## |@no-lib-strip@' + - language: TeX + pattern: '^\\(contentsline|defcounter|beamer|boolfalse)' +- extensions: ['.ts'] + rules: + - language: XML + pattern: ' ' + # Heads up - we don't usually write heuristics like this (with no regex match) + - language: Scilab +- extensions: ['.tsx'] + rules: + - language: TypeScript + pattern: '^\s*(import.+(from\s+|require\()[''"]react|\/\/\/\s*' + - '^\s*template\s*<' + - '^[ \t]*try' + - '^[ \t]*catch\s*\(' + - '^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+' + - '^[ \t]*(private|public|protected):$' + - 'std::\w+' + fortran: '^(?i:[c*][^abd-z]| (subroutine|program|end|data)\s|\s*!)' + objectivec: '^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])' + perl5: '\buse\s+(?:strict\b|v?5\.)' + perl6: '^\s*(?:use\s+v6\b|\bmodule\b|\b(?:my\s+)?class\b)' \ No newline at end of file diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index 8a5ed82d2f..1df0289318 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -35,10 +35,13 @@ def assert_heuristics(hash) Array(blobs).each do |blob| result = Heuristics.call(file_blob(blob), candidates) if language.nil? 
- assert_equal [], result, "Failed for #{blob}" + expected = [] + elsif language.is_a?(Array) + expected = language.map{ |l| Language[l] } else - assert_equal [Language[language]], result, "Failed for #{blob}" + expected = [Language[language]] end + assert_equal expected, result, "Failed for #{blob}" end end end @@ -246,7 +249,9 @@ def test_ml_by_heuristics def test_mod_by_heuristics assert_heuristics({ "Modula-2" => all_fixtures("Modula-2", "*.mod"), - "XML" => all_fixtures("XML", "*.mod") + "XML" => all_fixtures("XML", "*.mod"), + ["Linux Kernel Module", "AMPL"] => all_fixtures("Linux Kernel Module", "*.mod"), + ["Linux Kernel Module", "AMPL"] => all_fixtures("AMPL", "*.mod"), }) end diff --git a/test/test_pedantic.rb b/test/test_pedantic.rb index 00c14f46d3..a8f6d3063b 100644 --- a/test/test_pedantic.rb +++ b/test/test_pedantic.rb @@ -4,6 +4,7 @@ class TestPedantic < Minitest::Test filename = File.expand_path("../../lib/linguist/languages.yml", __FILE__) LANGUAGES = YAML.load(File.read(filename)) GRAMMARS = YAML.load(File.read(File.expand_path("../../grammars.yml", __FILE__))) + HEURISTICS = YAML.load_file(File.expand_path("../../lib/linguist/heuristics.yml", __FILE__)) def test_language_names_are_sorted assert_sorted LANGUAGES.keys @@ -33,9 +34,9 @@ def test_scopes_are_sorted end def test_heuristics_are_sorted - file = File.expand_path("../../lib/linguist/heuristics.rb", __FILE__) - heuristics = open(file).each.grep(/^ *disambiguate/) - assert_sorted heuristics + disambiguations = HEURISTICS['disambiguations'] + assert_sorted disambiguations.map { |r| r['extensions'][0] } + assert_sorted HEURISTICS['named_patterns'].keys end def test_heuristics_tests_are_sorted From 3abe1267500df2591024f6792b1224478219148d Mon Sep 17 00:00:00 2001 From: "Santiago M. 
Mola" Date: Mon, 6 Aug 2018 10:28:46 +0200 Subject: [PATCH 2/3] heuristics: remove candidates logic --- lib/linguist/heuristics.rb | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 867d911ffc..876e974dc9 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -76,7 +76,7 @@ def self.to_regex(str) # Internal def initialize(exts_and_langs, rules) - @exts_and_langs, @candidates = exts_and_langs.partition {|e| e =~ /\A\./} + @exts_and_langs = exts_and_langs @rules = rules end @@ -85,10 +85,7 @@ def initialize(exts_and_langs, rules) def matches?(filename, candidates) filename = filename.downcase candidates = candidates.compact.map(&:name) - @exts_and_langs.any? { |ext| filename.end_with?(ext) } || - (candidates.any? && - (@candidates - candidates == [] && - candidates - @candidates == [])) + @exts_and_langs.any? { |ext| filename.end_with?(ext) } end # Internal: Perform the heuristic From 71ab4407158ae07a30e54c800e250811776307b0 Mon Sep 17 00:00:00 2001 From: "Santiago M. Mola" Date: Mon, 3 Sep 2018 10:52:15 +0200 Subject: [PATCH 3/3] heuristics: and matches --- lib/linguist/heuristics.rb | 46 +++++++++++++++++++++++++++++-------- lib/linguist/heuristics.yml | 12 ++++++---- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 876e974dc9..fe680382e5 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -45,20 +45,29 @@ def self.load() exts = disambiguation['extensions'] rules = disambiguation['rules'] rules.map! do |rule| - if !rule['pattern'].nil? - rule['pattern'] = self.to_regex(rule['pattern']) - elsif !rule['negative_pattern'].nil? - pat = self.to_regex(rule['negative_pattern']) - rule['pattern'] = NegativePattern.new(pat) - elsif !rule['named_pattern'].nil? 
- rule['pattern'] = named_patterns[rule['named_pattern']] - end + rule['pattern'] = self.parse_rule(named_patterns, rule) rule end @heuristics << new(exts, rules) end end + def self.parse_rule(named_patterns, rule) + if !rule['and'].nil? + rules = rule['and'].map { |block| self.parse_rule(named_patterns, block) } + return And.new(rules) + elsif !rule['pattern'].nil? + return self.to_regex(rule['pattern']) + elsif !rule['negative_pattern'].nil? + pat = self.to_regex(rule['negative_pattern']) + return NegativePattern.new(pat) + elsif !rule['named_pattern'].nil? + return named_patterns[rule['named_pattern']] + else + return AlwaysMatch.new() + end + end + # Internal: Converts a string or array of strings to regexp # # str: string or array of strings. If it is an array of strings, @@ -91,8 +100,7 @@ def matches?(filename, candidates) # Internal: Perform the heuristic def call(data) matched = @rules.find do |rule| - m = !rule.key?('pattern') || rule['pattern'].match(data) - m + rule['pattern'].match(data) end if !matched.nil? languages = matched['language'] @@ -106,6 +114,24 @@ def call(data) end + class And + + def initialize(pats) + @pats = pats + end + + def match(input) + return !@pats.any? { |pat| !pat.match(input) } + end + + end + + class AlwaysMatch + def match(input) + return true + end + end + class NegativePattern def initialize(pat) diff --git a/lib/linguist/heuristics.yml b/lib/linguist/heuristics.yml index 24b409f962..79736ab392 100644 --- a/lib/linguist/heuristics.yml +++ b/lib/linguist/heuristics.yml @@ -16,6 +16,8 @@ # Pattern can be a string with a single regular expression # or an array of strings that will be merged in a single # regular expression (with union). +# and - An and block merges multiple rules and requires that all +# of them match. # negative_pattern - Same as pattern, but checks for absence of matches. 
# named_pattern - A pattern can be reused by specifying it in the # named_patterns section and referencing it here by its @@ -217,9 +219,9 @@ disambiguations: - language: Roff pattern: '^[.''][A-Za-z]{2}(\s|$)' - language: Unix Assembly - pattern: - - '(?