Skip to content

Commit

Permalink
heuristics: refactoring, extracting rule package
Browse files Browse the repository at this point in the history
Signed-off-by: Alexander Bezzubov <bzz@apache.org>
  • Loading branch information
bzz committed Feb 5, 2019
1 parent c4f3dbe commit 97ab29a
Show file tree
Hide file tree
Showing 8 changed files with 1,076 additions and 1,051 deletions.
890 changes: 443 additions & 447 deletions data/content.go

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions data/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package data contains only auto-generated data-structures for all the language
// identification strategies from the Linguist project sources.
package data
115 changes: 15 additions & 100 deletions data/heuristics.go
Original file line number Diff line number Diff line change
@@ -1,44 +1,24 @@
package data

// Implementation of a rule-based content heuristics matching engine.
// Every Rule defines patterns that content must match in order to be identified as
// belonging to one or more languages.
// It is used to generate a content.go code for disambiguation of languages with
// colliding extensions based on regexps from Linguist.
import "gopkg.in/src-d/enry.v1/data/rule"

import "regexp"
// Heuristics implements a rule-based content matching engine.

type (
// Heuristics consists of a number of sequentially applied Matchers.
Heuristics []Matcher
// Heuristics is a number of sequentially applied rule.Heuristic where a
// matching one disambiguates language(s) for a single file extension.
type Heuristics []rule.Heuristic

// Matcher checks if a given data matches (number of) patterns.
Matcher interface {
Match(data []byte) bool
}

// Languages encapsulates data common to every Rule: the languages
// it identifies.
Languages struct {
langs []string
}

// Rule interface provides access to a languages that this rule identifies.
Rule interface {
GetLanguages() []string
}
)

// Match returns languages identified by the matching rules of the heuristic.
func (h *Heuristics) Match(data []byte) []string {
// Match returns languages identified by the matching rule of the heuristic.
func (hs *Heuristics) Match(data []byte) []string {
var matchedLangs []string
for _, matcher := range *h {
if matcher.Match(data) {
for _, langOrAlias := range matcher.(Rule).GetLanguages() {
for _, heuristic := range *hs {
if heuristic.Match(data) {
for _, langOrAlias := range heuristic.Languages() {
lang, ok := LanguageByAlias(langOrAlias)
if !ok { // should never happen
// language name/alias in heuristics.yml is not consistent with languages.yml
// but we do not surface any error on the API
// reaching here means language name/alias in heuristics.yml
// is not consistent with languages.yml
// but we do not surface any such error at the API
continue
}
matchedLangs = append(matchedLangs, lang)
Expand All @@ -50,71 +30,6 @@ func (h *Heuristics) Match(data []byte) []string {
}

// matchString is a convenience used only in tests.
func (h *Heuristics) matchString(data string) []string {
return h.Match([]byte(data))
}

// GetLanguages returns languages, defined by this data.Rule.
func (l *Languages) GetLanguages() []string {
return l.langs
}

// OrRule matches if a single matching pattern exists.
// It defines only one pattern as it relies on compile-time optimization that
// represents a union with | in a single regexp pattern.
type OrRule struct {
*Languages
Pattern *regexp.Regexp
}

// Match implements data.Matcher.
func (r *OrRule) Match(data []byte) bool {
return r.Pattern.Match(data)
}

// AndRule matches if all of the patterns match.
type AndRule struct {
*Languages
Patterns []Matcher
}

// Match implements data.Matcher.
func (r *AndRule) Match(data []byte) bool {
allMatch := true
for _, p := range r.Patterns {
if !p.Match(data) {
allMatch = false
break
}
}
return allMatch
}

// NotRule matches if none of the patterns match.
type NotRule struct {
*Languages
Patterns []*regexp.Regexp
}

// Match implements data.Matcher.
func (r *NotRule) Match(data []byte) bool {
allDontMatch := true
for _, p := range r.Patterns {
if p.Match(data) {
allDontMatch = false
break
}
}
return allDontMatch
}

// AlwaysRule always matches.
// Used as default fallback.
type AlwaysRule struct {
*Languages
}

// Match implements data.Matcher.
func (r *AlwaysRule) Match(data []byte) bool {
return true
func (hs *Heuristics) matchString(data string) []string {
return hs.Match([]byte(data))
}
58 changes: 27 additions & 31 deletions data/heuristics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,61 +5,57 @@ import (
"testing"

"github.com/stretchr/testify/assert"
"gopkg.in/src-d/enry.v1/data/rule"
)

var testContentHeuristics = map[string]*Heuristics{
".md": &Heuristics{ // final pattern for parsed YAML rule
&OrRule{
&Languages{[]string{"Markdown"}},
rule.Or(
rule.MatchingLanguages("Markdown"),
regexp.MustCompile(`(^[-A-Za-z0-9=#!\*\[|>])|<\/ | \A\z`),
},
&OrRule{
&Languages{[]string{"GCC Machine Description"}},
),
rule.Or(
rule.MatchingLanguages("GCC Machine Description"),
regexp.MustCompile(`^(;;|\(define_)`),
},
&AlwaysRule{
&Languages{[]string{"Markdown"}},
},
),
rule.Always(
rule.MatchingLanguages("Markdown"),
),
},
".ms": &Heuristics{
// Order defines precedence: And, Or, Not, Named, Always
&AndRule{
&Languages{[]string{"Unix Assembly"}},
[]Matcher{
&NotRule{
nil,
[]*regexp.Regexp{regexp.MustCompile(`/\*`)},
},
&OrRule{
nil,
regexp.MustCompile(`^\s*\.(?:include\s|globa?l\s|[A-Za-z][_A-Za-z0-9]*:)`),
},
},
},
&OrRule{
&Languages{[]string{"Roff"}},
rule.And(
rule.MatchingLanguages("Unix Assembly"),
rule.Not(nil, regexp.MustCompile(`/\*`)),
rule.Or(
nil,
regexp.MustCompile(`^\s*\.(?:include\s|globa?l\s|[A-Za-z][_A-Za-z0-9]*:)`),
),
),
rule.Or(
rule.MatchingLanguages("Roff"),
regexp.MustCompile(`^[.''][A-Za-z]{2}(\s|$)`),
},
&AlwaysRule{
&Languages{[]string{"MAXScript"}},
},
),
rule.Always(
rule.MatchingLanguages("MAXScript"),
),
},
}

func TestContentHeuristics_MatchingAlways(t *testing.T) {
func TestContentHeuristic_MatchingAlways(t *testing.T) {
lang := testContentHeuristics[".md"].matchString("")
assert.Equal(t, []string{"Markdown"}, lang)

lang = testContentHeuristics[".ms"].matchString("")
assert.Equal(t, []string{"MAXScript"}, lang)
}

func TestContentHeuristics_MatchingAnd(t *testing.T) {
func TestContentHeuristic_MatchingAnd(t *testing.T) {
lang := testContentHeuristics[".md"].matchString(";;")
assert.Equal(t, []string{"GCC Machine Description"}, lang)
}

func TestContentHeuristics_MatchingOr(t *testing.T) {
func TestContentHeuristic_MatchingOr(t *testing.T) {
lang := testContentHeuristics[".ms"].matchString(" .include \"math.s\"")
assert.Equal(t, []string{"Unix Assembly"}, lang)
}
117 changes: 117 additions & 0 deletions data/rule/rule.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Package rule contains rule-based heuristic implementations.
// It is used in the generated code in content.go for disambiguation of languages
// with colliding extensions based on regexps from Linguist data.
package rule

import (
"regexp"
)

// Heuristic consists of a number of rules where each, if it matches,
// identifies content as belonging to one or more programming languages.
//
// A Heuristic is a Matcher that can also report the languages it identifies.
type Heuristic interface {
Matcher
// Languages returns the languages identified by this heuristic.
Languages() []string
}

// Matcher checks whether the data matches one or more patterns.
// Every rule below implements this interface: a rule is a matcher that
// identifies the given programming language(s) when it matches.
type Matcher interface {
// Match reports whether data satisfies this matcher.
Match(data []byte) bool
}

// languages encapsulates the data common to every rule: all the languages
// that the rule identifies.
type languages struct {
	langs []string
}

// Languages returns all languages identified by this rule.
//
// It is safe to call on a nil receiver, in which case it returns nil. Rules
// nested inside another rule are constructed with nil languages (for example
// Not(nil, ...)) yet still expose this method through embedding, so a nil
// receiver must not panic.
func (l *languages) Languages() []string {
	if l == nil {
		return nil
	}
	return l.langs
}

// MatchingLanguages is a helper to create new languages from the given names.
// The langs slice is stored as-is, without copying.
func MatchingLanguages(langs ...string) *languages {
	return &languages{langs}
}

// or is a Heuristic backed by a single regular expression.
type or struct {
	*languages
	Pattern *regexp.Regexp
}

// Or builds a rule that matches when its pattern matches the content.
// Only one pattern is accepted: alternatives are expected to be merged
// beforehand into a single regexp using the | operator.
func Or(l *languages, r *regexp.Regexp) *or {
	return &or{languages: l, Pattern: r}
}

// Match implements rule.Matcher by running the pattern against data.
func (o *or) Match(data []byte) bool {
	return o.Pattern.Match(data)
}

// and is a Heuristic composed of several nested matchers.
type and struct {
	*languages
	Patterns []Matcher
}

// And builds a rule that matches only when every given matcher matches.
func And(l *languages, m ...Matcher) *and {
	return &and{languages: l, Patterns: m}
}

// Match implements rule.Matcher. Matchers are evaluated in order and the
// evaluation stops at the first one that does not match. With no matchers
// at all the rule matches.
func (a *and) Match(data []byte) bool {
	for _, matcher := range a.Patterns {
		if !matcher.Match(data) {
			return false
		}
	}
	return true
}

// not is a Heuristic holding patterns that the content must not contain.
type not struct {
	*languages
	Patterns []*regexp.Regexp
}

// Not builds a rule that matches only when none of the given patterns match.
func Not(l *languages, r ...*regexp.Regexp) *not {
	return &not{languages: l, Patterns: r}
}

// Match implements rule.Matcher. Patterns are tried in order and the rule
// is rejected as soon as one of them matches. With no patterns at all the
// rule matches.
func (n *not) Match(data []byte) bool {
	for _, re := range n.Patterns {
		if re.Match(data) {
			return false
		}
	}
	return true
}

// always is a Heuristic that carries languages only and has no pattern.
type always struct {
	*languages
}

// Always builds a rule that matches any content. It is often used as the
// default fallback.
func Always(l *languages) *always {
	return &always{languages: l}
}

// Match implements rule.Matcher and reports true for any data.
func (*always) Match(data []byte) bool {
	return true
}
Loading

0 comments on commit 97ab29a

Please sign in to comment.