Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add detected file language to code search #10256

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/content/doc/advanced/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ relation to port exhaustion.
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
- `REPO_INDEXER_INCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **include** in the index. Use `**.txt` to match any files with .txt extension. An empty list means include all files.
- `REPO_INDEXER_EXCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **exclude** from the index. Files that match this list will not be indexed, even if they match in `REPO_INDEXER_INCLUDE`.
- `REPO_INDEXER_EXCLUDE_VENDORED`: **true**: Exclude vendored files from index.
- `UPDATE_BUFFER_LEN`: **20**: Buffer length of index request.
- `MAX_FILE_SIZE`: **1048576**: Maximum size in bytes of files to be indexed.
- `STARTUP_TIMEOUT`: **30s**: If the indexer takes longer than this timeout to start - fail. (This timeout will be added to the hammer time above for child processes - as bleve will not start until the previous parent is shutdown.) Set to zero to never timeout.
Expand Down
2 changes: 2 additions & 0 deletions docs/content/doc/advanced/repo-indexer.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ Gitea applies glob pattern matching from the [`gobwas/glob` library](https://git

Limiting the list of files prevents the indexes from becoming polluted with derived or irrelevant files (e.g. lss, sym, map, etc.), so the search results are more relevant. It can also help reduce the index size.

`REPO_INDEXER_EXCLUDE_VENDORED` (default: true) excludes vendored files from index.

`REPO_INDEXER_INCLUDE` (default: empty) is a comma separated list of glob patterns to **include** in the index. An empty list means "_include all files_".
`REPO_INDEXER_EXCLUDE` (default: empty) is a comma separated list of glob patterns to **exclude** from the index. Files that match this list will not be indexed. `REPO_INDEXER_EXCLUDE` takes precedence over `REPO_INDEXER_INCLUDE`.

Expand Down
36 changes: 36 additions & 0 deletions modules/analyze/code_langauge.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package analyze

import (
"path/filepath"

"github.com/src-d/enry/v2"
)

// GetCodeLanguageWithCallback detects code language based on file name and content using callback
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string {
if language, ok := enry.GetLanguageByExtension(filename); ok {
return language
}

if language, ok := enry.GetLanguageByFilename(filename); ok {
return language
}

content, err := contentFunc()
if err != nil {
return enry.OtherLanguage
}

return enry.GetLanguage(filepath.Base(filename), content)
}

// GetCodeLanguage detects code language based on file name and content
func GetCodeLanguage(filename string, content []byte) string {
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) {
return content, nil
})
}
27 changes: 9 additions & 18 deletions modules/git/repo_language_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ import (
"io"
"io/ioutil"
"math"
"path/filepath"

"code.gitea.io/gitea/modules/analyze"

"github.com/src-d/enry/v2"
"gopkg.in/src-d/go-git.v4"
Expand Down Expand Up @@ -51,25 +52,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e

// TODO: Use .gitattributes file for linguist overrides

language, ok := enry.GetLanguageByExtension(f.Name)
if !ok {
if language, ok = enry.GetLanguageByFilename(f.Name); !ok {
content, err := readFile(f, fileSizeLimit)
if err != nil {
return nil
}

language = enry.GetLanguage(filepath.Base(f.Name), content)
if language == enry.OtherLanguage {
return nil
}
}
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) {
return readFile(f, fileSizeLimit)
})
if language == enry.OtherLanguage || language == "" {
return nil
}

if language != "" {
sizes[language] += f.Size
total += f.Size
}
sizes[language] += f.Size
total += f.Size

return nil
})
Expand Down
114 changes: 96 additions & 18 deletions modules/indexer/code/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,28 @@ import (
"os"
"strconv"
"strings"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"

"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/analysis/analyzer/custom"
analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
"github.com/src-d/enry/v2"
)

const unicodeNormalizeName = "unicodeNormalize"
Expand Down Expand Up @@ -86,16 +91,23 @@ func openIndexer(path string, latestVersion int) (bleve.Index, error) {

// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
Content string
RepoID int64
CommitID string
Content string
Language string
UpdatedAt time.Time
}

// Type returns the document type, for bleve's mapping.Classifier interface.
func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}

func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
lafriks marked this conversation as resolved.
Show resolved Hide resolved
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
Expand All @@ -118,8 +130,11 @@ func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.Flushin

id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
Content: string(charset.ToUTF8DropErrors(fileContents)),
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

Expand All @@ -131,7 +146,7 @@ func addDelete(filename string, repo *models.Repository, batch rupture.FlushingB
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 4
repoIndexerLatestVersion = 5
)

// createRepoIndexer create a repo indexer if one does not already exist
Expand All @@ -145,11 +160,21 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)

termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)

timeFieldMapping := bleve.NewDateTimeFieldMapping()
timeFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)

mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
"type": custom.Name,
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
Expand Down Expand Up @@ -255,7 +280,7 @@ func (b *BleveIndexer) Index(repoID int64) error {

batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(update, repo, batch); err != nil {
if err := addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
Expand Down Expand Up @@ -289,7 +314,7 @@ func (b *BleveIndexer) Delete(repoID int64) error {

// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
Expand All @@ -309,16 +334,35 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
indexerQuery = phraseQuery
}

// Save for reuse without language filter
facetQuery := indexerQuery
if len(language) > 0 {
languageQuery := bleve.NewMatchQuery(language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name

indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
)
}

from := (page - 1) * pageSize
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID"}
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true

if len(language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}

result, err := b.indexer.Search(searchRequest)
if err != nil {
return 0, nil, err
return 0, nil, nil, err
}

total := int64(result.Total)

searchResults := make([]*SearchResult, len(result.Hits))
for i, hit := range result.Hits {
var startIndex, endIndex int = -1, -1
Expand All @@ -333,13 +377,47 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
endIndex = locationEnd
}
}
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
updatedUnix = timeutil.TimeStamp(t.Unix())
}
searchResults[i] = &SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: filenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: filenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
lafriks marked this conversation as resolved.
Show resolved Hide resolved
UpdatedUnix: updatedUnix,
Language: language,
Color: enry.GetColor(language),
}
}

searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
if len(language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))

if result, err = b.indexer.Search(facetRequest); err != nil {
return 0, nil, nil, err
}

}
languagesFacet := result.Facets["languages"]
for _, term := range languagesFacet.Terms {
if len(term.Term) == 0 {
continue
}
searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
lafriks marked this conversation as resolved.
Show resolved Hide resolved
Language: term.Term,
Color: enry.GetColor(term.Term),
Count: term.Count,
})
}
return int64(result.Total), searchResults, nil
return total, searchResults, searchResultLanguages, nil
}
9 changes: 8 additions & 1 deletion modules/indexer/code/bleve_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,27 +49,34 @@ func TestIndexAndSearch(t *testing.T) {
keywords = []struct {
Keyword string
IDs []int64
Langs int
}{
{
Keyword: "Description",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "repo1",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "non-exist",
IDs: []int64{},
Langs: 0,
},
}
)

for _, kw := range keywords {
total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)

assert.NotNil(t, langs)
assert.Len(t, langs, kw.Langs)

var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
Expand Down
24 changes: 18 additions & 6 deletions modules/indexer/code/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,34 @@ import (
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
)

// SearchResult result of performing a search in a repo
type SearchResult struct {
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
Color string
}

// SearchResultLanguages result of top languages count in search results
type SearchResultLanguages struct {
Language string
Color string
Count int
}

// Indexer defines an interface to indexer issues contents
type Indexer interface {
Index(repoID int64) error
Delete(repoID int64) error
Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error)
Close()
}

Expand Down
Loading