Skip to content

Commit

Permalink
Merge different languages for language stats (go-gitea#24900) (go-git…
Browse files Browse the repository at this point in the history
…ea#24921)

Backport go-gitea#24900

Fix go-gitea#24896

If users set different languages by `linguist-language`, the `stats` map
could be: `java: 100, Java: 200`.

Language stats are stored case-insensitively in the database, and there is a
unique key, so names that differ only by case collide.

So, the different language names should be merged to one unique name:
`Java: 300`

(cherry picked from commit a83d597)
  • Loading branch information
wxiaoguang authored and earl-warren committed May 30, 2023
1 parent 5fdac0d commit 6d3b3af
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 5 deletions.
39 changes: 39 additions & 0 deletions modules/git/repo_language_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,46 @@

package git

import (
"strings"
"unicode"
)

// Size thresholds, in bytes.
// NOTE(review): their use is not visible in this part of the file; presumably they bound
// how much file content is considered when computing language stats — confirm against callers.
const (
	fileSizeLimit int64 = 16 << 10 // 16 KiB
	bigFileSize   int64 = 1 << 20  // 1 MiB
)

// mergeLanguageStats merges language names that differ only by letter case and sums
// their sizes under a single name.
//
// For each case-insensitive group, the spelling with the most upper-case letters is used
// as the unique name. When several spellings have the same number of upper-case letters,
// the byte-wise smallest one is chosen, so the result does not depend on map iteration order.
//
// The input map is not modified; a new, non-nil map is always returned.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	type candidate struct {
		uniqueName string
		upperCount int
	}
	// names maps the lower-cased language name to the spelling chosen for its group.
	names := make(map[string]candidate, len(stats))

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		cur, seen := names[lower]
		// Map iteration order is random, so ties on the upper-case count must be broken
		// explicitly; otherwise the winning spelling would vary between runs.
		if !seen || cnt > cur.upperCount || (cnt == cur.upperCount && name < cur.uniqueName) {
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}
2 changes: 1 addition & 1 deletion modules/git/repo_language_stats_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
}

return sizes, nil
return mergeLanguageStats(sizes), nil
}

func readFile(f *object.File, limit int64) ([]byte, error) {
Expand Down
8 changes: 4 additions & 4 deletions modules/git/repo_language_stats_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)
if language == enry.OtherLanguage || language == "" {
if language == "" {
continue
}

Expand All @@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err

included, checked := includedLanguage[language]
if !checked {
langtype := enry.GetLanguageType(language)
included = langtype == enry.Programming || langtype == enry.Markup
langType := enry.GetLanguageType(language)
included = langType == enry.Programming || langType == enry.Markup
includedLanguage[language] = included
}
if included {
Expand All @@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
}

return sizes, nil
return mergeLanguageStats(sizes), nil
}

func discardFull(rd *bufio.Reader, discard int64) error {
Expand Down
14 changes: 14 additions & 0 deletions modules/git/repo_language_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
"Java": 112,
}, stats)
}

func TestMergeLanguageStats(t *testing.T) {
assert.EqualValues(t, map[string]int64{
"PHP": 1,
"python": 10,
"JAVA": 700,
}, mergeLanguageStats(map[string]int64{
"PHP": 1,
"python": 10,
"Java": 100,
"java": 200,
"JAVA": 400,
}))
}

0 comments on commit 6d3b3af

Please sign in to comment.