From ffc7db44ab675f2c97046f1440a1554eba7366fb Mon Sep 17 00:00:00 2001 From: Issy Long Date: Wed, 1 May 2024 14:04:37 +0100 Subject: [PATCH 1/2] Allow configuring a repository's `max_tree_size` value - We want to make Linguist better at detecting all possible languages in very large repos. - This makes a repository's `max_tree_size` value configurable, with the default still set at 100,000. --- lib/linguist/repository.rb | 11 ++++++----- test/test_repository.rb | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 39eead6a34..e6d957144d 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -12,8 +12,8 @@ class Repository # Public: Create a new Repository based on the stats of # an existing one - def self.incremental(repo, commit_oid, old_commit_oid, old_stats) - repo = self.new(repo, commit_oid) + def self.incremental(repo, commit_oid, old_commit_oid, old_stats, max_tree_size = 100_000) + repo = self.new(repo, commit_oid, max_tree_size) repo.load_existing_stats(old_commit_oid, old_stats) repo end @@ -24,11 +24,13 @@ def self.incremental(repo, commit_oid, old_commit_oid, old_stats) # repo - a Rugged::Repository object # commit_oid - the sha1 of the commit that will be analyzed; # this is usually the master branch + # max_tree_size - the maximum tree size to consider for analysis (default: 100,000) # # Returns a Repository - def initialize(repo, commit_oid) + def initialize(repo, commit_oid, max_tree_size = 100_000) @repository = repo @commit_oid = commit_oid + @max_tree_size = max_tree_size @old_commit_oid = nil @old_stats = nil @@ -129,10 +131,9 @@ def current_tree end protected - MAX_TREE_SIZE = 100_000 def compute_stats(old_commit_oid, cache = nil) - return {} if current_tree.count_recursive(MAX_TREE_SIZE) >= MAX_TREE_SIZE + return {} if current_tree.count_recursive(@max_tree_size) >= @max_tree_size old_tree = old_commit_oid && Rugged::Commit.lookup(repository, old_commit_oid).tree read_index diff --git a/test/test_repository.rb b/test/test_repository.rb index 9a6e503980..2dcfdfce01 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -87,8 +87,8 @@ def test_commit_with_git_attributes_data # With some .gitattributes data attr_commit = '7ee006cbcb2d7261f9e648510a684ee9ac64126b' - # It's incremental but should bust the cache - new_repo = Linguist::Repository.incremental(rugged_repository, attr_commit, old_commit, old_repo.cache) + # It's incremental but now is scanning more data and should bust the cache + new_repo = Linguist::Repository.incremental(rugged_repository, attr_commit, old_commit, old_repo.cache, 350_000) assert new_repo.breakdown_by_file["Java"].include?("lib/linguist.rb") end From daa9fb05802d0e1ca3e75f1db2d869a495e1f906 Mon Sep 17 00:00:00 2001 From: Issy Long Date: Wed, 1 May 2024 18:23:59 +0100 Subject: [PATCH 2/2] Use `MAX_TREE_SIZE` constant as default value - So that we only have to update the `100_000` tree size value once if we do need to change it. --- lib/linguist/repository.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index e6d957144d..6ef697f07b 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -10,9 +10,11 @@ module Linguist class Repository attr_reader :repository + MAX_TREE_SIZE = 100_000 + # Public: Create a new Repository based on the stats of # an existing one - def self.incremental(repo, commit_oid, old_commit_oid, old_stats, max_tree_size = 100_000) + def self.incremental(repo, commit_oid, old_commit_oid, old_stats, max_tree_size = MAX_TREE_SIZE) repo = self.new(repo, commit_oid, max_tree_size) repo.load_existing_stats(old_commit_oid, old_stats) repo @@ -24,10 +26,10 @@ def self.incremental(repo, commit_oid, old_commit_oid, old_stats, max_tree_size # repo - a Rugged::Repository object # commit_oid - the sha1 of the commit that will be analyzed; # this is usually the master branch - # max_tree_size - the maximum tree size to consider for analysis (default: 100,000) + # max_tree_size - the maximum tree size to consider for analysis (default: MAX_TREE_SIZE) # # Returns a Repository - def initialize(repo, commit_oid, max_tree_size = 100_000) + def initialize(repo, commit_oid, max_tree_size = MAX_TREE_SIZE) @repository = repo @commit_oid = commit_oid @max_tree_size = max_tree_size @@ -131,7 +133,6 @@ def current_tree end protected - def compute_stats(old_commit_oid, cache = nil) return {} if current_tree.count_recursive(@max_tree_size) >= @max_tree_size