avast · regeciovad · Jun 9, 2024 · Jun 9, 2024 · Jun 9, 2024 · Jun 9, 2024
diff --git a/genrex/classes.py b/genrex/classes.py
@@ -52,7 +52,8 @@
     "genrexsimpleguid",
     "genrexthex",
     "genrexhex8",
-    "genrexexem",
+    "genrexurl",
+    "genrexphone",
 ]
 
 

diff --git a/genrex/clustering.py b/genrex/clustering.py
@@ -15,7 +15,6 @@
 from dataclasses import dataclass, field
 from statistics import mean, median
 
-from genrex.enums import InputType
 from genrex.logging import logger
 from genrex.misc import filter_ngrams, ready_to_print, string2ngrams
 
@@ -105,6 +104,7 @@ def __init__(self, store_original_strings: bool = False):
         self.unique_ngrams: dict = defaultdict(dict)
         self.index: dict = defaultdict(dict)
         self.min_ngram = 4
+        self.max_ngram = 270
         self.input_type: str = ""
 
     def add_resource(
@@ -120,9 +120,13 @@ def add_resource(
         self.input_type = input_type
         for source, strings in data.items():
             for string in strings:
-                if len(string) < self.min_ngram:
+                check_len = len(string)
+                if check_len < self.min_ngram:
                     logger.warning(f"String {string} is too short")
                     continue
+                elif check_len > self.max_ngram:
+                    logger.warning(f"String {string} is too long")
+                    continue
 
                 prep_string = string
 
@@ -137,14 +141,14 @@ def add_resource(
 
                 self.add(string, prep_string, source)
 
-        self.extract_ngrams(self.input_type)
+        self.extract_ngrams()
 
     def filter_short_strings(self):
         self.samples = dict(
             filter(lambda x: (len(x[0]) >= self.ngrams), self.samples.items())
         )
 
-    def extract_ngrams(self, input_type):
+    def extract_ngrams(self):
         splited_list = {}
         for prep_string in self.samples:
             self.unique_ngrams[prep_string] = []
@@ -166,11 +170,8 @@ def extract_ngrams(self, input_type):
             return
 
         self.ngrams = int((median(splited) + mean(splited)) // 2)
-        if input_type not in [
-            InputType.FILE_ACCESS,
-            InputType.KEY_ACCESS,
-        ]:
-            self.ngrams = self.ngrams // 2
+
+        self.ngrams = self.ngrams // 2
 
         self.ngrams = max(self.ngrams, self.min_ngram)
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -64,7 +64,8 @@ disable = [
     "too-many-nested-blocks",
     "too-many-branches",
     "too-many-locals",
-    "too-many-function-args"
+    "too-many-function-args",
+    "too-many-instance-attributes"
 ]
 
 [build-system]