mandiant · mr-tz · Feb 1, 2023 · Jan 30, 2023 · Jan 30, 2023 · williballenthin
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -98,6 +98,8 @@
 - handle vivisect bug around strings at instruction level, use min length 4 #1271 @williballenthin @mr-tz
 - extractor: guard against invalid "calls from" features #1177 @mr-tz
 - extractor: add format to global features #1258 @mr-tz
+- extractor: discover all strings with length >= 4 #1280 @mr-tz
+- extractor: don't extract byte features for strings #1293 @mr-tz
 
 ### capa explorer IDA Pro plugin
 - fix: display instruction items #1154 @mr-tz

diff --git a/capa/features/extractors/dnfile/insn.py b/capa/features/extractors/dnfile/insn.py
@@ -191,7 +191,8 @@ def extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iter
     if user_string is None:
         return
 
-    yield String(user_string), ih.address
+    if len(user_string) >= 4:
+        yield String(user_string), ih.address
 
 
 def extract_unmanaged_call_characteristic_features(

diff --git a/capa/features/extractors/ida/helpers.py b/capa/features/extractors/ida/helpers.py
@@ -197,7 +197,7 @@ def read_bytes_at(ea: int, count: int) -> bytes:
 def find_string_at(ea: int, min_: int = 4) -> str:
     """check if ASCII string exists at a given virtual address"""
     found = idaapi.get_strlit_contents(ea, -1, idaapi.STRTYPE_C)
-    if found and len(found) > min_:
+    if found and len(found) >= min_:
         try:
             found = found.decode("ascii")
             # hacky check for IDA bug; get_strlit_contents also reads Unicode as

diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py
@@ -172,7 +172,9 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl
     if ref != insn.ea:
         extracted_bytes = capa.features.extractors.ida.helpers.read_bytes_at(ref, MAX_BYTES_FEATURE_SIZE)
         if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes):
-            yield Bytes(extracted_bytes), ih.address
+            if not capa.features.extractors.ida.helpers.find_string_at(ref):
+                # don't extract byte features when the referenced data (not the instruction) is an obvious string
+                yield Bytes(extracted_bytes), ih.address
 
 
 def extract_insn_string_features(

diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py
@@ -271,6 +271,10 @@ def extract_insn_bytes_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Itera
             if capa.features.extractors.helpers.all_zeros(buf):
                 continue
 
+            if f.vw.isProbablyString(v):
+                # don't extract byte features for obvious strings
+                continue
+
             yield Bytes(buf), ih.address
 
 
@@ -676,7 +680,7 @@ def extract_op_string_features(
         except ValueError:
             continue
         else:
-            if len(s) > 4:
+            if len(s) >= 4:
                 yield String(s), ih.address