Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

markup for %3D separator #175

Merged
merged 13 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
META MD5 414228344bac7e55c5127be7b244e460
DATA MD5 abd9c025d5c323af814fbeb33f469c90
DATA: 16342283 interested lines. MARKUP: 62020 items
META MD5 5bb0a05fd77c2761b8414bba41103939
DATA MD5 9e77a2d9f718f175264ab5a386ae86c4
DATA: 16342283 interested lines. MARKUP: 62022 items
FileType FileNumber ValidLines Positives Negatives Templates
--------------- ------------ ------------ ----------- ----------- -----------
194 28318 71 418 90
Expand Down Expand Up @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 134 5
.j 1 241 4
.j2 30 5530 6 186 10
.java 621 134132 362 1365 171
.java 621 134132 368 1365 171
.jenkinsfile 1 58 2 6
.jinja2 1 64 2
.js 659 536413 531 2497 331
Expand Down Expand Up @@ -222,7 +222,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.yml 419 36169 559 889 376
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10232 16342283 12255 49692 5101
TOTAL: 10232 16342283 12261 49692 5101
credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ----
Expand All @@ -231,7 +231,7 @@ AWS Client ID 168 21 0
AWS Multi 82 10 0 0 0 10 82 0.000000 1.000000 0.108696 0.000000
AWS S3 Bucket 67 23 0 0 0 23 67 0.000000 1.000000 0.255556 0.000000
Atlassian Old PAT token 27 308 3 0 0 311 27 0.000000 1.000000 0.920118 0.000000
Auth 414 2739 82 0 0 2821 414 0.000000 1.000000 0.872025 0.000000
Auth 417 2739 82 0 0 2821 417 0.000000 1.000000 0.871217 0.000000
Azure Access Token 19 0 0 0 0 0 19 1.000000 0.000000 0.000000
BASE64 Private Key 7 4 0 0 0 4 7 0.000000 1.000000 0.363636 0.000000
BASE64 encoded PEM Private Key 7 0 0 0 0 0 7 1.000000 0.000000 0.000000
Expand All @@ -258,7 +258,7 @@ JSON Web Token 170 61 0
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 15 6 1 0 0 7 15 0.000000 1.000000 0.318182 0.000000
Key 3909 15717 485 0 0 16202 3909 0.000000 1.000000 0.805629 0.000000
Nonce 91 49 0 0 0 49 91 0.000000 1.000000 0.350000 0.000000
Nonce 93 49 0 0 0 49 93 0.000000 1.000000 0.345070 0.000000
Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000
PEM Private Key 1019 1483 0 0 0 1483 1019 0.000000 1.000000 0.592726 0.000000
Password 1869 7536 2680 0 0 10216 1869 0.000000 1.000000 0.845345 0.000000
Expand All @@ -267,8 +267,8 @@ Secret 1297 1576 802
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 0 0 1 4 0.000000 1.000000 0.200000 0.000000
Tencent WeChat API App ID 6 0 0 0 0 0 6 1.000000 0.000000 0.000000
Token 643 4170 454 0 0 4624 643 0.000000 1.000000 0.877919 0.000000
Token 644 4170 454 0 0 4624 644 0.000000 1.000000 0.877752 0.000000
Twilio Credentials 30 39 0 0 0 39 30 0.000000 1.000000 0.565217 0.000000
URL Credentials 210 157 215 0 0 372 210 0.000000 1.000000 0.639175 0.000000
UUID 1069 265 0 0 0 265 1069 0.000000 1.000000 0.198651 0.000000
12255 49692 5101 0 0 0 49692 12255 0.000000 1.000000 0.802170 0.000000
12261 49692 5101 0 0 0 49692 12261 0.000000 1.000000 0.802092 0.000000
5 changes: 4 additions & 1 deletion benchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ def get_arguments() -> Namespace:
parser.add_argument("--load",
help=f"skip scan and use prepared output",
dest="load")
parser.add_argument("--fix",
help=f"add/update markup for unknown credentials",
action="store_true")
return parser.parse_args()


def main() -> None:
args = get_arguments()
benchmark = Benchmark()
if args.scanner in SCANNER_LIST:
benchmark.run(args.scanner, args.load)
benchmark.run(args.scanner, args.load, args.fix)
else:
print(f"Please check scanner name (support: {SCANNER_LIST})")

Expand Down
8 changes: 6 additions & 2 deletions benchmark/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,13 @@ def set_cred_data(self) -> str:
subprocess.call(["./venv/bin/python", "download_data.py", "--data_dir", "data"], cwd=cred_data_path)
return cred_data_path

def run(self, scanner_type: str, output: Optional[str] = None) -> None:
def run(self, scanner_type: str, output: Optional[str] = None, fix: Optional[bool] = None) -> None:
if _scanner_type := getattr(ScannerType, scanner_type.strip().upper(), None):
scanner = ScannerFactory.create_scanner(_scanner_type, self.working_dir, self.cred_data_path, bool(output))
scanner = ScannerFactory.create_scanner(_scanner_type,
self.working_dir,
self.cred_data_path,
bool(output),
bool(fix))
else:
raise RuntimeError(f"Wrong scanner_type='{scanner_type}'")
if output:
Expand Down
7 changes: 4 additions & 3 deletions benchmark/scanner/credential_digger.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

class CredentialDigger(Scanner):

def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.CREDENTIAL_DIGGER, URL.CREDENTIAL_DIGGER, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None:
super().__init__(ScannerType.CREDENTIAL_DIGGER, URL.CREDENTIAL_DIGGER, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.db"
self.working_dir: str = working_dir

Expand Down Expand Up @@ -64,4 +64,5 @@ def parse_result(self) -> None:
line_data = {"file_name": data[1], "line_number": data[2]}
if line_data["file_name"].split("/")[-1] == "LICENSE" or "COPYING" in line_data["file_name"].split("/")[-1]:
continue
_, _, _ = self.check_line_from_meta(line_data["file_name"], line_data["line_number"], line_data["line_number"])
_, _, _ = self.check_line_from_meta(line_data["file_name"], line_data["line_number"],
line_data["line_number"])
4 changes: 2 additions & 2 deletions benchmark/scanner/credsweeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class CredSweeper(Scanner):
LineStatus.NOT_IN_DB: 'N',
LineStatus.CHECKED: 'C'}

def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.CREDSWEEPER, URL.CREDSWEEPER, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix:bool) -> None:
super().__init__(ScannerType.CREDSWEEPER, URL.CREDSWEEPER, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand Down
4 changes: 2 additions & 2 deletions benchmark/scanner/detect_secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


class DetectSecrets(Scanner):
def __init__(self, working_dir, cred_data_dir, preload: bool):
super().__init__(ScannerType.DETECT_SECRETS, URL.DETECT_SECRETS, working_dir, cred_data_dir, preload)
def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool):
super().__init__(ScannerType.DETECT_SECRETS, URL.DETECT_SECRETS, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand Down
6 changes: 3 additions & 3 deletions benchmark/scanner/gitleaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class Gitleaks(Scanner):
def __init__(self, working_dir, cred_data_dir, preload: bool):
super().__init__(ScannerType.GITLEAKS, URL.GITLEAKS, working_dir, cred_data_dir, preload)
def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool):
super().__init__(ScannerType.GITLEAKS, URL.GITLEAKS, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand All @@ -26,7 +26,7 @@ def init_scanner(self) -> None:
def run_scanner(self) -> None:
self.init_scanner()
subprocess.call([self.gitleaks_path, "--no-git", "-p"
f"{self.cred_data_dir}/data", "-o", self.output_dir],
f"{self.cred_data_dir}/data", "-o", self.output_dir],
cwd=self.scanner_dir)

def parse_result(self) -> None:
Expand Down
39 changes: 38 additions & 1 deletion benchmark/scanner/scanner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import binascii
import hashlib
import os
import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Tuple, Dict, List, Any
Expand All @@ -16,10 +17,11 @@

class Scanner(ABC):
def __init__(self, scanner_type: ScannerType, scanner_url: str, working_dir: str, cred_data_dir: str,
preload: bool) -> None:
preload: bool, fix: bool) -> None:
self.scanner_type = scanner_type
self.scanner_dir: str = GitService.set_scanner_up_to_date(working_dir, scanner_url, preload)
self.cred_data_dir: str = cred_data_dir
self.fix = fix
self.line_checker: set = set()
self.result_cnt: int = 0
self.lost_cnt: int = 0
Expand Down Expand Up @@ -263,6 +265,36 @@ def check_line_from_meta(self,
self.lost_cnt += 1
self.meta_next_id += 1
print(f"NOT FOUND WITH KEY: {approximate}", flush=True)
if self.fix:
with open(f"{self.cred_data_dir}/meta/{project_id}.csv", "a") as f:
f.write(f"{str(approximate)}\n")
lost_meta = MetaRow({
"Id": self.meta_next_id,
"FileID": file_id,
"Domain": "GitHub",
"RepoName": project_id,
"FilePath": data_path,
"LineStart": line_start,
"LineEnd": line_end,
"GroundTruth": 'F',
"WithWords": 'F',
"ValueStart": value_start,
"ValueEnd": value_end,
"InURL": 'F',
"InRuntimeParameter": 'F',
"CharacterSet": '',
"CryptographyKey": '',
"PredefinedPattern": '',
"VariableNameType": '',
"Entropy": 0.0,
"Length": 0,
"Base64Encode": 'F',
"HexEncode": 'F',
"URLEncode": 'F',
"Category": rule
})
self.meta[MetaKey(data_path, line_start, line_end)] = [lost_meta]

return LineStatus.NOT_IN_DB, project_id, file_id

suggestion = "LOST:"
Expand Down Expand Up @@ -326,6 +358,11 @@ def check_line_from_meta(self,
return LineStatus.TRUE, project_id, file_id
else:
print(f"WARNING: '{rule}' is not mentioned in {row}")
if self.fix:
subprocess.check_call(
["sed", "-i",
f"s/{row.Id},\\(.*\\)/{row.Id},\\1:{rule}/",
f"{self.cred_data_dir}/meta/{row.RepoName}.csv"])
# meta has no markup for given credential
self.lost_cnt += 1
print(f"{suggestion} {approximate}", flush=True)
Expand Down
23 changes: 14 additions & 9 deletions benchmark/scanner/scanner_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,33 @@

class ScannerFactory:
@classmethod
def create_scanner(cls, scanner_type: ScannerType, working_dir: str, cred_data_dir: str, preload: bool) -> Scanner:
def create_scanner(cls,
scanner_type: ScannerType,
working_dir: str,
cred_data_dir: str,
preload: bool,
fix: bool) -> Scanner:
if scanner_type == ScannerType.CREDSWEEPER:
from benchmark.scanner import CredSweeper
return CredSweeper(working_dir, cred_data_dir, preload)
return CredSweeper(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.DETECT_SECRETS:
from benchmark.scanner import DetectSecrets
return DetectSecrets(working_dir, cred_data_dir, preload)
return DetectSecrets(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.GITLEAKS:
from benchmark.scanner import Gitleaks
return Gitleaks(working_dir, cred_data_dir, preload)
return Gitleaks(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.SHHGIT:
from benchmark.scanner import Shhgit
return Shhgit(working_dir, cred_data_dir, preload)
return Shhgit(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.CREDENTIAL_DIGGER:
from benchmark.scanner import CredentialDigger
return CredentialDigger(working_dir, cred_data_dir, preload)
return CredentialDigger(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.WRAITH:
from benchmark.scanner import Wraith
return Wraith(working_dir, cred_data_dir, preload)
return Wraith(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.TRUFFLEHOG3:
from benchmark.scanner import TruffleHog3
return TruffleHog3(working_dir, cred_data_dir, preload)
return TruffleHog3(working_dir, cred_data_dir, preload, fix)
elif scanner_type == ScannerType.TRUFFLEHOG:
from benchmark.scanner import TruffleHog
return TruffleHog(working_dir, cred_data_dir, preload)
return TruffleHog(working_dir, cred_data_dir, preload, fix)
4 changes: 2 additions & 2 deletions benchmark/scanner/shhgit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class Shhgit(Scanner):
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.SHHGIT, URL.SHHGIT, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None:
super().__init__(ScannerType.SHHGIT, URL.SHHGIT, working_dir, cred_data_dir, preload, fix)
self.output_dir = f"{self.scanner_dir}/output.csv"

@property
Expand Down
4 changes: 2 additions & 2 deletions benchmark/scanner/trufflehog.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


class TruffleHog(Scanner):
def __init__(self, working_dir, cred_data_dir, preload: bool):
super().__init__(ScannerType.TRUFFLEHOG, URL.TRUFFLEHOG, working_dir, cred_data_dir, preload)
def __init__(self, working_dir, cred_data_dir, preload: bool, fix: bool):
super().__init__(ScannerType.TRUFFLEHOG, URL.TRUFFLEHOG, working_dir, cred_data_dir, preload, fix)
self.output_dir: str = f"{self.scanner_dir}/output.json"

@property
Expand Down
9 changes: 5 additions & 4 deletions benchmark/scanner/trufflehog3.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@


class TruffleHog3(Scanner):
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool) -> None:
super().__init__(ScannerType.TRUFFLEHOG3, URL.TRUFFLEHOG3, working_dir, cred_data_dir, preload)
def __init__(self, working_dir: str, cred_data_dir: str, preload: bool, fix: bool) -> None:
super().__init__(ScannerType.TRUFFLEHOG3, URL.TRUFFLEHOG3, working_dir, cred_data_dir, preload, fix)
self.output_dir = f"{self.scanner_dir}/output.json"
if os.path.exists(self.output_dir):
os.remove(self.output_dir)
Expand All @@ -33,7 +33,7 @@ def run_scanner(self) -> None:
"./venv/bin/trufflehog3", f"{self.cred_data_dir}/data/", "-o", self.output_dir, "-f", "json",
"--line-numbers"
],
cwd=self.scanner_dir)
cwd=self.scanner_dir)

def parse_result(self) -> None:
with open(self.output_dir, "r") as f:
Expand All @@ -44,4 +44,5 @@ def parse_result(self) -> None:
line_data = {"path": data["path"], "line_number": int(line.split(" ")[0])}
if line_data["path"].split("/")[-1] == "LICENSE":
continue
_, _, _ = self.check_line_from_meta(line_data["path"], line_data["line_number"], line_data["line_number"])
_, _, _ = self.check_line_from_meta(line_data["path"], line_data["line_number"],
line_data["line_number"])
Loading
Loading