diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt index d1092ee57..7a71002a1 100644 --- a/.ci/benchmark.txt +++ b/.ci/benchmark.txt @@ -1,6 +1,6 @@ -META MD5 5bb0a05fd77c2761b8414bba41103939 -DATA MD5 9e77a2d9f718f175264ab5a386ae86c4 -DATA: 16342283 interested lines. MARKUP: 62022 items +META MD5 491a59236c4d6280b46e0285ce2209e4 +DATA MD5 65e29f238760e1283df0f9762f9f1459 +DATA: 16342283 interested lines. MARKUP: 62023 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 71 418 90 @@ -63,7 +63,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .gd 1 37 1 .gml 3 3075 16 .gni 3 5017 19 -.go 1080 566476 687 4131 747 +.go 1080 566476 688 4131 747 .golden 5 1168 1 13 29 .gradle 45 3265 4 90 100 .graphql 7 420 13 @@ -82,11 +82,11 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 134 5 .j 1 241 4 .j2 30 5530 6 186 10 -.java 621 134132 368 1365 171 +.java 621 134132 370 1365 171 .jenkinsfile 1 58 2 6 .jinja2 1 64 2 -.js 659 536413 531 2497 331 -.json 851 13046493 1077 10907 140 +.js 659 536413 533 2497 331 +.json 851 13046493 1079 10907 140 .jsp 13 3202 1 40 .jsx 7 857 19 .jwt 1 1 2 @@ -153,7 +153,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 681 3303 726 +.py 890 291553 682 3303 726 .pyi 4 1361 9 .pyp 1 167 1 .pyx 2 1094 23 @@ -206,10 +206,10 @@ FileType FileNumber ValidLines Positives Negatives Templat .toml 83 2379 53 105 156 .tpl 1 43 1 .travis 1 34 4 3 1 -.ts 583 106730 157 1800 203 +.ts 583 106730 158 1800 203 .tsx 54 7914 1 114 5 .ttar 1 452 1 -.txt 440 78102 5287 6354 49 +.txt 440 78102 5289 6354 49 .utf8 1 77 2 .vsixmanifest 1 36 1 .vsmdi 1 6 2 @@ -222,7 +222,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .yml 419 36169 559 889 376 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10232 16342283 12261 49692 5101 +TOTAL: 10232 16342283 12272 49692 5101 credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ---- @@ -233,7 +233,7 @@ AWS S3 Bucket 67 23 0 Atlassian Old PAT token 27 308 3 0 0 311 27 0.000000 1.000000 0.920118 0.000000 Auth 417 2739 82 0 0 2821 417 0.000000 1.000000 0.871217 0.000000 Azure Access Token 19 0 0 0 0 0 19 1.000000 0.000000 0.000000 -BASE64 Private Key 7 4 0 0 0 4 7 0.000000 1.000000 0.363636 0.000000 +BASE64 Private Key 12 4 0 0 0 4 12 0.000000 1.000000 0.250000 0.000000 BASE64 encoded PEM Private Key 7 0 0 0 0 0 7 1.000000 0.000000 0.000000 Bitbucket Client ID 143 2095 9 0 0 2104 143 0.000000 1.000000 0.936360 0.000000 Bitbucket Client Secret 301 807 10 0 0 817 301 0.000000 1.000000 0.730769 0.000000 @@ -270,5 +270,5 @@ Tencent WeChat API App ID 6 0 0 Token 644 4170 454 0 0 4624 644 0.000000 1.000000 0.877752 0.000000 Twilio Credentials 30 39 0 0 0 39 30 0.000000 1.000000 0.565217 0.000000 URL Credentials 210 157 215 0 0 372 210 0.000000 1.000000 0.639175 0.000000 -UUID 1069 265 0 0 0 265 1069 0.000000 1.000000 0.198651 0.000000 - 12261 49692 5101 0 0 0 49692 12261 0.000000 1.000000 0.802092 0.000000 +UUID 1075 265 0 0 0 265 1075 0.000000 1.000000 0.197761 0.000000 + 12272 49692 5101 0 0 0 49692 12272 0.000000 1.000000 0.801950 0.000000 diff --git a/benchmark/scanner/scanner.py b/benchmark/scanner/scanner.py index 81c68c966..e1b13b629 100644 --- a/benchmark/scanner/scanner.py +++ b/benchmark/scanner/scanner.py @@ -260,41 +260,40 @@ def check_line_from_meta(self, f",{line_start},{line_end}" \ f",F,F,{value_start},{value_end}" \ f",F,F,,,,,0.0,0,F,F,F,{rule}" + lost_meta = MetaRow({ + "Id": self.meta_next_id, + "FileID": file_id, + "Domain": "GitHub", + "RepoName": project_id, + "FilePath": data_path, + "LineStart": line_start, + "LineEnd": line_end, + "GroundTruth": 'F', + "WithWords": 'F', + "ValueStart": value_start, + "ValueEnd": value_end, + "InURL": 'F', + "InRuntimeParameter": 'F', + "CharacterSet": '', + "CryptographyKey": '', + "PredefinedPattern": '', + "VariableNameType": '', + "Entropy": 0.0, + "Length": 0, + "Base64Encode": 'F', + "HexEncode": 'F', + "URLEncode": 'F', + "Category": rule + }) if not (rows := self.meta.get(MetaKey(data_path, line_start, line_end))): self.lost_cnt += 1 - self.meta_next_id += 1 print(f"NOT FOUND WITH KEY: {approximate}", flush=True) if self.fix: with open(f"{self.cred_data_dir}/meta/{project_id}.csv", "a") as f: f.write(f"{str(approximate)}\n") - lost_meta = MetaRow({ - "Id": self.meta_next_id, - "FileID": file_id, - "Domain": "GitHub", - "RepoName": project_id, - "FilePath": data_path, - "LineStart": line_start, - "LineEnd": line_end, - "GroundTruth": 'F', - "WithWords": 'F', - "ValueStart": value_start, - "ValueEnd": value_end, - "InURL": 'F', - "InRuntimeParameter": 'F', - "CharacterSet": '', - "CryptographyKey": '', - "PredefinedPattern": '', - "VariableNameType": '', - "Entropy": 0.0, - "Length": 0, - "Base64Encode": 'F', - "HexEncode": 'F', - "URLEncode": 'F', - "Category": rule - }) self.meta[MetaKey(data_path, line_start, line_end)] = [lost_meta] - + self.meta_next_id += 1 return LineStatus.NOT_IN_DB, project_id, file_id suggestion = "LOST:" @@ -363,10 +362,17 @@ def check_line_from_meta(self, ["sed", "-i", f"s/{row.Id},\\(.*\\)/{row.Id},\\1:{rule}/", f"{self.cred_data_dir}/meta/{row.RepoName}.csv"]) + self.meta[MetaKey(data_path, line_start, line_end)].append(lost_meta) + lost_meta = None + # meta has no markup for given credential self.lost_cnt += 1 print(f"{suggestion} {approximate}", flush=True) self.meta_next_id += 1 + if lost_meta and self.fix: + with open(f"{self.cred_data_dir}/meta/{project_id}.csv", "a") as f: + f.write(f"{str(approximate)}\n") + self.meta[MetaKey(data_path, line_start, line_end)].append(lost_meta) return LineStatus.NOT_IN_DB, project_id, file_id def analyze_result(self) -> None: diff --git a/meta/0f133e09.csv b/meta/0f133e09.csv index 2370befcf..bb131f3ca 100644 --- a/meta/0f133e09.csv +++ b/meta/0f133e09.csv @@ -1770,3 +1770,6 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 1351521,c7b13616,GitHub,0f133e09,data/0f133e09/test/c7b13616.txt,9,9,T,F,1250,1302,F,F,,,,,0.0,0,F,F,F,AWS S3 Bucket 1351522,c7b13616,GitHub,0f133e09,data/0f133e09/test/c7b13616.txt,9,9,T,F,1584,1613,F,F,,,,,0.0,0,F,F,F,AWS S3 Bucket 1479613,03c77d4f,GitHub,0f133e09,data/0f133e09/src/03c77d4f.py,314,314,F,F,49,54,F,F,,,,,0.0,0,F,F,F,Auth +1480549,03cdc0c5,GitHub,0f133e09,data/0f133e09/test/03cdc0c5.py,95,95,T,F,397,433,F,F,,,,,0.0,0,F,F,F,UUID +1480550,c65fed08,GitHub,0f133e09,data/0f133e09/test/c65fed08.txt,9,9,T,F,668,704,F,F,,,,,0.0,0,F,F,F,UUID +1480551,c65fed08,GitHub,0f133e09,data/0f133e09/test/c65fed08.txt,10,10,T,F,384,420,F,F,,,,,0.0,0,F,F,F,UUID diff --git a/meta/49e2a965.csv b/meta/49e2a965.csv index 31c4d6354..08b5b315e 100644 --- a/meta/49e2a965.csv +++ b/meta/49e2a965.csv @@ -56,3 +56,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 108549,ae95a565,GitHub,49e2a965,data/49e2a965/other/ae95a565.md,32,32,F,F,-1,-1,F,F,,,,,0.0,-1,F,F,F,Other 114654,ae95a565,GitHub,49e2a965,data/49e2a965/other/ae95a565.md,195,195,F,F,,,F,F,,,,,0.00,,F,F,F,Auth 131632,ae95a565,GitHub,49e2a965,data/49e2a965/other/ae95a565.md,72,72,F,F,,,F,F,,,,,0.00,,F,F,F,Password +1480555,76ccf172,GitHub,49e2a965,data/49e2a965/test/76ccf172.ts,74,74,T,F,46,,F,F,,,,,0.0,0,F,F,F,BASE64 Private Key diff --git a/meta/5cecf769.csv b/meta/5cecf769.csv index 94479d8da..157f0c768 100644 --- a/meta/5cecf769.csv +++ b/meta/5cecf769.csv @@ -466,3 +466,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 133375,3184cf27,GitHub,5cecf769,data/5cecf769/src/3184cf27.cc,518,518,F,F,,,F,F,,,,,0,0,F,F,F,API 133376,c8e8fa46,GitHub,5cecf769,data/5cecf769/src/c8e8fa46.bzl,19,19,F,F,,,F,F,,,,,0,0,F,F,F,API 133377,c8e8fa46,GitHub,5cecf769,data/5cecf769/src/c8e8fa46.bzl,78,78,F,F,,,F,F,,,,,0,0,F,F,F,API +1480556,724f0a84,GitHub,5cecf769,data/5cecf769/test/724f0a84.json,5,5,T,F,51,,F,F,,,,,0.0,0,F,F,F,BASE64 Private Key diff --git a/meta/81cd05d0.csv b/meta/81cd05d0.csv index 5ff0c0964..924963cae 100644 --- a/meta/81cd05d0.csv +++ b/meta/81cd05d0.csv @@ -5252,3 +5252,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 1341487,f02052a6,GitHub,81cd05d0,data/81cd05d0/src/f02052a6.json,3761,3761,T,F,1148,1184,F,F,,,,,0.0,0,F,F,F,UUID:Token 1479614,334b19eb,GitHub,81cd05d0,data/81cd05d0/src/334b19eb.json,32577,32577,T,F,28,,F,F,,,,,0.0,0,F,F,F,Secret 1479615,334b19eb,GitHub,81cd05d0,data/81cd05d0/src/334b19eb.json,32790,32790,T,F,28,,F,F,,,,,0.0,0,F,F,F,Secret +1480509,357f73fe,GitHub,81cd05d0,data/81cd05d0/src/357f73fe.json,108443,108443,T,F,247,283,F,F,,,,,0.0,0,F,F,F,UUID diff --git a/meta/8ba59c91.csv b/meta/8ba59c91.csv index d7530bc89..5be4bb000 100644 --- a/meta/8ba59c91.csv +++ b/meta/8ba59c91.csv @@ -1019,3 +1019,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 133847,73c8b62a,GitHub,8ba59c91,data/8ba59c91/src/73c8b62a.json,241,241,F,F,,,F,F,,,,,0,0,F,F,F,Auth 133848,ad18488c,GitHub,8ba59c91,data/8ba59c91/src/ad18488c.json,241,241,F,F,,,F,F,,,,,0,0,F,F,F,Auth 133849,ec71fabd,GitHub,8ba59c91,data/8ba59c91/src/ec71fabd.json,241,241,F,F,,,F,F,,,,,0,0,F,F,F,Auth +1480572,f7e33e13,GitHub,8ba59c91,data/8ba59c91/src/f7e33e13.js,19,19,T,F,55,,F,F,,,,,0.0,0,F,F,F,BASE64 Private Key diff --git a/meta/e0b41e26.csv b/meta/e0b41e26.csv index df8c59b82..0895cea4c 100644 --- a/meta/e0b41e26.csv +++ b/meta/e0b41e26.csv @@ -134,3 +134,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 138124,2a34abd7,GitHub,e0b41e26,data/e0b41e26/test/2a34abd7.ndjson,4,4,F,F,102,122,F,F,,,,,0.0,0,F,F,F,Token 138125,2a34abd7,GitHub,e0b41e26,data/e0b41e26/test/2a34abd7.ndjson,6,6,F,F,104,142,F,F,,,,,0.0,0,F,F,F,Token 1013650,508b8489,GitHub,e0b41e26,data/e0b41e26/test/508b8489.js,189,189,Template,F,34,48,F,F,,,,,0.0,0,F,F,F,Token +1480573,c72880b5,GitHub,e0b41e26,data/e0b41e26/test/c72880b5.js,332,332,T,F,43,,F,F,,,,,0.0,0,F,F,F,BASE64 Private Key diff --git a/meta/ec138349.csv b/meta/ec138349.csv index acd04d719..b7adc03e9 100644 --- a/meta/ec138349.csv +++ b/meta/ec138349.csv @@ -184,6 +184,6 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 1338530,18b43943,GitHub,ec138349,data/ec138349/test/18b43943.java,47,47,F,F,6,101,F,F,,,,,0.0,0,F,F,F,JSON Web Token 1338573,2f9b15a9,GitHub,ec138349,data/ec138349/test/2f9b15a9.java,125,125,Template,F,58,65,F,F,,,,,0.0,0,F,F,F,Auth:Token 1338575,2f9b15a9,GitHub,ec138349,data/ec138349/test/2f9b15a9.java,158,158,F,F,58,68,F,F,,,,,0.0,0,F,F,F,Auth:Token -1480452,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,52,52,T,F,135,171,F,F,,,,,0.0,0,F,F,F,Auth:Nonce +1480452,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,52,52,T,F,135,171,F,F,,,,,0.0,0,F,F,F,Auth:Nonce:UUID 1480456,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,64,64,T,F,148,167,F,F,,,,,0.0,0,F,F,F,Auth:Nonce -1480457,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,52,52,T,F,256,292,F,F,,,,,0.0,0,F,F,F,Auth:Token +1480457,399221f4,GitHub,ec138349,data/ec138349/test/399221f4.java,52,52,T,F,256,292,F,F,,,,,0.0,0,F,F,F,Auth:Token:UUID diff --git a/meta/f008dd40.csv b/meta/f008dd40.csv index 9c62a1f90..88e7214fe 100644 --- a/meta/f008dd40.csv +++ b/meta/f008dd40.csv @@ -456,3 +456,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value 134727,a40d8da9,GitHub,f008dd40,data/f008dd40/test/a40d8da9.go,274,274,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key 134728,c592e77b,GitHub,f008dd40,data/f008dd40/test/c592e77b.go,226,226,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key 1036094,86db8848,GitHub,f008dd40,data/f008dd40/src/86db8848.yml,12,12,T,F,130,138,F,F,Any,,,Secret,4.94,68,F,F,F,Password +1480574,813dc2b8,GitHub,f008dd40,data/f008dd40/test/813dc2b8.go,11,11,T,F,64,,F,F,,,,,0.0,0,F,F,F,BASE64 Private Key