DefectDojo · blakeaowens · Feb 28, 2024 · Nov 28, 2023 · Nov 28, 2023 · Dec 4, 2023
diff --git a/docs/content/en/integrations/parsers/file/noseyparker.md b/docs/content/en/integrations/parsers/file/noseyparker.md
@@ -0,0 +1,107 @@
+---
+title: "Nosey Parker"
+toc_hide: true
+---
+Input Type:
+-
+This parser takes the JSON Lines output of Nosey Parker; the uploaded file must have a `.jsonl` extension. Version 0.16.0 of https://github.com/praetorian-inc/noseyparker is supported.
+
+Things to note about the Nosey Parker Parser:
+- 
+- All findings are marked with a severity of 'High'
+- The deduplication algorithm marks a unique finding by the secret, filepath, and line number all together
+- The Nosey Parker tool allows for both full history scans of a repo and targeted branch scans
+   - The parser does NOT differentiate between the two scan types (this may be added in the future)
+
+   - **For full history scans:**
+     - The scan will pick up secrets committed in the past that have since been removed
+     - If a secret is removed from source code, it will still show up in the next scan
+     - When importing findings via the Dojo API, make sure to use the parameter `do_not_reactivate`, which keeps existing findings closed without reactivating them
+   - **For targeted branch scans:**
+     - Keep in mind there may be active secrets that exist only in the git history or in other branches and are therefore not reported
+
+Acceptable JSON Lines file:
+-
+Each line of the JSON Lines file from Nosey Parker describes one secret, which can have multiple matches within the repository. All properties are required by the parser.
+
+The following is an example of an acceptable JSON lines file:
+~~~
+{"type": "finding", "rule_name": "Generic Password (double quoted)", "match_content": "32ui1ffdasfhu239b4df2ac6609a9919", "num_matches": 2, "status": null, "comment": null, "matches": [ { "provenance": [ { "kind": "file", "path": "app/schema/config.py" }, { "kind": "git_repo", "repo_path": "./.git", "commit_provenance": { "commit_kind": "first_seen", "commit_metadata": { "commit_id": "0ef84b84c29924b210e3576f69d1e8632948bedc", "committer_name": "Princess Leia", "committer_email": "leia@test.com", "committer_timestamp": "1685495256 +0000", "author_name": "Princess Leia", "author_email": "leia@test.com", "author_timestamp": "1685495256 +0000", "message": "first commit\n" }, "blob_path": "app/schema/config.py" } } ], "blob_metadata": { "id": "0ee84b84c29924b210e3576fe9d1e8632948bedc", "num_bytes": 664, "mime_essence": "text/plain", "charset": null }, "blob_id": "0ee84b84c29924b210e3576fe9d1e8632948bedc", "location": { "offset_span": { "start": 617, "end": 660 }, "source_span": { "start": { "line": 16, "column": 17 }, "end": { "line": 16, "column": 59 } } }, "capture_group_index": 1, "match_content": "32ui1ffdasfhu239b4df2ac6609a9919", "snippet": { "before": "E = \"https://testwebsite.com\"\n ", "matching": "API_KEY = \"32ui1ffdasfhu239b4df2ac6609a9919", "after": "\"\n\n\n" }, "rule_name": "Generic API Key" } ] }
+{"type":"finding","rule_name":"Generic Username and Password (unquoted)","match_content":"secret","num_matches":1,"matches":[{"provenance":[{"kind":"file","path":"./app/schema/config.py"},{"kind":"git_repo","repo_path":"./.git","commit_provenance":{"commit_kind":"first_seen","commit_metadata":{"commit_id":"0ee84b84c29924b210e3576fe9d1e8632948bedc","committer_name":"Princess Leia","committer_email":"leia@test.com","committer_timestamp":"1685495256 +0000","author_name":"Princess Leia","author_email":"leia@test.com","author_timestamp":"1685495256 +0000","message":"framework\n"},"blob_path":"app/schema/config.py"}}],"blob_metadata":{"id":"0ee84b84c29924b210e3576fe9d1e8632948bedc","num_bytes":664,"mime_essence":"text/plain","charset":null},"blob_id":"0ee84b84c29924b210e3576fe9d1e8632948bedc","location":{"offset_span":{"start":617,"end":660},"source_span":{"start":{"line":16,"column":17},"end":{"line":16,"column":59}}},"capture_group_index":1,"match_content":"secret","snippet":{"before":"E = \"https://testwebsite.com\"\n ","matching":"secret","after":"testing\"\n\n\n"},"rule_name":"Generic Username and Password (unquoted)"}]}
+{"type":"finding","rule_name":"Generic Username and Password (unquoted)","match_content":"secret","num_matches":1,"matches":[{"provenance":[{"kind":"file","path":"./app/schema/config.py"},{"kind":"git_repo","repo_path":"./.git","commit_provenance":{"commit_kind":"first_seen","commit_metadata":{"commit_id":"0ee84b84c29924b210e3576fe9d1e8632948bedc","committer_name":"Princess Leia","committer_email":"leia@test.com","committer_timestamp":"1685495256 +0000","author_name":"Princess Leia","author_email":"leia@test.com","author_timestamp":"1685495256 +0000","message":"framework\n"},"blob_path":"app/schema/config.py"}}],"blob_metadata":{"id":"0ee84b84c29924b210e3576fe9d1e8632948bedc","num_bytes":664,"mime_essence":"text/plain","charset":null},"blob_id":"0ee84b84c29924b210e3576fe9d1e8632948bedc","location":{"offset_span":{"start":617,"end":660},"source_span":{"start":{"line":16,"column":17},"end":{"line":16,"column":59}}},"capture_group_index":1,"match_content":"secret","snippet":{"before":"E = \"https://testwebsite.com\"\n ","matching":"secret","after":"testing\"\n\n\n"},"rule_name":"Generic Username and Password (unquoted)"}]}
+
+~~~
+
+If the first line is expanded, it looks like this:
+
+~~~
+{
+    "type": "finding",
+    "rule_name": "Generic Password (double quoted)",
+    "match_content": "32ui1ffdasfhu239b4df2ac6609a9919",
+    "num_matches": 2,
+    "status": null,
+    "comment": null,
+    "matches": [
+        {
+            "provenance": [
+                {
+                    "kind": "file",
+                    "path": "app/schema/config.py"
+                },
+                {
+                    "kind": "git_repo",
+                    "repo_path": "./.git",
+                    "commit_provenance": {
+                        "commit_kind": "first_seen",
+                        "commit_metadata": {
+                            "commit_id": "0ef84b84c29924b210e3576f69d1e8632948bedc",
+                            "committer_name": "Princess Leia",
+                            "committer_email": "leia@test.com",
+                            "committer_timestamp": "1685495256 +0000",
+                            "author_name": "Princess Leia",
+                            "author_email": "leia@test.com",
+                            "author_timestamp": "1685495256 +0000",
+                            "message": "first commit\n"
+                        },
+                        "blob_path": "app/schema/config.py"
+                    }
+                }
+            ],
+            "blob_metadata": {
+                "id": "0ee84b84c29924b210e3576fe9d1e8632948bedc",
+                "num_bytes": 664,
+                "mime_essence": "text/plain",
+                "charset": null
+            },
+            "blob_id": "0ee84b84c29924b210e3576fe9d1e8632948bedc",
+            "location": {
+                "offset_span": {
+                    "start": 617,
+                    "end": 660
+                },
+                "source_span": {
+                    "start": {
+                        "line": 16,
+                        "column": 17
+                    },
+                    "end": {
+                        "line": 16,
+                        "column": 59
+                    }
+                }
+            },
+            "capture_group_index": 1,
+            "match_content": "32ui1ffdasfhu239b4df2ac6609a9919",
+            "snippet": {
+                "before": "E = \"https://testwebsite.com\"\n ",
+                "matching": "API_KEY = \"32ui1ffdasfhu239b4df2ac6609a9919",
+                "after": "\"\n\n\n"
+            },
+            "rule_name": "Generic API Key"
+        }
+    ]
+}
+~~~
+
+### Sample Scan Data
+Sample scan data for testing purposes can be found [here](https://github.com/DefectDojo/django-DefectDojo/tree/master/unittests/scans/noseyparker).
diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py
@@ -1451,6 +1451,7 @@ def saml2_attrib_map_format(dict):
     'MSDefender Parser': DEDUPE_ALGO_HASH_CODE,
     'HCLAppScan XML': DEDUPE_ALGO_HASH_CODE,
     'MobSF Scan': DEDUPE_ALGO_HASH_CODE,
+    'Nosey Parker Scan': DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE,
 }
 
 # Override the hardcoded settings here via the env var

diff --git a/dojo/tools/noseyparker/__init__.py b/dojo/tools/noseyparker/__init__.py
diff --git a/dojo/tools/noseyparker/parser.py b/dojo/tools/noseyparker/parser.py
@@ -0,0 +1,101 @@
+import hashlib
+import json
+
+from datetime import datetime
+from dojo.models import Finding
+
+
+class NoseyParkerParser(object):
+    """
+    Scanning secrets from repos
+    """
+
+    def get_scan_types(self):
+        return ["Nosey Parker Scan"]
+
+    def get_label_for_scan_types(self, scan_type):
+        return "Nosey Parker Scan"
+
+    def get_description_for_scan_types(self, scan_type):
+        return "Nosey Parker report file can be imported in JSON Lines format (option --jsonl). " \
+               "Supports v0.16.0 of https://github.com/praetorian-inc/noseyparker"
+
+    def get_findings(self, file, test):
+        """
+        Returns findings from jsonlines file and uses filter
+        to skip findings and determine severity
+        """
+        dupes = {}
+
+        # Turn JSONL file into DataFrame
+        if file is None:
+            return
+        elif file.name.lower().endswith(".jsonl"):
+            # Process JSON lines into Dict
+            data = [json.loads(line) for line in file]
+
+            # Check for empty file
+            if len(data[0]) == 0:
+                return []
+
+            # Parse through each secret in each JSON line
+            for line in data:
+                # Set rule to the current secret type (e.g. AWS S3 Bucket)
+                try:
+                    rule_name = line['rule_name']
+                    secret = line['match_content']
+                except Exception:
+                    raise ValueError("Invalid Nosey Parker data, make sure to use Nosey Parker v0.16.0")
+
+                # Set Finding details
+                for match in line['matches']:
+                    # The following path is to account for the variability in the JSON lines output
+                    num_elements = len(match['provenance']) - 1
+                    json_path = match['provenance'][num_elements]
+
+                    title = f"Secret(s) Found in Repository with Commit ID {json_path['commit_provenance']['commit_metadata']['commit_id']}"
+                    filepath = json_path['commit_provenance']['blob_path']
+                    line_num = match['location']['source_span']['start']['line']
+                    description = f"Secret found of type:   {rule_name} \n" \
+                                  f"SECRET starts with:  '{secret[:3]}' \n" \
+                                  f"Committer Name: {json_path['commit_provenance']['commit_metadata']['committer_name']}  \n" \
+                                  f"Committer Email: {json_path['commit_provenance']['commit_metadata']['committer_email']} \n" \
+                                  f"Commit ID: {json_path['commit_provenance']['commit_metadata']['commit_id']}  \n" \
+                                  f"Location: {filepath} line #{line_num} \n " \
+                                  f"Line #{line_num} \n " \
+                                  f"Code Snippet Containing Secret: {match['snippet']['before']}***SECRET***{match['snippet']['after']} \n"
+
+                    # Internal de-duplication
+                    key = hashlib.md5((filepath + "|" + secret + "|" + str(line_num)).encode("utf-8")).hexdigest()
+
+                    # If secret already exists with the same filepath/secret/linenum
+                    if key in dupes:
+                        finding = dupes[key]
+                        finding.nb_occurences += 1
+                        dupes[key] = finding
+                    else:
+                        dupes[key] = True
+                        # Create Finding object
+                        finding = Finding(
+                            test=test,
+                            cwe=798,
+                            title=title,
+                            description=description,
+                            severity='High',
+                            mitigation="Reset the account/token and remove from source code. Store secrets/tokens/passwords in secret managers or secure vaults.",
+                            date=datetime.today().strftime("%Y-%m-%d"),
+                            verified=False,
+                            active=True,
+                            is_mitigated=False,
+                            file_path=filepath,
+                            line=line_num,
+                            static_finding=True,
+                            nb_occurences=1,
+                            dynamic_finding=False
+
+                        )
+                        dupes[key] = finding
+        else:
+            raise ValueError("JSON lines format not recognized (.jsonl file extension). Make sure to use Nosey Parker v0.16.0")
+
+        return list(dupes.values())
diff --git a/unittests/scans/noseyparker/empty_with_error.json b/unittests/scans/noseyparker/empty_with_error.json
@@ -0,0 +1,5 @@
+{"type":"warning","data":"package.json: No license field"}
+{"type":"warning","data":"No license field"}
+{"type":"error","data":"An unexpected error occurred: \"https://registry.yarnpkg.com/-/npm/v1/security/audits: tunneling socket could not be established, cause=connect ECONNREFUSED 127.0.0.1:80\"."}
+{"type":"info","data":"If you think this is a bug, please open a bug report with the information provided in \"/yarn-error.log\"."}
+{"type":"info","data":"Visit https://yarnpkg.com/en/docs/cli/audit for documentation about this command."}