Skip to content

Commit

Permalink
feature: Support plugins for detectors of PII type.
Browse files Browse the repository at this point in the history
Detectors detect PII. Change PII types from an enum to a class hierarchy.
With this change, new PII types can be defined.

Support adding new detectors as plugins using entry points. Remove the
spaCy detector and convert it into a plugin hosted in another repository.

Fix tokern#115
  • Loading branch information
vrajat committed Dec 17, 2021
1 parent 6de3c19 commit 9d51a54
Show file tree
Hide file tree
Showing 14 changed files with 565 additions and 942 deletions.
68 changes: 68 additions & 0 deletions piicatcher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,70 @@
# flake8: noqa
__version__ = "0.18.2"

from dbcat.catalog.pii_types import PiiType


class Phone(PiiType):
    """PII type with display name ``Phone`` and type key ``phone``."""

    name = "Phone"
    type = "phone"


class Email(PiiType):
    """PII type with display name ``Email`` and type key ``email``."""

    name = "Email"
    type = "email"


class CreditCard(PiiType, type="credit_card"):  # type: ignore
    """PII type with display name ``Credit Card`` and type key ``credit_card``."""

    # NOTE(review): the ``type`` class keyword above repeats the attribute
    # below; presumably PiiType consumes it when the subclass is created --
    # confirm against dbcat before removing either one.
    name = "Credit Card"
    type = "credit_card"


class Address(PiiType):
    """PII type with display name ``Address`` and type key ``address``."""

    name = "Address"
    type = "address"


class Person(PiiType):
    """PII type with display name ``Person`` and type key ``person``."""

    name = "Person"
    type = "person"


class BirthDate(PiiType, type="birth_date"):  # type: ignore
    """PII type with display name ``Birth Date`` and type key ``birth_date``."""

    # NOTE(review): the ``type`` class keyword above repeats the attribute
    # below; presumably PiiType consumes it when the subclass is created --
    # confirm against dbcat before removing either one.
    name = "Birth Date"
    type = "birth_date"


class Gender(PiiType):
    """PII type with display name ``Gender`` and type key ``gender``."""

    name = "Gender"
    type = "gender"


class Nationality(PiiType):
    """PII type with display name ``Nationality`` and type key ``nationality``."""

    name = "Nationality"
    type = "nationality"


class SSN(PiiType):
    """PII type with display name ``SSN`` and type key ``ssn``."""

    name = "SSN"
    type = "ssn"


class UserName(PiiType, type="user_name"):  # type: ignore
    """PII type with display name ``User Name`` and type key ``user_name``."""

    # NOTE(review): the ``type`` class keyword above repeats the attribute
    # below; presumably PiiType consumes it when the subclass is created --
    # confirm against dbcat before removing either one.
    name = "User Name"
    type = "user_name"


class Password(PiiType):
    """PII type with display name ``Password`` and type key ``password``."""

    name = "Password"
    type = "password"
50 changes: 32 additions & 18 deletions piicatcher/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,13 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from dbcat.api import init_db, open_catalog
from dbcat.api import init_db, open_catalog, scan_sources
from dbcat.catalog import Catalog, CatSource
from dbcat.catalog.db import DbScanner
from sqlalchemy.orm.exc import NoResultFound

from piicatcher.generators import (
SMALL_TABLE_MAX,
NoMatchesError,
column_generator,
data_generator,
)
from piicatcher import detectors
from piicatcher.detectors import DatumDetector, MetadataDetector, detector_registry
from piicatcher.generators import SMALL_TABLE_MAX, column_generator, data_generator
from piicatcher.output import output_dict, output_tabular
from piicatcher.scanner import deep_scan, shallow_scan

Expand Down Expand Up @@ -77,22 +73,25 @@ def scan_database(
LOGGER.debug("No last run found")

try:
scanner = DbScanner(
scan_sources(
catalog=catalog,
source=source,
include_schema_regex_str=include_schema_regex,
exclude_schema_regex_str=exclude_schema_regex,
include_table_regex_str=include_table_regex,
exclude_table_regex_str=exclude_table_regex,
source_names=[source.name],
include_schema_regex=include_schema_regex,
exclude_schema_regex=exclude_schema_regex,
include_table_regex=include_table_regex,
exclude_table_regex=exclude_table_regex,
)
try:
scanner.scan()
except StopIteration:
raise NoMatchesError

if scan_type == ScanTypeEnum.shallow:
detector_list = [
detector()
for detector in detectors.detector_registry.get_all().values()
if issubclass(detector, MetadataDetector)
]

shallow_scan(
catalog=catalog,
detectors=detector_list,
work_generator=column_generator(
catalog=catalog,
source=source,
Expand All @@ -113,8 +112,15 @@ def scan_database(
),
)
else:
detector_list = [
detector()
for detector in detectors.detector_registry.get_all().values()
if issubclass(detector, DatumDetector)
]

deep_scan(
catalog=catalog,
detectors=detector_list,
work_generator=column_generator(
catalog=catalog,
source=source,
Expand Down Expand Up @@ -157,6 +163,14 @@ def scan_database(
)


def list_detectors() -> List[str]:
    """Return the keys of every entry currently in ``detector_registry``.

    :return: Registry keys, in the order ``get_all()`` yields them.
    """
    registered = detector_registry.get_all()
    return list(registered.keys())


def list_detector_entry_points() -> List[str]:
    """Return the keys reported by ``detector_registry.get_entry_points()``.

    :return: Entry-point keys, in the order the registry yields them.
    """
    entry_points = detector_registry.get_entry_points()
    return list(entry_points.keys())


def scan_sqlite(
catalog_params: Dict[str, Any],
name: str,
Expand Down
27 changes: 25 additions & 2 deletions piicatcher/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from piicatcher.api import (
OutputFormat,
ScanTypeEnum,
list_detector_entry_points,
list_detectors,
scan_athena,
scan_mysql,
scan_postgresql,
Expand All @@ -17,7 +19,6 @@
)
from piicatcher.app_state import app_state
from piicatcher.generators import SMALL_TABLE_MAX, NoMatchesError
from piicatcher.output import PiiTypeEncoder

app = typer.Typer()

Expand Down Expand Up @@ -65,7 +66,7 @@ def str_output(op, output_format: OutputFormat):
headers=("schema", "table", "column", "PII Type", "Scanner"),
)
else:
return json.dumps(op, sort_keys=True, indent=2, cls=PiiTypeEncoder)
return json.dumps(op, sort_keys=True, indent=2)


@app.command()
Expand Down Expand Up @@ -400,3 +401,25 @@ def athena(
except NoMatchesError:
typer.echo(message=NoMatchesError.message)
typer.Exit(1)


detector_app = typer.Typer()


@detector_app.command(name="list")
def cli_list_detectors():
    # Print every name returned by list_detectors() as a one-column table.
    # Kept as a comment rather than a docstring: Typer would show a docstring
    # as the command's help text, which the command does not currently have.
    rows = [(detector_name,) for detector_name in list_detectors()]
    table = tabulate(tabular_data=rows, headers=("detectors",))
    typer.echo(message=table)


@detector_app.command(name="entry-points")
def cli_list_entry_points():
    # Print every name returned by list_detector_entry_points() as a
    # one-column table. Kept as a comment rather than a docstring: Typer would
    # show a docstring as the command's help text, which the command does not
    # currently have.
    rows = [(entry_point,) for entry_point in list_detector_entry_points()]
    table = tabulate(tabular_data=rows, headers=("entry points",))
    typer.echo(message=table)
2 changes: 2 additions & 0 deletions piicatcher/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from piicatcher.api import OutputFormat
from piicatcher.app_state import app_state
from piicatcher.cli import app as scan_app
from piicatcher.cli import detector_app
from piicatcher.scanner import data_logger, scan_logger

app = typer.Typer()
Expand Down Expand Up @@ -133,3 +134,4 @@ def cli(


app.add_typer(scan_app, name="scan")
app.add_typer(detector_app, name="detectors")
54 changes: 54 additions & 0 deletions piicatcher/detectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import inspect
from abc import ABC, abstractmethod
from typing import Optional, Type

import catalogue
from dbcat.catalog.models import CatColumn
from dbcat.catalog.pii_types import PiiType


class Detector(ABC):
    """Common abstract base for all PII detectors."""

    # Declared but not assigned here: each concrete detector class is expected
    # to set its own ``name``.
    name: str


class MetadataDetector(Detector):
    """Detector whose ``detect`` receives only the catalog column."""

    @abstractmethod
    def detect(self, column: CatColumn) -> Optional[PiiType]:
        """Examine ``column`` and return the detected PII type, if any.

        :param column: Catalog column to examine.
        :return: A single ``PiiType``, or ``None`` (presumably meaning that
            nothing was detected).
        """


class DatumDetector(Detector):
    """Detector whose ``detect`` receives a column together with one datum."""

    @abstractmethod
    def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
        """Examine ``datum`` from ``column`` and return the detected PII type, if any.

        :param column: Catalog column the datum belongs to.
        :param datum: Text value to examine.
        :return: A single ``PiiType``, or ``None`` (presumably meaning that
            nothing was detected).
        """


# Module-level registry of detector classes, created under the
# ("piicatcher", "detectors") namespace. register_detector() adds classes to it.
# NOTE(review): entry_points=True presumably lets separately installed plugins
# contribute detectors through package entry points -- confirm against the
# catalogue documentation.
detector_registry = catalogue.create("piicatcher", "detectors", entry_points=True)


def register_detector(detector: Type["Detector"]) -> Type["Detector"]:
    """Add a detector class to ``detector_registry``.

    The class is stored under its ``name`` attribute, so ``name`` must be set
    on the class before it is registered. The class itself is returned, which
    also allows ``register_detector`` to be applied as a class decorator.

    .. code:: python

        import piicatcher

        class NewDetector(piicatcher.detectors.MetadataDetector):
            name = "new_detector"

            def detect(self, column):
                return None

        piicatcher.detectors.register_detector(NewDetector)

    :param detector: The ``Detector`` class (not an instance) to register.
    :type detector: Detector class
    :return: The class that was passed in.
    :raises ValueError: If ``detector`` is not a class.
    """
    if isinstance(detector, type):
        detector_registry.register(detector.name, func=detector)
        return detector

    raise ValueError("detector should be a class, not an instance.")
17 changes: 4 additions & 13 deletions piicatcher/output.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
import datetime
import json
from typing import Any, Dict, List, Optional

from dbcat.catalog import Catalog, CatSchema, CatSource, CatTable
from dbcat.catalog.models import PiiTypes

from piicatcher.generators import column_generator


# Ref: https://stackoverflow.com/questions/24481852/serialising-an-enum-member-to-json
class PiiTypeEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialise ``PiiTypes`` values."""

    # pylint: disable=method-hidden
    def default(self, obj):
        """Encode a ``PiiTypes`` value as ``{"__enum__": str(obj)}``.

        Any other object is handed to the base encoder's ``default``.
        """
        if type(obj) != PiiTypes:
            return super().default(obj)
        return {"__enum__": str(obj)}


def output_dict(
catalog: Catalog,
source: CatSource,
Expand Down Expand Up @@ -65,7 +54,9 @@ def output_dict(
"name": column.name,
"data_type": column.data_type,
"sort_order": column.sort_order,
"pii_type": column.pii_type,
"pii_type": column.pii_type.name
if column.pii_type is not None
else None,
"pii_plugin": column.pii_plugin,
}
)
Expand Down Expand Up @@ -104,7 +95,7 @@ def output_tabular(
schema.name,
table.name,
column.name,
str(column.pii_type),
column.pii_type.name if column.pii_type is not None else None,
column.pii_plugin,
]
)
Expand Down
Loading

0 comments on commit 9d51a54

Please sign in to comment.