Skip to content

Commit

Permalink
feat(privacy report): Add subject name to datatype (#468)
Browse files Browse the repository at this point in the history
* feat: pass subject name from classification to risk data types

* feat: add mapping for data subjects

* fix: clean up schema classification and add subject name to child data type classification

* fix: remove unnecessary classification request

* feat: add subject name to data types

* fix: snapshots

* fix: fix schema test

* fix: fix snapshots

* fix: update new relic snapshots
  • Loading branch information
elsapet authored Jan 31, 2023
1 parent e4ad4ef commit e86b4c0
Show file tree
Hide file tree
Showing 61 changed files with 405 additions and 604 deletions.
2 changes: 1 addition & 1 deletion integration/flags/.snapshots/TestOuputFlag
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[{"detector_type":"ruby","source":{"column_number":8,"filename":"main.rb","language":"Ruby","language_type":"programming","line_number":1,"text":null},"type":"schema_classified","value":{"classification":{"decision":{"reason":"","state":""},"name":"info"},"field_name":"info","field_type":"","field_type_simple":"unknown","normalized_field_name":"info","normalized_object_name":"logger","object_name":"logger"}},{"detector_type":"ruby","source":{"column_number":31,"filename":"main.rb","language":"Ruby","language_type":"programming","line_number":1,"text":null},"type":"schema_classified","value":{"classification":{"data_type":{"category_uuid":"cef587dd-76db-430b-9e18-7b031e1a193b","name":"Email Address","uuid":"22e24c62-82d3-4b72-827c-e261533331bd"},"decision":{"reason":"known_pattern","state":"valid"},"name":"email"},"field_name":"email","field_type":"","field_type_simple":"unknown","normalized_field_name":"email","normalized_object_name":"user","object_name":"user"}}]
[{"detector_type":"ruby","source":{"column_number":8,"filename":"main.rb","language":"Ruby","language_type":"programming","line_number":1,"text":null},"type":"schema_classified","value":{"classification":{"decision":{"reason":"","state":""},"name":"info"},"field_name":"info","field_type":"","field_type_simple":"unknown","normalized_field_name":"info","normalized_object_name":"logger","object_name":"logger"}},{"detector_type":"ruby","source":{"column_number":31,"filename":"main.rb","language":"Ruby","language_type":"programming","line_number":1,"text":null},"type":"schema_classified","value":{"classification":{"data_type":{"category_uuid":"cef587dd-76db-430b-9e18-7b031e1a193b","name":"Email Address","uuid":"22e24c62-82d3-4b72-827c-e261533331bd"},"decision":{"reason":"known_pattern","state":"valid"},"name":"email","subject_name":"User"},"field_name":"email","field_type":"","field_type_simple":"unknown","normalized_field_name":"email","normalized_object_name":"user","object_name":"user"}}]

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{"data_types":[{"name":"Email Address","detectors":[{"name":"ruby","locations":[{"filename":"integration/flags/testdata/simple/main.rb","line_number":1,"field_name":"email","object_name":"user"}]}]}],"components":[]}
{"data_types":[{"name":"Email Address","detectors":[{"name":"ruby","locations":[{"filename":"integration/flags/testdata/simple/main.rb","line_number":1,"field_name":"email","object_name":"user","subject_name":"User"}]}]}],"components":[]}

--

34 changes: 27 additions & 7 deletions new/detector/composition/ruby/.snapshots/TestRuby-object.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,35 @@
}
},
"Classification": {
"name": "name",
"data_type": {
"name": "Fullname",
"uuid": "1617291b-bc22-4267-ad5e-8054b2505958",
"category_uuid": "14124881-6b92-4fc5-8005-ea7c1c09592e"
},
"name": "user",
"decision": {
"state": "valid",
"reason": "known_pattern"
"reason": "valid_object_with_valid_properties"
}
},
"Properties": [
{
"Name": "name",
"Detection": {
"MatchNode": {},
"ContextNode": null,
"Data": {
"Name": "name"
}
},
"Classification": {
"name": "name",
"subject_name": "User",
"data_type": {
"name": "Fullname",
"uuid": "1617291b-bc22-4267-ad5e-8054b2505958",
"category_uuid": "14124881-6b92-4fc5-8005-ea7c1c09592e"
},
"decision": {
"state": "valid",
"reason": "known_pattern"
}
}
}
}
}
Expand Down
20 changes: 20 additions & 0 deletions pkg/classification/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ var dataTypeClassificationPatternsDir embed.FS
//go:embed known_person_object_patterns
var knownPersonObjectPatternsDir embed.FS

//go:embed subject_mapping.json
var subjectMappingFile embed.FS

//go:embed category_grouping.json
var categoryGroupingFile embed.FS

Expand Down Expand Up @@ -123,6 +126,7 @@ type KnownPersonObjectPattern struct {
ExcludeRegexp string `json:"exclude_regexp,omitempty"`
ExcludeRegexpMatcher *regexp.Regexp `json:"exclude_regexp_matcher" yaml:"exclude_regexp_matcher"`
Category string `json:"category" yaml:"category"`
SubjectName string `json:"subject_name,omitempty" yaml:"subject_name,omitempty"`
ActAsIdentifier bool `json:"act_as_identifier" yaml:"act_as_identifier"`
IdentifierRegexpMatcher *regexp.Regexp `json:"identifier_regexp_matcher" yaml:"identifier_regexp_matcher"`
}
Expand Down Expand Up @@ -347,6 +351,19 @@ func defaultKnownPersonObjectPatterns(dataTypes []DataType) []KnownPersonObjectP
handleError(err)
}

// read the embedded category -> subject name mapping (subject_mapping.json)
subjectMappingJson, err := subjectMappingFile.ReadFile("subject_mapping.json")
if err != nil {
handleError(err)
}

var subjectMapping map[string]string
rawBytes := []byte(subjectMappingJson)
err = json.Unmarshal(rawBytes, &subjectMapping)
if err != nil {
handleError(err)
}

for _, file := range files {
val, err := knownPersonObjectPatternsDir.ReadFile("known_person_object_patterns/" + file.Name())
if err != nil {
Expand Down Expand Up @@ -383,6 +400,9 @@ func defaultKnownPersonObjectPatterns(dataTypes []DataType) []KnownPersonObjectP
if err != nil {
handleError(err)
}

// add subject name from mapping, if available
knownPersonObjectPattern.SubjectName = subjectMapping[knownPersonObjectPattern.Category]
}

knownPersonObjectPatterns = append(knownPersonObjectPatterns, knownPersonObjectPattern)
Expand Down
72 changes: 72 additions & 0 deletions pkg/classification/db/subject_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"Advisor": "User",
"Applicant": "User",
"Artist": "User",
"Attendee": "User",
"Author": "User",
"Bank account": "User",
"Buyer": "User",
"Caller": "User",
"Candidate": "User",
"Cardholder": "User",
"Client": "User",
"Coach": "User",
"Collaborator": "User",
"Commenter": "User",
"Competitor": "User",
"Consumer": "User",
"Contact": "User",
"Contractor": "User",
"Contributor": "User",
"Customer": "User",
"Developer": "User",
"Doctor": "User",
"Driver": "User",
"Employee": "User",
"Employer": "User",
"Follower": "User",
"Guardian": "User",
"Insuree": "User",
"Issuer": "User",
"Landlord": "User",
"Lead": "User",
"Learner": "User",
"Member": "User",
"Mentor": "User",
"Moderator": "User",
"Organizer": "User",
"Owner": "User",
"Panelist": "User",
"Participant": "User",
"Partner": "User",
"Passenger": "User",
"Patient": "User",
"Payee": "User",
"Payer": "User",
"Payor": "User",
"People": "User",
"Person": "User",
"Player": "User",
"Profile": "User",
"Reader": "User",
"Receiver": "User",
"Recipient": "User",
"Recruit": "User",
"Referee": "User",
"Referer": "User",
"Registrant": "User",
"Seller": "User",
"Sender": "User",
"Shareholder": "User",
"Shopper": "User",
"Signer": "User",
"Student": "User",
"Subscriber": "User",
"Supplier": "User",
"Teacher": "User",
"Trainer": "User",
"User": "User",
"Veteran": "User",
"Volunteer": "User",
"Winner": "User"
}
66 changes: 40 additions & 26 deletions pkg/classification/schema/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ func (datatype ClassifiedDatatype) GetClassification() interface{} {
}

type Classification struct {
Name string `json:"name" yaml:"name"`
DataType *db.DataType `json:"data_type,omitempty"`
Decision classify.ClassificationDecision `json:"decision" yaml:"decision"`
Name string `json:"name" yaml:"name"`
SubjectName *string `json:"subject_name,omitempty" yaml:"subject_name,omitempty"`
DataType *db.DataType `json:"data_type,omitempty"`
Decision classify.ClassificationDecision `json:"decision" yaml:"decision"`
}

type Classifier struct {
Expand Down Expand Up @@ -98,9 +99,15 @@ func (classifier *Classifier) Classify(data ClassificationRequest) *ClassifiedDa

matchedKnownPersonObject := classifier.matchKnownPersonObjectPatterns(normalizedName, false)
if matchedKnownPersonObject != nil {
// add data type to object
classifiedDatatype.Classification.DataType = &matchedKnownPersonObject.DataType
return classifier.classifyKnownObject(classifiedDatatype, data.Value, data.DetectorType)
// FIXME: remove unimplemented fallback (possibly causing Unique Id bug)
// classifiedDatatype.Classification.DataType = &matchedKnownPersonObject.DataType

var subjectName *string
if matchedKnownPersonObject.SubjectName != "" {
subjectName = &matchedKnownPersonObject.SubjectName
}

return classifier.classifyKnownObject(classifiedDatatype, data.Value, data.DetectorType, subjectName)
}

// do we have an object with unknown or unknown extended properties?
Expand Down Expand Up @@ -220,31 +227,36 @@ func (classifier *Classifier) matchKnownPersonObjectPatterns(name string, matchA
return matchedPattern
}

func (classifier *Classifier) classifyKnownObject(classifiedDatatype *ClassifiedDatatype, detection *ClassificationRequestDetection, detectorType detectors.Type) *ClassifiedDatatype {
func (classifier *Classifier) classifyKnownObject(
classifiedDatatype *ClassifiedDatatype,
detection *ClassificationRequestDetection,
detectorType detectors.Type,
subjectName *string,
) *ClassifiedDatatype {
isJSDetection := classify.IsJSDetection(detectorType)

validProperties := false
for i, property := range classifiedDatatype.Properties {
if isJSDetection && classify.PropertyStopWordDetected(property.Classification.Name) {
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "stop_word")
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "stop_word", subjectName)
continue
}

matchedKnownObject := classifier.matchObjectPatterns(property.Classification.Name, detection.Properties[i].SimpleType, db.KnownObject)
if matchedKnownObject != nil {
validProperties = true
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(matchedKnownObject), "known_pattern")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(matchedKnownObject), "known_pattern", subjectName)
continue
}

matchedKnownIdentifier := classifier.matchKnownPersonObjectPatterns(normalize_key.Normalize(property.Name), true)
if matchedKnownIdentifier != nil {
validProperties = true
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], matchedKnownIdentifier.DataType, "known_database_identifier")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], matchedKnownIdentifier.DataType, "known_database_identifier", subjectName)
continue
}

classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "invalid_property")
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "invalid_property", subjectName)

}

Expand All @@ -271,32 +283,32 @@ func (classifier *Classifier) classifyKnownObject(classifiedDatatype *Classified
func (classifier *Classifier) classifyObjectWithUnknownProperties(classifiedDatatype *ClassifiedDatatype, detection *ClassificationRequestDetection, isJSDetection bool) *ClassifiedDatatype {
for i, property := range classifiedDatatype.Properties {
if isJSDetection && classify.PropertyStopWordDetected(normalize_key.Normalize(property.Name)) {
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "stop_word")
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "stop_word", nil)
continue
}

// check unknown object patterns
unknownObject := classifier.matchObjectPatterns(normalize_key.Normalize(property.Name), detection.Properties[i].SimpleType, db.UnknownObject)
if unknownObject != nil {
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(unknownObject), "valid_unknown_pattern")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(unknownObject), "valid_unknown_pattern", nil)
continue
}

// check extended patterns
extendedUnknownObject := classifier.matchObjectPatterns(normalize_key.Normalize(property.Name), detection.Properties[i].SimpleType, db.ExtendedUnknownObject)
if extendedUnknownObject != nil {
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(extendedUnknownObject), "valid_extended_pattern")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(extendedUnknownObject), "valid_extended_pattern", nil)
continue
}

// check identifier patterns
matchedKnownIdentifier := classifier.matchKnownPersonObjectPatterns(normalize_key.Normalize(property.Name), true)
if matchedKnownIdentifier != nil {
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], matchedKnownIdentifier.DataType, "known_database_identifier")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], matchedKnownIdentifier.DataType, "known_database_identifier", nil)
continue
}

classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "invalid_property")
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "invalid_property", nil)
}

classifiedDatatype.Classification.Decision = classify.ClassificationDecision{
Expand All @@ -311,24 +323,24 @@ func (classifier *Classifier) classifyObjectWithIdentifierProperties(classifiedD
associatedObjectProperties := false
for i, property := range classifiedDatatype.Properties {
if isJSDetection && classify.PropertyStopWordDetected(normalize_key.Normalize(property.Name)) {
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "stop_word")
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "stop_word", nil)
continue
}

matchedDBIdentifier := classifier.matchKnownPersonObjectPatterns(normalize_key.Normalize(property.Name), true)
if matchedDBIdentifier != nil {
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], matchedDBIdentifier.DataType, "known_database_identifier")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], matchedDBIdentifier.DataType, "known_database_identifier", nil)
continue
}

matchedAssociatedObjectPattern := classifier.matchObjectPatterns(normalize_key.Normalize(property.Name), detection.Properties[i].SimpleType, db.AssociatedObject)
if matchedAssociatedObjectPattern != nil {
associatedObjectProperties = true
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(matchedAssociatedObjectPattern), "valid_associated_object_pattern")
classifiedDatatype.Properties[i] = classifyAsValid(detection.Properties[i], classifier.datatypeFromPattern(matchedAssociatedObjectPattern), "valid_associated_object_pattern", nil)
continue
}

classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "invalid_property")
classifiedDatatype.Properties[i] = classifyAsInvalid(detection.Properties[i], "invalid_property", nil)

}

Expand Down Expand Up @@ -407,18 +419,19 @@ func classifyObjectAsInvalid(D *ClassificationRequestDetection, reason string) *

// schema object did not pass initial checks; mark all fields as invalid
for _, property := range D.Properties {
classifiedDatatype.Properties = append(classifiedDatatype.Properties, classifyAsInvalid(property, "belongs_to_invalid_object"))
classifiedDatatype.Properties = append(classifiedDatatype.Properties, classifyAsInvalid(property, "belongs_to_invalid_object", nil))
}

return classifiedDatatype
}

func classifyAsValid(D *ClassificationRequestDetection, datatype db.DataType, reason string) *ClassifiedDatatype {
func classifyAsValid(D *ClassificationRequestDetection, datatype db.DataType, reason string, subjectName *string) *ClassifiedDatatype {
return &ClassifiedDatatype{
Name: D.Name,
Classification: Classification{
Name: normalize_key.Normalize(D.Name),
DataType: &datatype,
Name: normalize_key.Normalize(D.Name),
DataType: &datatype,
SubjectName: subjectName,
Decision: classify.ClassificationDecision{
State: classify.Valid,
Reason: reason,
Expand All @@ -427,11 +440,12 @@ func classifyAsValid(D *ClassificationRequestDetection, datatype db.DataType, re
}
}

func classifyAsInvalid(D *ClassificationRequestDetection, reason string) *ClassifiedDatatype {
func classifyAsInvalid(D *ClassificationRequestDetection, reason string, subjectName *string) *ClassifiedDatatype {
return &ClassifiedDatatype{
Name: D.Name,
Classification: Classification{
Name: normalize_key.Normalize(D.Name),
Name: normalize_key.Normalize(D.Name),
SubjectName: subjectName,
Decision: classify.ClassificationDecision{
State: classify.Invalid,
Reason: reason,
Expand Down
Loading

0 comments on commit e86b4c0

Please sign in to comment.