netobserv · jotak · Mar 19, 2024 · Mar 19, 2024 · jotak · Mar 19, 2024
diff --git a/apis/flowcollector/v1beta1/flowcollector_types.go b/apis/flowcollector/v1beta1/flowcollector_types.go
@@ -505,13 +505,41 @@ type FlowCollectorFLP struct {
 	// This feature requires the "topology.kubernetes.io/zone" label to be set on nodes.
 	AddZone *bool `json:"addZone,omitempty"`
 
+	//+optional
+	// `deduper` allows sampling or dropping flows identified as duplicates, in order to save on resource usage.
+	Deduper *FLPDeduper `json:"deduper,omitempty"`
+
 	// `debug` allows setting some aspects of the internal configuration of the flow processor.
 	// This section is aimed exclusively for debugging and fine-grained performance optimizations,
 	// such as `GOGC` and `GOMAXPROCS` env vars. Set these values at your own risk.
 	// +optional
 	Debug DebugConfig `json:"debug,omitempty"`
 }
 
+type FLPDeduperMode string // FLPDeduperMode names a Processor de-duplication mode; see the constants below.
+
+const (
+	FLPDeduperDisabled FLPDeduperMode = "Disabled" // Processor-based de-duplication is turned off.
+	FLPDeduperDrop     FLPDeduperMode = "Drop"     // Every flow considered a duplicate is dropped.
+	FLPDeduperSample   FLPDeduperMode = "Sample"   // Flows considered duplicates are randomly sampled at the `sampling` rate.
+)
+
+// `FLPDeduper` defines the desired configuration for the FLP-based deduper.
+type FLPDeduper struct {
+	// Set the Processor deduper mode (de-duplication). It comes in addition to the Agent deduper because the Agent cannot de-duplicate the same flows reported from different nodes.<br>
+	// - Use `Drop` to drop every flow considered a duplicate, which saves more on resource usage but potentially loses some information such as the network interfaces used from the peer.<br>
+	// - Use `Sample` to randomly keep only 1 flow out of 50 (by default) among the ones considered duplicates. This is a compromise between dropping every duplicate and keeping every duplicate. This sampling action comes in addition to the Agent-based sampling. If both Agent and Processor sampling are 50, the combined sampling is 1:2500.<br>
+	// - Use `Disabled` to turn off Processor-based de-duplication.<br>
+	// +kubebuilder:validation:Enum:="Disabled";"Drop";"Sample"
+	// +kubebuilder:default:=Disabled
+	Mode FLPDeduperMode `json:"mode,omitempty"`
+
+	// `sampling` is the sampling rate when deduper `mode` is `Sample`.
+	//+kubebuilder:validation:Minimum=0
+	//+kubebuilder:default:=50
+	Sampling int32 `json:"sampling,omitempty"`
+}
+
 const (
 	HPAStatusDisabled = "DISABLED"
 	HPAStatusEnabled  = "ENABLED"

diff --git a/apis/flowcollector/v1beta1/zz_generated.conversion.go b/apis/flowcollector/v1beta1/zz_generated.conversion.go
diff --git a/apis/flowcollector/v1beta1/zz_generated.deepcopy.go b/apis/flowcollector/v1beta1/zz_generated.deepcopy.go
diff --git a/apis/flowcollector/v1beta2/flowcollector_types.go b/apis/flowcollector/v1beta2/flowcollector_types.go
@@ -462,13 +462,41 @@ type FlowCollectorFLP struct {
 	// This feature requires the "topology.kubernetes.io/zone" label to be set on nodes.
 	AddZone *bool `json:"addZone,omitempty"`
 
+	//+optional
+	// `deduper` allows sampling or dropping flows identified as duplicates, in order to save on resource usage.
+	Deduper *FLPDeduper `json:"deduper,omitempty"`
+
 	// `advanced` allows setting some aspects of the internal configuration of the flow processor.
 	// This section is aimed mostly for debugging and fine-grained performance optimizations,
 	// such as `GOGC` and `GOMAXPROCS` env vars. Set these values at your own risk.
 	// +optional
 	Advanced *AdvancedProcessorConfig `json:"advanced,omitempty"`
 }
 
+type FLPDeduperMode string // FLPDeduperMode names a Processor de-duplication mode; see the constants below.
+
+const (
+	FLPDeduperDisabled FLPDeduperMode = "Disabled" // Processor-based de-duplication is turned off.
+	FLPDeduperDrop     FLPDeduperMode = "Drop"     // Every flow considered a duplicate is dropped.
+	FLPDeduperSample   FLPDeduperMode = "Sample"   // Flows considered duplicates are randomly sampled at the `sampling` rate.
+)
+
+// `FLPDeduper` defines the desired configuration for the FLP-based deduper.
+type FLPDeduper struct {
+	// Set the Processor deduper mode (de-duplication). It comes in addition to the Agent deduper because the Agent cannot de-duplicate the same flows reported from different nodes.<br>
+	// - Use `Drop` to drop every flow considered a duplicate, which saves more on resource usage but potentially loses some information such as the network interfaces used from the peer.<br>
+	// - Use `Sample` to randomly keep only 1 flow out of 50 (by default) among the ones considered duplicates. This is a compromise between dropping every duplicate and keeping every duplicate. This sampling action comes in addition to the Agent-based sampling. If both Agent and Processor sampling are 50, the combined sampling is 1:2500.<br>
+	// - Use `Disabled` to turn off Processor-based de-duplication.<br>
+	// +kubebuilder:validation:Enum:="Disabled";"Drop";"Sample"
+	// +kubebuilder:default:=Disabled
+	Mode FLPDeduperMode `json:"mode,omitempty"`
+
+	// `sampling` is the sampling rate when deduper `mode` is `Sample`.
+	//+kubebuilder:validation:Minimum=0
+	//+kubebuilder:default:=50
+	Sampling int32 `json:"sampling,omitempty"`
+}
+
 type HPAStatus string
 
 const (

diff --git a/apis/flowcollector/v1beta2/zz_generated.deepcopy.go b/apis/flowcollector/v1beta2/zz_generated.deepcopy.go
diff --git a/bundle/manifests/flows.netobserv.io_flowcollectors.yaml b/bundle/manifests/flows.netobserv.io_flowcollectors.yaml
@@ -1873,6 +1873,38 @@ spec:
                           scenarios.'
                         type: object
                     type: object
+                  deduper:
+                    description: '`deduper` allows sampling or dropping flows identified
+                      as duplicates, in order to save on resource usage.'
+                    properties:
+                      mode:
+                        default: Disabled
+                        description: Set the Processor deduper mode (de-duplication).
+                          It comes in addition to the Agent deduper because the Agent
+                          cannot de-duplicate the same flows reported from different nodes.<br>
+                          - Use `Drop` to drop every flow considered a duplicate,
+                          which saves more on resource usage but potentially loses
+                          some information such as the network interfaces used from
+                          the peer.<br> - Use `Sample` to randomly keep only 1 flow out of
+                          50 (by default) among the ones considered duplicates.
+                          This is a compromise between dropping every duplicate and
+                          keeping every duplicate. This sampling action comes in
+                          addition to the Agent-based sampling. If both Agent and
+                          Processor sampling are 50, the combined sampling is 1:2500.<br>
+                          - Use `Disabled` to turn off Processor-based de-duplication.<br>
+                        enum:
+                        - Disabled
+                        - Drop
+                        - Sample
+                        type: string
+                      sampling:
+                        default: 50
+                        description: '`sampling` is the sampling rate when deduper
+                          `mode` is `Sample`.'
+                        format: int32
+                        minimum: 0
+                        type: integer
+                    type: object
                   dropUnusedFields:
                     default: true
                     description: '`dropUnusedFields` [deprecated (*)] this setting
@@ -4990,6 +5022,38 @@ spec:
                       in the flows data. This is useful in a multi-cluster context.
                       When using OpenShift, leave empty to make it automatically determined.'
                     type: string
+                  deduper:
+                    description: '`deduper` allows sampling or dropping flows identified
+                      as duplicates, in order to save on resource usage.'
+                    properties:
+                      mode:
+                        default: Disabled
+                        description: Set the Processor deduper mode (de-duplication).
+                          It comes in addition to the Agent deduper because the Agent
+                          cannot de-duplicate the same flows reported from different nodes.<br>
+                          - Use `Drop` to drop every flow considered a duplicate,
+                          which saves more on resource usage but potentially loses
+                          some information such as the network interfaces used from
+                          the peer.<br> - Use `Sample` to randomly keep only 1 flow out of
+                          50 (by default) among the ones considered duplicates.
+                          This is a compromise between dropping every duplicate and
+                          keeping every duplicate. This sampling action comes in
+                          addition to the Agent-based sampling. If both Agent and
+                          Processor sampling are 50, the combined sampling is 1:2500.<br>
+                          - Use `Disabled` to turn off Processor-based de-duplication.<br>
+                        enum:
+                        - Disabled
+                        - Drop
+                        - Sample
+                        type: string
+                      sampling:
+                        default: 50
+                        description: '`sampling` is the sampling rate when deduper
+                          `mode` is `Sample`.'
+                        format: int32
+                        minimum: 0
+                        type: integer
+                    type: object
                   imagePullPolicy:
                     default: IfNotPresent
                     description: '`imagePullPolicy` is the Kubernetes pull policy

diff --git a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml
@@ -743,6 +743,12 @@ spec:
         path: loki.readTimeout
       - displayName: Namespace
         path: namespace
+      - displayName: Deduper
+        path: processor.deduper
+      - displayName: Mode
+        path: processor.deduper.mode
+      - displayName: Sampling
+        path: processor.deduper.sampling
       - displayName: Log types
         path: processor.logTypes
       - displayName: Disable alerts

diff --git a/config/crd/bases/flows.netobserv.io_flowcollectors.yaml b/config/crd/bases/flows.netobserv.io_flowcollectors.yaml
@@ -1860,6 +1860,38 @@ spec:
                           scenarios.'
                         type: object
                     type: object
+                  deduper:
+                    description: '`deduper` allows sampling or dropping flows identified
+                      as duplicates, in order to save on resource usage.'
+                    properties:
+                      mode:
+                        default: Disabled
+                        description: Set the Processor deduper mode (de-duplication).
+                          It comes in addition to the Agent deduper because the Agent
+                          cannot de-duplicate the same flows reported from different nodes.<br>
+                          - Use `Drop` to drop every flow considered a duplicate,
+                          which saves more on resource usage but potentially loses
+                          some information such as the network interfaces used from
+                          the peer.<br> - Use `Sample` to randomly keep only 1 flow out of
+                          50 (by default) among the ones considered duplicates.
+                          This is a compromise between dropping every duplicate and
+                          keeping every duplicate. This sampling action comes in
+                          addition to the Agent-based sampling. If both Agent and
+                          Processor sampling are 50, the combined sampling is 1:2500.<br>
+                          - Use `Disabled` to turn off Processor-based de-duplication.<br>
+                        enum:
+                        - Disabled
+                        - Drop
+                        - Sample
+                        type: string
+                      sampling:
+                        default: 50
+                        description: '`sampling` is the sampling rate when deduper
+                          `mode` is `Sample`.'
+                        format: int32
+                        minimum: 0
+                        type: integer
+                    type: object
                   dropUnusedFields:
                     default: true
                     description: '`dropUnusedFields` [deprecated (*)] this setting
@@ -4977,6 +5009,38 @@ spec:
                       in the flows data. This is useful in a multi-cluster context.
                       When using OpenShift, leave empty to make it automatically determined.'
                     type: string
+                  deduper:
+                    description: '`deduper` allows sampling or dropping flows identified
+                      as duplicates, in order to save on resource usage.'
+                    properties:
+                      mode:
+                        default: Disabled
+                        description: Set the Processor deduper mode (de-duplication).
+                          It comes in addition to the Agent deduper because the Agent
+                          cannot de-duplicate the same flows reported from different nodes.<br>
+                          - Use `Drop` to drop every flow considered a duplicate,
+                          which saves more on resource usage but potentially loses
+                          some information such as the network interfaces used from
+                          the peer.<br> - Use `Sample` to randomly keep only 1 flow out of
+                          50 (by default) among the ones considered duplicates.
+                          This is a compromise between dropping every duplicate and
+                          keeping every duplicate. This sampling action comes in
+                          addition to the Agent-based sampling. If both Agent and
+                          Processor sampling are 50, the combined sampling is 1:2500.<br>
+                          - Use `Disabled` to turn off Processor-based de-duplication.<br>
+                        enum:
+                        - Disabled
+                        - Drop
+                        - Sample
+                        type: string
+                      sampling:
+                        default: 50
+                        description: '`sampling` is the sampling rate when deduper
+                          `mode` is `Sample`.'
+                        format: int32
+                        minimum: 0
+                        type: integer
+                    type: object
                   imagePullPolicy:
                     default: IfNotPresent
                     description: '`imagePullPolicy` is the Kubernetes pull policy