Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new shuffling support from data frame to partition a data set #5

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
Class {
#name : #AIHashPartitionerDataFrameTest,
#superclass : #TestCase,
#category : #'AI-DataPartitioners-DataFrameTests'
#name : 'AIHashPartitionerDataFrameTest',
#superclass : 'TestCase',
#category : 'AI-DataPartitioners-DataFrameTests',
#package : 'AI-DataPartitioners-DataFrameTests'
}

{ #category : #tests }
{ #category : 'tests' }
AIHashPartitionerDataFrameTest >> testPartitionDataFrameWithCustomRowNames [

| dataFrame proportions subsets |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
Class {
#name : #AIRandomPartitionerDataFrameTest,
#superclass : #TestCase,
#category : #'AI-DataPartitioners-DataFrameTests'
#name : 'AIRandomPartitionerDataFrameTest',
#superclass : 'TestCase',
#instVars : [
'df'
],
#category : 'AI-DataPartitioners-DataFrameTests',
#package : 'AI-DataPartitioners-DataFrameTests'
}

{ #category : #tests }
{ #category : 'running' }
AIRandomPartitionerDataFrameTest >> setUp [
	"Prepare the shared fixture: a three-row DataFrame stored in df,
	with rows named A, B, C and columns City, Population, BeenThere."

	| cityRows |
	super setUp.

	cityRows := #( #( 'Barcelona' 1.609 true ) #( 'Dubai' 2.789 true ) #( 'London' 8.788 false ) ).
	df := DataFrame withRows: cityRows.

	"Row names are assigned before column names, one cascade on the same frame."
	df
		rowNames: #( 'A' 'B' 'C' );
		columnNames: #( 'City' 'Population' 'BeenThere' )
]

{ #category : 'tests' }
AIRandomPartitionerDataFrameTest >> testPartitionDataFrameWithCustomRowNames [
| dataFrame sizes partitioner subsets |

Expand All @@ -30,7 +44,7 @@ AIRandomPartitionerDataFrameTest >> testPartitionDataFrameWithCustomRowNames [
self assert: (dataFrame row: rowName) equals: (eachSubset row: rowName) ] ].
]

{ #category : #tests }
{ #category : 'tests' }
AIRandomPartitionerDataFrameTest >> testPartitionDataSeriesWithCustomKeys [
| series sizes partitioner subsets |

Expand All @@ -54,7 +68,7 @@ AIRandomPartitionerDataFrameTest >> testPartitionDataSeriesWithCustomKeys [
self assert: key equals: (series keyAtValue: value) ] ].
]

{ #category : #tests }
{ #category : 'tests' }
AIRandomPartitionerDataFrameTest >> testPartitionDataSeriesWithDefaultKeys [
| series sizes partitioner subsets |

Expand All @@ -76,3 +90,38 @@ AIRandomPartitionerDataFrameTest >> testPartitionDataSeriesWithDefaultKeys [
eachSubset keysAndValuesDo: [ :key :value |
self assert: key equals: (series keyAtValue: value) ] ].
]

{ #category : 'tests' }
AIRandomPartitionerDataFrameTest >> testSplitTrainTestFromUsingTargetColumnWithProportionsShuffle [
	"Split the df fixture on the 'BeenThere' column with proportions 0.7 / 0.3
	and seed 1, then compare the answer with a hand-built partitioned data set:
	rows A and C in the training part, row B in the testing part."

	| trainFeatures testFeatures trainTarget testTarget expected actual |

	"Expected feature frames (everything except the target column)."
	trainFeatures := DataFrame
		                 withRows: #( #( 'Barcelona' 1.609 ) #( 'London' 8.788 ))
		                 rowNames: #('A' 'C')
		                 columnNames: #( 'City' 'Population' ).
	testFeatures := DataFrame
		                withRows: #( #( 'Dubai' 2.789 ))
		                rowNames: #('B')
		                columnNames: #( 'City' 'Population' ).

	"Expected target frames (only the target column)."
	trainTarget := DataFrame
		               withRows: #( #( true ) #( false ))
		               rowNames: #('A' 'C')
		               columnNames: #( 'BeenThere' ).
	testTarget := DataFrame
		              withRows: #( #( true ))
		              rowNames: #('B')
		              columnNames: #( 'BeenThere' ).

	expected := AIPartitionedDataSet new.
	expected
		xTrain: trainFeatures;
		xTest: testFeatures;
		yTrain: trainTarget;
		yTest: testTarget.

	actual := AIRandomPartitioner new
		          splitTrainTestFrom: df
		          usingTargetColumn: #('BeenThere')
		          withProportions: #(0.7 0.3)
		          seed: 1.

	self assert: actual equals: expected
]
2 changes: 1 addition & 1 deletion src/AI-DataPartitioners-DataFrameTests/package.st
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Package { #name : #'AI-DataPartitioners-DataFrameTests' }
Package { #name : 'AI-DataPartitioners-DataFrameTests' }
9 changes: 5 additions & 4 deletions src/AI-DataPartitioners-Tests/AIHashPartitionerTest.class.st
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
Class {
#name : #AIHashPartitionerTest,
#superclass : #TestCase,
#category : #'AI-DataPartitioners-Tests'
#name : 'AIHashPartitionerTest',
#superclass : 'TestCase',
#category : 'AI-DataPartitioners-Tests',
#package : 'AI-DataPartitioners-Tests'
}

{ #category : #tests }
{ #category : 'tests' }
AIHashPartitionerTest >> testSplitDataWithProportionsCase1 [

| data proportions expectedSizes subsets subsetsSizes |
Expand Down
19 changes: 12 additions & 7 deletions src/AI-DataPartitioners-Tests/AIRandomPartitionerTest.class.st
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
Class {
#name : #AIRandomPartitionerTest,
#superclass : #TestCase,
#category : #'AI-DataPartitioners-Tests'
#name : 'AIRandomPartitionerTest',
#superclass : 'TestCase',
#instVars : [
'partitioner',
'df'
],
#category : 'AI-DataPartitioners-Tests',
#package : 'AI-DataPartitioners-Tests'
}

{ #category : #tests }
{ #category : 'tests' }
AIRandomPartitionerTest >> testSplitDataWithProportionsCase1 [

| data proportions expectedSizes subsets subsetsSizes |
Expand All @@ -20,7 +25,7 @@ AIRandomPartitionerTest >> testSplitDataWithProportionsCase1 [
self assert: subsetsSizes equals: expectedSizes
]

{ #category : #tests }
{ #category : 'tests' }
AIRandomPartitionerTest >> testSplitDataWithProportionsCase2 [

| data proportions expectedSizes subsets subsetsSizes |
Expand All @@ -36,7 +41,7 @@ AIRandomPartitionerTest >> testSplitDataWithProportionsCase2 [
self assert: subsetsSizes equals: expectedSizes
]

{ #category : #tests }
{ #category : 'tests' }
AIRandomPartitionerTest >> testSplitDataWithSizesCase1 [

| data sizes subsets subsetsSizes |
Expand All @@ -51,7 +56,7 @@ AIRandomPartitionerTest >> testSplitDataWithSizesCase1 [
self assert: subsetsSizes equals: sizes
]

{ #category : #tests }
{ #category : 'tests' }
AIRandomPartitionerTest >> testSplitDataWithSizesCase2 [

| data sizes subsets subsetsSizes |
Expand Down
2 changes: 1 addition & 1 deletion src/AI-DataPartitioners-Tests/package.st
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Package { #name : #'AI-DataPartitioners-Tests' }
Package { #name : 'AI-DataPartitioners-Tests' }
11 changes: 6 additions & 5 deletions src/AI-DataPartitioners/AIHashPartitioner.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,19 @@ subsets := AIHashPartitioner split: dataFrame withProportions: #(0.8 0.2).
```
"
Class {
#name : #AIHashPartitioner,
#superclass : #Object,
#category : #'AI-DataPartitioners'
#name : 'AIHashPartitioner',
#superclass : 'Object',
#category : 'AI-DataPartitioners',
#package : 'AI-DataPartitioners'
}

{ #category : #api }
{ #category : 'api' }
AIHashPartitioner class >> split: aCollection withProportions: aCollectionOfProportions [
	"Class-side convenience: create a new partitioner and delegate to the
	instance-side method with the same selector, answering whatever it answers."

	| partitioner |
	partitioner := self new.
	^ partitioner
		  split: aCollection
		  withProportions: aCollectionOfProportions
]

{ #category : #api }
{ #category : 'api' }
AIHashPartitioner >> split: aCollection withProportions: aCollectionOfProportions [

| cumulativeProportionsMap indexesMap |
Expand Down
101 changes: 101 additions & 0 deletions src/AI-DataPartitioners/AIPartitionedDataSet.class.st
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"
Encapsulates the result of a train-test split, holding four variables:

# Attributes

- xTrain `DataFrame`: Training features.
- xTest `DataFrame`: Testing features.
- yTrain `DataFrame` or `DataSeries`: Training target variable.
- yTest `DataFrame` or `DataSeries`: Testing target variable.

This class is designed to provide a structured representation of the result obtained from splitting a dataset into training and testing sets, making it convenient to manage and access the different subsets of data during machine learning workflows.

"
Class {
	#name : 'AIPartitionedDataSet',
	#superclass : 'Object',
	#instVars : [
		'xTrain',
		'xTest',
		'yTrain',
		'yTest'
	],
	#category : 'AI-DataPartitioners',
	#package : 'AI-DataPartitioners'
}

{ #category : 'comparing' }
AIPartitionedDataSet >> = aPartitionedDataSet [
	"Answer true when the argument is the receiver itself, or when it is of the
	same class and its four parts are equal to the receiver's. The parts are
	compared in the order xTest, yTest, xTrain, yTrain and the comparison stops
	at the first mismatch."

	self == aPartitionedDataSet ifTrue: [ ^ true ].

	^ self class = aPartitionedDataSet class and: [
		  self xTest = aPartitionedDataSet xTest and: [
			  self yTest = aPartitionedDataSet yTest and: [
				  self xTrain = aPartitionedDataSet xTrain and: [
					  self yTrain = aPartitionedDataSet yTrain ] ] ] ]
]

{ #category : 'comparing' }
AIPartitionedDataSet >> hash [
	"Answer a hash combining the species hash with the hashes of the four parts.
	Defined because #= is redefined."

	| partHashes |
	partHashes := {
		              self xTest hash.
		              self xTrain hash.
		              self yTest hash.
		              self yTrain hash }.

	"bitXor: is associative and commutative, so folding left to right gives the
	same value as a nested chain of bitXor: sends."
	^ partHashes
		  inject: self species hash
		  into: [ :combined :each | combined bitXor: each ]
]

{ #category : 'accessing' }
AIPartitionedDataSet >> xTest [
	"Answer the testing features."

	^ xTest
]

{ #category : 'accessing' }
AIPartitionedDataSet >> xTest: anObject [
	"Store the testing features."

	xTest := anObject
]

{ #category : 'accessing' }
AIPartitionedDataSet >> xTrain [
	"Answer the training features."

	^ xTrain
]

{ #category : 'accessing' }
AIPartitionedDataSet >> xTrain: anObject [
	"Store the training features."

	xTrain := anObject
]

{ #category : 'accessing' }
AIPartitionedDataSet >> yTest [
	"Answer the testing target variable."

	^ yTest
]

{ #category : 'accessing' }
AIPartitionedDataSet >> yTest: anObject [
	"Store the testing target variable."

	yTest := anObject
]

{ #category : 'accessing' }
AIPartitionedDataSet >> yTrain [
	"Answer the training target variable."

	^ yTrain
]

{ #category : 'accessing' }
AIPartitionedDataSet >> yTrain: anObject [
	"Store the training target variable."

	yTrain := anObject
]
Loading
Loading