A way to store and manipulate data
The library exposes in-memory storage for dynamically typed data. The storage is represented by the `DataFrame` class.
- Usage example
- DataFrame API
- Get the header
- Get the rows
- Get the series
- Get the shape
- Add a series
- Drop a series by a name
- Drop a series by an index
- Sample a dataframe from rows
- Sample a dataframe from series indices
- Sample a dataframe from series names
- Save a dataframe
- Shuffle rows of a dataframe
- Get a JSON representation
- Convert to Matrix
- Get a series by name
- Get a series by index
- Map values
- Map values of a series
- Ways to create a dataframe
- Prefilled dataframes
- Contacts
import 'package:ml_dataframe/ml_dataframe.dart';
/// Builds a dataframe from raw rows and prints its tabular representation.
void main() {
  // The first row of the raw data is used as the header by default.
  final iris = DataFrame([
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ]);

  print(iris);
  // DataFrame (5 x 6)
  // Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
  // 1 5.1 3.5 1.4 0.2 Iris-setosa
  // 2 4.9 3.0 1.4 0.2 Iris-setosa
  // 89 5.6 3.0 4.1 1.3 Iris-versicolor
  // 90 5.5 2.5 4.0 1.3 Iris-versicolor
  // 91 5.5 2.6 4.4 1.2 Iris-versicolor
}
By default, the very first row is considered a header, unless one specifies a custom header or requests an autogenerated one. More on this is here
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the header (the column names) of a dataframe.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  print(iris.header);
  // ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the data rows of a dataframe (the header row is not included).
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  print(iris.rows);
  // [
  // [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
  // [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
  // [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
  // [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
  // [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  // ],
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the series (named columns) of a dataframe.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  // Each series pairs a column name with the values of that column.
  print(iris.series);
  // [
  // 'Id': [1, 2, 89, 90, 91],
  // 'SepalLengthCm': [5.1, 4.9, 5.6, 5.5, 5.5],
  // 'SepalWidthCm': [3.5, 3.0, 3.0, 2.5, 2.6],
  // 'PetalLengthCm': [1.4, 1.4, 4.1, 4.0, 4.4],
  // 'PetalWidthCm': [0.2, 0.2, 1.3, 1.3, 1.2],
  // 'Species': ['Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor'],
  // ],
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the shape of a dataframe.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  print(iris.shape);
  // [5, 6] - 5 rows, 6 columns
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Adds a new series to a dataframe and prints the added series.
void main() {
  // The series needs one value per dataframe row; the dataframe below has
  // 5 rows, so the series holds 5 values.
  final firstSeries = Series('super_series', [1, 2, 3, 4, 5]);
  final dataframe = DataFrame([
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ]);

  // The method doesn't mutate the original dataframe
  final modifiedDataframe = dataframe.addSeries([firstSeries]);

  // The new series is added after the existing ones, so it is the last one.
  print(modifiedDataframe.series.last);
  // 'super_series': [1, 2, 3, 4, 5]
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Drops a series from a dataframe by its name.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  print(iris.shape);
  // [5, 6] - 5 rows, 6 columns

  // The method doesn't mutate the original dataframe
  final withoutId = iris.dropSeries(names: ['Id']);

  print(withoutId.shape);
  // [5, 5] - after the series was dropped, the number of columns decreased by one
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Drops a series from a dataframe by its index.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  print(iris.shape);
  // [5, 6] - 5 rows, 6 columns

  // The method doesn't mutate the original dataframe
  final withoutFirstSeries = iris.dropSeries(indices: [0]);

  print(withoutFirstSeries.shape);
  // [5, 5] - after the series was dropped, the number of columns decreased by one
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Samples a new dataframe from the rows of an existing one by row indices.
void main() {
  final dataframe = DataFrame([
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ]);

  // Row indices are zero-based and don't count the header: the dataframe has
  // 5 rows, so the first row is 0 and the last row is 4.
  final sampled = dataframe.sampleFromRows([0, 4]);

  print(sampled);
  // DataFrame (2 x 6)
  // Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
  // 1 5.1 3.5 1.4 0.2 Iris-setosa
  // 91 5.5 2.6 4.4 1.2 Iris-versicolor
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Samples a new dataframe from selected series, chosen by series indices.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  // Keep only the first two series.
  final firstTwoSeries = iris.sampleFromSeries(indices: [0, 1]);

  print(firstTwoSeries);
  // DataFrame (5 x 2)
  // Id SepalLengthCm
  // 1 5.1
  // 2 4.9
  // 89 5.6
  // 90 5.5
  // 91 5.5
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Samples a new dataframe from selected series, chosen by series names.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  // Keep only the series with the given names.
  final idAndSepalLength = iris.sampleFromSeries(names: ['Id', 'SepalLengthCm']);

  print(idAndSepalLength);
  // DataFrame (5 x 2)
  // Id SepalLengthCm
  // 1 5.1
  // 2 4.9
  // 89 5.6
  // 90 5.5
  // 91 5.5
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Saves a dataframe to a JSON file.
void main() async {
  const targetPath = 'path/to/json/file.json';

  final iris = DataFrame([
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ]);

  // Saving is asynchronous, so wait for it to complete.
  await iris.saveAsJson(targetPath);
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Shuffles the rows of a dataframe and prints it before and after.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final original = DataFrame(rawData);

  print(original);
  // DataFrame (5 x 6)
  // Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
  // 1 5.1 3.5 1.4 0.2 Iris-setosa
  // 2 4.9 3.0 1.4 0.2 Iris-setosa
  // 89 5.6 3.0 4.1 1.3 Iris-versicolor
  // 90 5.5 2.5 4.0 1.3 Iris-versicolor
  // 91 5.5 2.6 4.4 1.2 Iris-versicolor

  // Like the other methods, `shuffle` returns a new dataframe and leaves the
  // source dataframe untouched.
  final reordered = original.shuffle();

  print(reordered);
  // DataFrame (5 x 6)
  // Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
  // 89 5.6 3.0 4.1 1.3 Iris-versicolor
  // 1 5.1 3.5 1.4 0.2 Iris-setosa
  // 91 5.5 2.6 4.4 1.2 Iris-versicolor
  // 2 4.9 3.0 1.4 0.2 Iris-setosa
  // 90 5.5 2.5 4.0 1.3 Iris-versicolor
}
One can use the `seed` parameter to keep the order of rows the same regardless of the number of `shuffle` calls:
// Passing the same seed keeps the resulting order of rows the same across calls.
dataframe.shuffle(seed: 10);
import 'package:ml_dataframe/ml_dataframe.dart';
/// Converts a dataframe to its JSON representation.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  // The result is a serializable map.
  final serialized = iris.toJson();
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Converts a purely numeric dataframe to a matrix and prints it.
void main() {
  final numericData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
    [1, 5.1, 3.5, 1.4, 0.2],
    [2, 4.9, 3.0, 1.4, 0.2],
    [89, 5.6, 3.0, 4.1, 1.3],
    [90, 5.5, 2.5, 4.0, 1.3],
    [91, 5.5, 2.6, 4.4, 1.2],
  ];
  final measurements = DataFrame(numericData);

  // Because of the internal Float32 representation of numbers, the output
  // contains some round-off errors.
  print(measurements.toMatrix());
  // Matrix 5 x 5:
  // (1.0, 5.099999904632568, 3.5, 1.399999976158142, 0.20000000298023224)
  // (2.0, 4.900000095367432, 3.0, 1.399999976158142, 0.20000000298023224)
  // (89.0, 5.599999904632568, 3.0, 4.099999904632568, 1.2999999523162842)
  // (90.0, 5.5, 2.5, 4.0, 1.2999999523162842)
  // (91.0, 5.5, 2.5999999046325684, 4.400000095367432, 1.2000000476837158)
}
The method throws an error if the dataframe contains values that cannot be converted to a number.
import 'package:ml_dataframe/ml_dataframe.dart';
/// Looks up a series of a dataframe by its index.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  // An integer key selects the series by its position.
  print(iris[0]);
  // Id: [1, 2, 89, 90, 91]
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Looks up a series of a dataframe by its name.
void main() {
  final rawData = [
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];
  final iris = DataFrame(rawData);

  // A string key selects the series by its name.
  print(iris['Id']);
  // Id: [1, 2, 89, 90, 91]
}
import 'package:ml_dataframe/ml_dataframe.dart';

/// Maps every value of a dataframe to a new value and prints the result.
void main() {
  final data = DataFrame([
    ['col_1', 'col_2', 'col_3'],
    [2, 20, 200],
    [3, 30, 300],
    [4, 40, 400],
  ]);

  // The first generic type is the type of the source value, the second
  // generic type is the type of the mapped value.
  final modifiedData = data.map<num, num>((value) => value * 2);

  print(modifiedData);
  // DataFrame (3 x 3)
  // col_1 col_2 col_3
  // 4 40 400
  // 6 60 600
  // 8 80 800
}
import 'package:ml_dataframe/ml_dataframe.dart';

/// Maps the values of a single series of a dataframe and prints the result.
void main() {
  final data = DataFrame([
    ['col_1', 'col_2', 'col_3'],
    [2, 20, 200],
    [3, 30, 300],
    [4, 40, 400],
  ]);

  // The first generic type is the type of the source value, the second
  // generic type is the type of the mapped value. Only the series named
  // 'col_2' is mapped; the other series are left as they are.
  final modifiedData =
      data.mapSeries<num, num>((value) => value * 2, name: 'col_2');

  print(modifiedData);
  // DataFrame (3 x 3)
  // col_1 col_2 col_3
  // 2 40 200
  // 3 60 300
  // 4 80 400
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Creates a dataframe from raw rows whose first row is the header.
void main() {
  final iris = DataFrame([
    ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ]);
}
By default, the very first row is considered a header. If the data does not have a header, one can use an autogenerated header by passing `headerExists: false` to the constructor:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Creates a dataframe from headerless rows and prints the autogenerated header.
void main() {
  // No header row here: every row is a data row.
  final iris = DataFrame(
    [
      [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
      [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
      [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
      [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
      [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
    ],
    headerExists: false,
  );

  print(iris.header);
}
It outputs `['col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6']`. `col_` is the default prefix for the autogenerated columns.
Also, if there is no header row in the data, one can use a predefined header:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Creates a dataframe from headerless rows using a predefined header.
void main() {
  final columnNames = [
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'feature_5',
    'feature_6',
  ];
  final rawRows = [
    [1, 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
    [2, 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
    [89, 5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'],
    [90, 5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'],
    [91, 5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'],
  ];

  final iris = DataFrame(rawRows, header: columnNames);
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Creates a dataframe from a CSV file.
void main() async {
  const csvPath = 'path/to/csv/file.csv';

  // Reading the file is asynchronous.
  final dataframe = await fromCsv(csvPath);
}
If the `csv` file does not have a header row, the corresponding flag must be provided:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Creates a dataframe from a CSV file that has no header row.
void main() async {
  const csvPath = 'path/to/csv/file.csv';

  // `headerExists: false` marks the file as having no header row.
  final dataframe = await fromCsv(csvPath, headerExists: false);
}
import 'package:ml_dataframe/ml_dataframe.dart';
/// Restores a dataframe from a JSON file.
void main() async {
  const jsonPath = 'path/to/json/file.json';

  // Reading the file is asynchronous.
  final dataframe = await fromJson(jsonPath);
}
This function works in conjunction with the `saveAsJson` method of `DataFrame`.
In order to test data processing algorithms, one can use "toy" datasets. The library exposes several of them:
One can create a dataframe filled with Iris data:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the prefilled Iris dataframe.
void main() {
  final iris = getIrisDataFrame();

  print(iris);
  // DataFrame (150 x 6)
  // Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
  // ...
}
One can create a dataframe filled with Pima Indians diabetes data:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the prefilled Pima Indians diabetes dataframe.
void main() {
  final diabetes = getPimaIndiansDiabetesDataFrame();

  print(diabetes);
  // DataFrame (768 x 9)
  // Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
  // ...
}
One can create a dataframe filled with Red wine quality data:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the prefilled red wine quality dataframe.
void main() {
  // The factory name follows the same `...DataFrame` casing as the other
  // prefilled dataframe factories.
  final data = getWineQualityDataFrame();

  print(data);
  // DataFrame (1599 x 12)
  // fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
  // ...
}
One can create a dataframe filled with Boston housing data:
import 'package:ml_dataframe/ml_dataframe.dart';
/// Prints the prefilled Boston housing dataframe.
void main() {
  // The factory name follows the same `...DataFrame` casing as the other
  // prefilled dataframe factories.
  final data = getHousingDataFrame();

  print(data);
  // DataFrame (506 x 14)
  // CRIM ZN INDUS CHAS NOX RM ... MEDV
  // 0.00632 18.0 2.31 0 0.538 6.575 ... 24.0
  // 0.02731 0.0 7.07 0 0.469 6.421 ... 21.6
  // 0.02729 0.0 7.07 0 0.469 7.185 ... 34.7
  // 0.03237 0.0 2.18 0 0.458 6.998 ... 33.4
  // 0.06905 0.0 2.18 0 0.458 7.147 ... 36.2
  // ... ... ... ... ... ... ... ...
  // 0.06263 0.0 11.93 0 0.573 6.593 ... 22.4
  // 0.04527 0.0 11.93 0 0.573 6.12 ... 20.6
  // 0.06076 0.0 11.93 0 0.573 6.976 ... 23.9
  // 0.10959 0.0 11.93 0 0.573 6.794 ... 22.0
  // 0.04741 0.0 11.93 0 0.573 6.03 ... 11.9
}
If you have questions, feel free to text me on