Skip to content

Commit

Permalink
Update example to use latest release of Tablesaw (#7669)
Browse files Browse the repository at this point in the history
  • Loading branch information
benmccann authored and LeeTZ committed Jul 11, 2018
1 parent 1e4df77 commit f011176
Showing 1 changed file with 61 additions and 82 deletions.
143 changes: 61 additions & 82 deletions doc/groovy/Tablesaw.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
"outputs": [],
"source": [
"%%classpath add mvn\n",
"tech.tablesaw tablesaw-plot 0.11.4\n",
"tech.tablesaw tablesaw-smile 0.11.4\n",
"tech.tablesaw tablesaw-beakerx 0.11.4"
"tech.tablesaw tablesaw-plot 0.21.0\n",
"tech.tablesaw tablesaw-beakerx 0.21.0\n",
"com.jimmoores quandl-tablesaw 2.0.0\n",
"com.github.haifengl smile-core 1.5.1"
]
},
{
Expand All @@ -30,11 +31,12 @@
"metadata": {},
"outputs": [],
"source": [
"%import tech.tablesaw.aggregate.*\n",
"%import static tech.tablesaw.aggregate.AggregateFunctions.*\n",
"%import static tech.tablesaw.api.QueryHelper.*\n",
"%import tech.tablesaw.api.*\n",
"%import tech.tablesaw.api.ml.clustering.*\n",
"%import tech.tablesaw.api.ml.regression.*\n",
"%import tech.tablesaw.columns.*\n",
"%import smile.clustering.*\n",
"%import smile.regression.*\n",
"\n",
"// display Tablesaw tables with BeakerX table display widget\n",
"tech.tablesaw.beakerx.TablesawDisplayer.register()"
Expand Down Expand Up @@ -95,8 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"import static tech.tablesaw.api.QueryHelper.column\n",
"tornadoes.structure().selectWhere(column(\"Column Type\").isEqualTo(\"FLOAT\"))"
"tornadoes.structure().where(stringColumn(\"Column Type\").isEqualTo(\"NUMBER\"))"
]
},
{
Expand All @@ -117,7 +118,7 @@
"source": [
"//Mapping operations\n",
"def month = tornadoes.dateColumn(\"Date\").month()\n",
"tornadoes.addColumn(month);\n",
"tornadoes.addColumns(month);\n",
"tornadoes.columnNames()"
]
},
Expand Down Expand Up @@ -148,7 +149,7 @@
"outputs": [],
"source": [
"//Performing totals and sub-totals\n",
"def injuriesByScale = tornadoes.median(\"Injuries\").by(\"Scale\")\n",
"def injuriesByScale = tornadoes.summarize(\"Injuries\", median).by(\"Scale\")\n",
"injuriesByScale.setName(\"Median injuries by Tornado Scale\")\n",
"injuriesByScale"
]
Expand All @@ -160,7 +161,8 @@
"outputs": [],
"source": [
"//Cross Tabs\n",
"CrossTab.xCount(tornadoes, tornadoes.categoryColumn(\"State\"), tornadoes.shortColumn(\"Scale\"))"
"//CrossTab.counts(tornadoes, tornadoes.stringColumn(\"State\"), tornadoes.numberColumn(\"Scale\"))\n",
"tornadoes.xTabCounts(\"State\", \"Scale\")"
]
},
{
Expand All @@ -171,9 +173,7 @@
"\n",
"K-means is the most common form of “centroid” clustering. Unlike classification, clustering is an unsupervised learning method. The categories are not predetermined. Instead, the goal is to search for natural groupings in the dataset, such that the members of each group are similar to each other and different from the members of the other groups. The K represents the number of groups to find.\n",
"\n",
"We’ll use a well known Scotch Whiskey dataset, which is used to cluster whiskeys according to their taste based on data collected from tasting notes. As always, we start by loading data and printing its structure.\n",
"\n",
"More description is available at https://jtablesaw.wordpress.com/2016/08/08/k-means-clustering-in-java/"
"We’ll use a well known Scotch Whiskey dataset, which is used to cluster whiskeys according to their taste based on data collected from tasting notes. As always, we start by loading data and printing its structure."
]
},
{
Expand All @@ -192,34 +192,7 @@
"metadata": {},
"outputs": [],
"source": [
"model = new Kmeans(\n",
" 5,\n",
" t.nCol(2), t.nCol(3), t.nCol(4), t.nCol(5), t.nCol(6), t.nCol(7),\n",
" t.nCol(8), t.nCol(9), t.nCol(10), t.nCol(11), t.nCol(12), t.nCol(13)\n",
");\n",
"\n",
"//print claster formation\n",
"model.clustered(t.column(\"Distillery\"));"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"//print centroids for each claster\n",
"model.labeledCentroids();"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"//gets the distortion for our model\n",
"model.distortion()"
"kMeans = new KMeans(t.copy().retainColumns(\"Body\", \"Sweetness\", \"Smoky\", \"Medicinal\", \"Tobacco\", \"Honey\", \"Spicy\", \"Winey\", \"Nutty\", \"Malty\", \"Fruity\", \"Floral\").asMatrix(), 5)"
]
},
{
Expand All @@ -228,20 +201,17 @@
"metadata": {},
"outputs": [],
"source": [
"def n = t.rowCount();\n",
"def kValues = new double[n - 2];\n",
"def distortions = new double[n - 2];\n",
"\n",
"for (int k = 2; k < n; k++) {\n",
" kValues[k - 2] = k;\n",
" def kmeans = new Kmeans(k,\n",
" t.nCol(2), t.nCol(3), t.nCol(4), t.nCol(5), t.nCol(6), t.nCol(7),\n",
" t.nCol(8), t.nCol(9), t.nCol(10), t.nCol(11), t.nCol(12), t.nCol(13)\n",
" );\n",
" distortions[k - 2] = kmeans.distortion();\n",
"Table table = Table.create(\"Clusters\");\n",
"StringColumn labelColumn = StringColumn.create(\"Label\");\n",
"NumberColumn clusterColumn = DoubleColumn.create(\"Cluster\");\n",
"table.addColumns(labelColumn);\n",
"table.addColumns(clusterColumn);\n",
"int[] clusters = kMeans.getClusterLabel();\n",
"for (int i = 0; i < clusters.length; i++) {\n",
" labelColumn.appendCell(t.stringColumn(\"distillery\").getString(i));\n",
" clusterColumn.append(clusters[i]);\n",
"}\n",
"def linearYPlot = new Plot(title: \"K-means clustering demo\", xLabel:\"K\", yLabel: \"distortion\")\n",
"linearYPlot << new Line(x: kValues, y: distortions)"
"table = table.sortAscendingOn(\"Cluster\", \"Label\");"
]
},
{
Expand All @@ -250,9 +220,7 @@
"source": [
"## Play (Money)ball with Linear Regression\n",
"\n",
"In baseball, you make the playoffs by winning more games than your rivals. The number of games the rivals win is out of your control so the A’s looked instead at how many wins it took historically to make the playoffs. They decided that 95 wins would give them a strong chance. Here’s how we might check that assumption in Tablesaw.\n",
"\n",
"More description is available at https://jtablesaw.wordpress.com/2016/07/31/play-moneyball-data-science-in-tablesaw/"
"In baseball, you make the playoffs by winning more games than your rivals. The number of games the rivals win is out of your control so the A’s looked instead at how many wins it took historically to make the playoffs. They decided that 95 wins would give them a strong chance. Here’s how we might check that assumption in Tablesaw."
]
},
{
Expand All @@ -261,21 +229,19 @@
"metadata": {},
"outputs": [],
"source": [
"import static tech.tablesaw.api.QueryHelper.column\n",
"\n",
"baseball = Table.read().csv(\"../resources/data/baseball.csv\");\n",
"\n",
"// filter to the data available at the start of the 2002 season\n",
"moneyball = baseball.selectWhere(column(\"year\").isLessThan(2002));\n",
"moneyball = baseball.where(numberColumn(\"year\").isLessThan(2002));\n",
"wins = moneyball.nCol(\"W\");\n",
"year = moneyball.nCol(\"Year\");\n",
"playoffs = moneyball.column(\"Playoffs\");\n",
"runDifference = moneyball.shortColumn(\"RS\").subtract(moneyball.shortColumn(\"RA\"));\n",
"moneyball.addColumn(runDifference);\n",
"runDifference = moneyball.numberColumn(\"RS\").subtract(moneyball.numberColumn(\"RA\"));\n",
"moneyball.addColumns(runDifference);\n",
"runDifference.setName(\"RD\");\n",
"\n",
"def Plot = new Plot(title: \"RD x Wins\", xLabel:\"RD\", yLabel: \"W\")\n",
"Plot << new Points(x: moneyball.numericColumn(\"RD\").toDoubleArray(), y: moneyball.numericColumn(\"W\").toDoubleArray())"
"Plot << new Points(x: moneyball.numberColumn(\"RD\").asDoubleArray(), y: moneyball.numberColumn(\"W\").asDoubleArray())"
]
},
{
Expand All @@ -284,7 +250,7 @@
"metadata": {},
"outputs": [],
"source": [
"winsModel = LeastSquares.train(wins, runDifference);"
"winsModel = new OLS(moneyball.copy().retainColumns(\"W\").asMatrix(), runDifference.asDoubleArray());"
]
},
{
Expand All @@ -293,10 +259,7 @@
"metadata": {},
"outputs": [],
"source": [
"def runDiff = new double[1];\n",
"runDiff[0] = 135;\n",
"def expectedWins = winsModel.predict(runDiff);\n",
"runsScored2 = LeastSquares.train(moneyball.nCol(\"RS\"), moneyball.nCol(\"OBP\"), moneyball.nCol(\"SLG\"));"
"runsScored = new OLS(moneyball.copy().retainColumns(\"OBP\", \"SLG\").asMatrix(), moneyball.nCol(\"RS\").asDoubleArray());"
]
},
{
Expand All @@ -307,7 +270,7 @@
"source": [
"new Histogram(xLabel:\"X\",\n",
" yLabel:\"Proportion\",\n",
" data: Arrays.asList(runsScored2.residuals()), \n",
" data: Arrays.asList(runsScored.residuals()),\n",
" binCount: 25);"
]
},
Expand All @@ -323,10 +286,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%classpath add mvn com.jimmoores quandl-tablesaw 2.0.0\n",
"%import com.jimmoores.quandl.*\n",
"%import com.jimmoores.quandl.tablesaw.*"
]
Expand All @@ -340,22 +304,22 @@
"TableSawQuandlSession session = TableSawQuandlSession.create();\n",
"Table table = session.getDataSet(DataSetRequest.Builder.of(\"WIKI/AAPL\").build());\n",
"// Create a new column containing the year\n",
"ShortColumn yearColumn = table.dateColumn(\"Date\").year();\n",
"NumberColumn yearColumn = table.dateColumn(\"Date\").year();\n",
"yearColumn.setName(\"Year\");\n",
"table.addColumn(yearColumn);\n",
"table.addColumns(yearColumn);\n",
"// Create max, min and total volume tables aggregated by year\n",
"Table summaryMax = table.groupBy(\"year\").max(\"Adj. Close\");\n",
"Table summaryMin = table.groupBy(\"year\").min(\"Adj. Close\");\n",
"Table summaryVolume = table.groupBy(\"year\")sum(\"Volume\");\n",
"Table summaryMax = table.summarize(\"Adj. Close\", max).by(\"year\");\n",
"Table summaryMin = table.summarize(\"Adj. Close\", min).by(\"year\");\n",
"Table summaryVolume = table.summarize(\"Volume\", sum).by(\"year\");\n",
"// Create a new table from each of these\n",
"summary = Table.create(\"Summary\", summaryMax.column(0), summaryMax.column(1), \n",
" summaryMin.column(1), summaryVolume.column(1));\n",
"// Add back a DateColumn to the summary...will be used for plotting\n",
"DateColumn yearDates = new DateColumn(\"YearDate\");\n",
"for(year in summary.column('Year')){\n",
" yearDates.append(java.time.LocalDate.of(year,1,1));\n",
"DateColumn yearDates = DateColumn.create(\"YearDate\");\n",
"for (year in summary.column('Year')) {\n",
" yearDates.append(java.time.LocalDate.of((int) year, 1, 1));\n",
"}\n",
"summary.addColumn(yearDates)\n",
"summary.addColumns(yearDates)\n",
"\n",
"summary"
]
Expand All @@ -378,7 +342,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
Expand All @@ -397,6 +363,19 @@
"name": "Groovy",
"nbconverter_exporter": "",
"version": "2.4.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": false,
"sideBar": false,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": false,
"toc_window_display": false
}
},
"nbformat": 4,
Expand Down

0 comments on commit f011176

Please sign in to comment.