-
Notifications
You must be signed in to change notification settings - Fork 158
/
note.json
1 lines (1 loc) · 13.2 KB
/
note.json
1
{"paragraphs":[{"title":"Download spending dataset into HDFS","text":"%md\n## Sample SparkSQL notebook\n#### Sample Spark notebook using public Australian Dataset to see where tax payer money is being invested\nby [Ned Shawa](https://twitter.com/nedshawa)\n","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"tableHide":false,"title":false,"editorHide":true,"enabled":true,"editorMode":"ace/mode/markdown"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432331641199_1886957287","id":"20150522-145401_1470736167","result":{"code":"SUCCESS","type":"HTML","msg":"<h2>Sample SparkSQL notebook</h2>\n<h4>Sample Spark notebook using public Australian Dataset to see where tax payer money is being invested</h4>\n<p>by <a href=\"https://twitter.com/nedshawa\">Ned Shawa</a></p>\n"},"dateCreated":"2015-05-22T02:54:01+0000","dateStarted":"2015-05-24T04:59:07+0000","dateFinished":"2015-05-24T04:59:07+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:649"},{"title":"Download data and import into HDFS","text":"%sh\n\n#remove existing copies of dataset from HDFS\nhadoop fs -rm /tmp/expenses.csv\n\n#fetch the dataset\nwget https://data.gov.au/dataset/f84b9baf-c1c1-437c-8c1e-654b2829848c/resource/88399d53-d55c-466c-8f4a-6cb965d24d6d/download/healthexpenditurebyareaandsource.csv -O /tmp/expenses.csv\n\n#remove header\nsed -i '1d' /tmp/expenses.csv\n#remove empty fields\nsed -i \"s/,,,,,//g\" /tmp/expenses.csv\nsed -i '/^\\s*$/d' /tmp/expenses.csv\n\n#put data into HDFS\nhadoop fs -put /tmp/expenses.csv /tmp\nhadoop fs -ls -h /tmp/expenses.csv\nrm /tmp/expenses.csv","dateUpdated":"2015-09-16T12:12:42+0000","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"title":true,"editorMode":"ace/mode/sh","enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432436102907_-1607474083","id":"20150524-025502_1843984349","result":{"code":"SUCCESS","type":"TEXT","msg":"rm: `/tmp/expenses.csv': No such file or directory\n--2015-09-16 00:12:44-- https://data.gov.au/dataset/f84b9baf-c1c1-437c-8c1e-654b2829848c/resource/88399d53-d55c-466c-8f4a-6cb965d24d6d/download/healthexpenditurebyareaandsource.csv\nResolving data.gov.au... 54.252.191.23\nConnecting to data.gov.au|54.252.191.23|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 469727 (459K) [text/csv]\nSaving to: “/tmp/expenses.csv”\n\n 0K .......... .......... .......... .......... .......... 10% 86.1K 5s\n 50K .......... .......... .......... .......... .......... 21% 256K 3s\n 100K .......... .......... .......... .......... .......... 32% 258K 2s\n 150K .......... .......... .......... .......... .......... 43% 261K 2s\n 200K .......... .......... .......... .......... .......... 54% 9.05M 1s\n 250K .......... .......... .......... .......... .......... 65% 264K 1s\n 300K .......... .......... .......... .......... .......... 76% 2.88M 0s\n 350K .......... .......... .......... .......... .......... 87% 281K 0s\n 400K .......... .......... .......... .......... .......... 98% 4.79M 0s\n 450K ........ 100% 1.89M=1.6s\n\n2015-09-16 00:12:47 (293 KB/s) - “/tmp/expenses.csv” saved [469727/469727]\n\n-rw-r--r-- 1 zeppelin hdfs 456.4 K 2015-09-16 00:12 /tmp/expenses.csv\n"},"dateCreated":"2015-05-24T02:55:02+0000","dateStarted":"2015-09-16T12:12:42+0000","dateFinished":"2015-09-16T12:12:51+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:650"},{"title":"Read csv into RDD and count","text":"\nval dataset=sc.textFile(\"/tmp/expenses.csv\")\ndataset.count()\ndataset.first()","dateUpdated":"2015-09-16T12:12:54+0000","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"title":true,"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432331706188_1973803269","id":"20150522-145506_1520223608","result":{"code":"SUCCESS","type":"TEXT","msg":"dataset: org.apache.spark.rdd.RDD[String] = /tmp/expenses.csv MapPartitionsRDD[1] at textFile at <console>:24\nres1: Long = 6778\nres2: String = 1997-98,NSW,Administration,Government,Australian Government,315\n"},"dateCreated":"2015-05-22T02:55:06+0000","dateStarted":"2015-09-16T12:12:54+0000","dateFinished":"2015-09-16T12:13:41+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:651"},{"title":"Register RDD as table","text":"\ncase class Health (year: String, state: String, category:String, funding_src1: String, funding_scr2: String, spending: Integer)\nval health = dataset.map(k=>k.split(\",\")).map(\n k => Health(k(0),k(1),k(2),k(3), k(4), k(5).toInt)\n )\n// toDF() works only in spark 1.3.0.\n// For spark 1.1.x and spark 1.2.x,\n// use below instead:\n// health.registerTempTable(\"health_table\")\nhealth.toDF().registerTempTable(\"health_table\")","dateUpdated":"2015-09-16T12:13:51+0000","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"title":true,"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432331754078_156265313","id":"20150522-145554_1659286336","result":{"code":"SUCCESS","type":"TEXT","msg":"defined class Health\nhealth: org.apache.spark.rdd.RDD[Health] = MapPartitionsRDD[3] at map at <console>:27\n"},"dateCreated":"2015-05-22T02:55:54+0000","dateStarted":"2015-09-16T12:13:51+0000","dateFinished":"2015-09-16T12:13:55+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:652"},{"title":"Spending (in billions) by state","text":"%sql\nselect state, sum(spending)/1000 SpendinginBillions \nfrom health_table \ngroup by state \norder by SpendinginBillions desc","dateUpdated":"2015-09-16T12:13:58+0000","config":{"colWidth":4,"graph":{"mode":"pieChart","height":300,"optionOpen":false,"keys":[{"name":"state","index":0,"aggr":"sum"}],"values":[{"name":"SpendinginBillions","index":1,"aggr":"sum"}],"groups":[],"scatter":{"xAxis":{"name":"state","index":0,"aggr":"sum"},"yAxis":{"name":"SpendinginBillions","index":1,"aggr":"sum"}}},"editorHide":false,"title":true,"tableHide":false,"editorMode":"ace/mode/sql","enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432331827930_795256615","id":"20150522-145707_2030296925","result":{"code":"SUCCESS","type":"TABLE","msg":"state\tSpendinginBillions\nNSW\t482.366\nVIC\t360.217\nQLD\t289.676\nWA\t149.887\nSA\t114.623\nTAS\t34.294\nACT\t28.26\nNT\t18.915\n","comment":"","msgTable":[[{"key":"SpendinginBillions","value":"NSW"},{"key":"SpendinginBillions","value":"482.366"}],[{"value":"VIC"},{"value":"360.217"}],[{"value":"QLD"},{"value":"289.676"}],[{"value":"WA"},{"value":"149.887"}],[{"value":"SA"},{"value":"114.623"}],[{"value":"TAS"},{"value":"34.294"}],[{"value":"ACT"},{"value":"28.26"}],[{"value":"NT"},{"value":"18.915"}]],"columnNames":[{"name":"state","index":0,"aggr":"sum"},{"name":"SpendinginBillions","index":1,"aggr":"sum"}],"rows":[["NSW","482.366"],["VIC","360.217"],["QLD","289.676"],["WA","149.887"],["SA","114.623"],["TAS","34.294"],["ACT","28.26"],["NT","18.915"]]},"dateCreated":"2015-05-22T02:57:07+0000","dateStarted":"2015-09-16T12:13:58+0000","dateFinished":"2015-09-16T12:14:05+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:653"},{"title":"Spending (In Billions) By Year","text":"%sql\nselect year,sum(spending)/1000 SpendinginBillions \nfrom health_table \ngroup by year \norder by SpendinginBillions\n","dateUpdated":"2015-09-16T12:14:07+0000","config":{"colWidth":4,"graph":{"mode":"multiBarChart","height":300,"optionOpen":false,"keys":[{"name":"year","index":0,"aggr":"sum"}],"values":[{"name":"SpendinginBillions","index":1,"aggr":"sum"}],"groups":[],"scatter":{"xAxis":{"name":"year","index":0,"aggr":"sum"},"yAxis":{"name":"SpendinginBillions","index":1,"aggr":"sum"}}},"editorHide":false,"title":true,"tableHide":false,"editorMode":"ace/mode/sql","enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432332006302_1582730344","id":"20150522-150006_2004946034","result":{"code":"SUCCESS","type":"TABLE","msg":"year\tSpendinginBillions\n1997-98\t65.705\n1998-99\t69.54\n1999-00\t73.084\n2000-01\t78.687\n2001-02\t82.895\n2002-03\t87.702\n2003-04\t90.617\n2004-05\t96.503\n2005-06\t99.076\n2006-07\t104.979\n2007-08\t112.106\n2008-09\t119.757\n2009-10\t124.767\n2010-11\t132.572\n2011-12\t140.248\n","comment":"","msgTable":[[{"key":"SpendinginBillions","value":"1997-98"},{"key":"SpendinginBillions","value":"65.705"}],[{"value":"1998-99"},{"value":"69.54"}],[{"value":"1999-00"},{"value":"73.084"}],[{"value":"2000-01"},{"value":"78.687"}],[{"value":"2001-02"},{"value":"82.895"}],[{"value":"2002-03"},{"value":"87.702"}],[{"value":"2003-04"},{"value":"90.617"}],[{"value":"2004-05"},{"value":"96.503"}],[{"value":"2005-06"},{"value":"99.076"}],[{"value":"2006-07"},{"value":"104.979"}],[{"value":"2007-08"},{"value":"112.106"}],[{"value":"2008-09"},{"value":"119.757"}],[{"value":"2009-10"},{"value":"124.767"}],[{"value":"2010-11"},{"value":"132.572"}],[{"value":"2011-12"},{"value":"140.248"}]],"columnNames":[{"name":"year","index":0,"aggr":"sum"},{"name":"SpendinginBillions","index":1,"aggr":"sum"}],"rows":[["1997-98","65.705"],["1998-99","69.54"],["1999-00","73.084"],["2000-01","78.687"],["2001-02","82.895"],["2002-03","87.702"],["2003-04","90.617"],["2004-05","96.503"],["2005-06","99.076"],["2006-07","104.979"],["2007-08","112.106"],["2008-09","119.757"],["2009-10","124.767"],["2010-11","132.572"],["2011-12","140.248"]]},"dateCreated":"2015-05-22T03:00:06+0000","dateStarted":"2015-09-16T12:14:07+0000","dateFinished":"2015-09-16T12:14:10+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:654"},{"title":"Spending (in billions) by area","text":"%sql\nselect category, sum(spending)/1000 SpendinginBillions \nfrom health_table \ngroup by category \norder by SpendinginBillions desc","dateUpdated":"2015-09-16T12:14:11+0000","config":{"colWidth":4,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[{"name":"category","index":0,"aggr":"sum"}],"values":[{"name":"SpendinginBillions","index":1,"aggr":"sum"}],"groups":[],"scatter":{"xAxis":{"name":"category","index":0,"aggr":"sum"},"yAxis":{"name":"SpendinginBillions","index":1,"aggr":"sum"}}},"editorHide":false,"title":true,"tableHide":false,"editorMode":"ace/mode/sql","enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432332063882_-271146387","id":"20150522-150103_460493658","result":{"code":"SUCCESS","type":"TABLE","msg":"category\tSpendinginBillions\nPublic hospitals\t445.845\nMedical services\t272.507\nPrivate hospitals\t121.022\nBenefit-paid pharmaceuticals\t104.221\nDental services\t90.786\nCommunity health\t75.765\nCapital expenditure\t72.698\nAll other medications\t70.508\nOther health practitioners\t51.382\nAdministration\t41.029\nResearch\t40.074\nAids and appliances\t37.155\nPatient transport services\t28.174\nPublic health\t27.072\nMedical expense tax rebate\t0.0\n","comment":"","msgTable":[[{"key":"SpendinginBillions","value":"Public hospitals"},{"key":"SpendinginBillions","value":"445.845"}],[{"value":"Medical services"},{"value":"272.507"}],[{"value":"Private hospitals"},{"value":"121.022"}],[{"value":"Benefit-paid pharmaceuticals"},{"value":"104.221"}],[{"value":"Dental services"},{"value":"90.786"}],[{"value":"Community health"},{"value":"75.765"}],[{"value":"Capital expenditure"},{"value":"72.698"}],[{"value":"All other medications"},{"value":"70.508"}],[{"value":"Other health practitioners"},{"value":"51.382"}],[{"value":"Administration"},{"value":"41.029"}],[{"value":"Research"},{"value":"40.074"}],[{"value":"Aids and appliances"},{"value":"37.155"}],[{"value":"Patient transport services"},{"value":"28.174"}],[{"value":"Public health"},{"value":"27.072"}],[{"value":"Medical expense tax rebate"},{"value":"0.0"}]],"columnNames":[{"name":"category","index":0,"aggr":"sum"},{"name":"SpendinginBillions","index":1,"aggr":"sum"}],"rows":[["Public hospitals","445.845"],["Medical services","272.507"],["Private hospitals","121.022"],["Benefit-paid pharmaceuticals","104.221"],["Dental services","90.786"],["Community health","75.765"],["Capital expenditure","72.698"],["All other medications","70.508"],["Other health practitioners","51.382"],["Administration","41.029"],["Research","40.074"],["Aids and appliances","37.155"],["Patient transport services","28.174"],["Public health","27.072"],["Medical expense tax rebate","0.0"]]},"dateCreated":"2015-05-22T03:01:03+0000","dateStarted":"2015-09-16T12:14:11+0000","dateFinished":"2015-09-16T12:14:14+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:655"},{"config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1432332860788_-828901477","id":"20150522-151420_2114334426","dateCreated":"2015-05-22T03:14:20+0000","status":"READY","progressUpdateIntervalMs":500,"$$hashKey":"object:656"}],"name":"Demos / Spark / Australian Dataset","id":"2ANTDG878","angularObjects":{},"config":{"looknfeel":"default"},"info":{}}