diff --git a/build.gradle b/build.gradle index df5ddcc1..303d99ca 100644 --- a/build.gradle +++ b/build.gradle @@ -23,6 +23,8 @@ dependencies { compileOnly 'org.jetbrains.kotlin:kotlin-script-runtime:1.6.20' api "org.apache.commons:commons-csv:1.6" // cant upgrade to 1.8 because of https://issues.apache.org/jira/browse/CSV-257 + api 'org.apache.arrow:arrow-vector:8.0.0' + implementation 'org.apache.arrow:arrow-memory-netty:8.0.0' api 'org.apache.poi:poi-ooxml:5.2.2' api 'com.beust:klaxon:5.6'// compile 'me.tongfei:progressbar:0.5.5' @@ -98,7 +100,7 @@ test { //http://stackoverflow.com/questions/34377367/why-is-gradle-install-replacing-my-version-with-unspecified group 'com.github.holgerbrandl' //version '0.16.95' -version '0.17.4-SNAPSHOT' +version '0.17.4' diff --git a/src/main/kotlin/krangl/ArrowIO.kt b/src/main/kotlin/krangl/ArrowIO.kt new file mode 100644 index 00000000..1318122f --- /dev/null +++ b/src/main/kotlin/krangl/ArrowIO.kt @@ -0,0 +1,318 @@ +package krangl + +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.BaseFixedWidthVector +import org.apache.arrow.vector.BigIntVector +import org.apache.arrow.vector.BitVector +import org.apache.arrow.vector.Float4Vector +import org.apache.arrow.vector.Float8Vector +import org.apache.arrow.vector.IntVector +import org.apache.arrow.vector.SmallIntVector +import org.apache.arrow.vector.TinyIntVector +import org.apache.arrow.vector.VarCharVector +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowFileReader +import org.apache.arrow.vector.ipc.ArrowFileWriter +import org.apache.arrow.vector.types.FloatingPointPrecision +import org.apache.arrow.vector.types.pojo.ArrowType +import org.apache.arrow.vector.types.pojo.Schema +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel +import org.apache.arrow.vector.util.Text +import java.io.ByteArrayOutputStream +import java.io.File +import 
java.nio.channels.* +import java.nio.file.StandardOpenOption +import java.util.* + +internal fun unwrapStringArrayFromArrow(vector: VarCharVector): ArrayList { + val result = ArrayList() + for (i in 0 until vector.valueCount) { + result.add(vector.getObject(i)?.toString()) + } + return result +} + +internal inline fun unwrapNumericVectorFromArrow(vector: BaseFixedWidthVector, elementClass: Class): List { + val elements = vector.valueCount + val outVector = ArrayList(elements) + for (i in 0 until elements) { + outVector.add(vector.getObject(i) as ELEMENT_TYPE?) + } + return outVector +} + +internal fun unwrapBooleanArrayFromArrow(vector: BitVector): ArrayList { + val result = ArrayList() + for (i in 0 until vector.valueCount) { + result.add(vector.getObject(i)) + } + return result +} + +fun DataFrame.Companion.arrowReader() = ArrowReader() + +class ArrowReader() { + /** + * Internal low-level function. + * Use this function if you are working with [VectorSchemaRoot]s directly in your project. 
+ */ + fun fromVectorSchemaRoot(vectorSchemaRoot: VectorSchemaRoot): DataFrame { + val kranglVectors = vectorSchemaRoot.fieldVectors.map { fieldVector -> + when (fieldVector.field.type) { + is ArrowType.FixedSizeList, is ArrowType.List -> { + throw Exception("Matrices are not supported yet") + } + is ArrowType.Utf8 -> { + StringCol(fieldVector.name, unwrapStringArrayFromArrow(fieldVector as VarCharVector)) + } + is ArrowType.Int -> { + val bitWidth = (fieldVector.field.type as ArrowType.Int).bitWidth + when (bitWidth) { + 8 -> IntCol(fieldVector.name, unwrapNumericVectorFromArrow(fieldVector as TinyIntVector, Int::class.java)) + 16 -> IntCol(fieldVector.name, unwrapNumericVectorFromArrow(fieldVector as SmallIntVector, Int::class.java)) + 32 -> IntCol(fieldVector.name, unwrapNumericVectorFromArrow(fieldVector as IntVector, Int::class.java)) + 64 -> LongCol(fieldVector.name, unwrapNumericVectorFromArrow(fieldVector as BigIntVector, Long::class.java)) + else -> throw java.lang.Exception("Incorrect Int.bitWidth ($bitWidth, should never happen)") + } + } + is ArrowType.FloatingPoint -> { + val precision = (fieldVector.field.type as ArrowType.FloatingPoint).precision + when (precision) { + FloatingPointPrecision.HALF -> throw java.lang.Exception("HALF float not supported") + FloatingPointPrecision.SINGLE -> DoubleCol(fieldVector.name, unwrapNumericVectorFromArrow(fieldVector as Float4Vector, Double::class.java)) + FloatingPointPrecision.DOUBLE -> DoubleCol(fieldVector.name, unwrapNumericVectorFromArrow(fieldVector as Float8Vector, Double::class.java)) + else -> throw java.lang.Exception("Incorrect FloatingPoint.precision ($precision, should never happen)") + } + } + is ArrowType.Bool -> { + BooleanCol(fieldVector.name, unwrapBooleanArrayFromArrow(fieldVector as BitVector)) + } + else -> { + throw Exception("${fieldVector.field.type.typeID.name} is not supported yet") + } + } + } + + return dataFrameOf(*(kranglVectors as List).toTypedArray()) + } + + /** + * Read 
[VectorSchemaRoot] from existing [channel] and convert it to [DataFrame]. + * Use this function if you want to manage channels yourself, make in-memory IPC sharing and so on. + * If [allocator] is null, it will be created and closed inside. + */ + fun readFromChannel(channel: SeekableByteChannel, allocator: BufferAllocator?): DataFrame { + fun readFromChannelAllocating(channel: SeekableByteChannel, allocator: BufferAllocator?): DataFrame { + ArrowFileReader(channel, allocator).use { reader -> + reader.loadNextBatch() + return fromVectorSchemaRoot(reader.vectorSchemaRoot) + } + } + if (allocator == null ) { + RootAllocator().use { newAllocator -> + return readFromChannelAllocating(channel, newAllocator) + } + } else { + return readFromChannelAllocating(channel, allocator) + } + } + + /** + * Read [VectorSchemaRoot] from ByteArray and convert it to [DataFrame]. + */ + fun fromByteArray(byteArray: ByteArray): DataFrame { + return readFromChannel(ByteArrayReadableSeekableByteChannel(byteArray), null) + } + + /** + * Read [VectorSchemaRoot] from [file] by and convert it to [DataFrame]. + */ + fun fromFile(file: File): DataFrame { + if (!file.exists()) { + throw Exception("${file.path} does not exist") + } + if (file.isDirectory) { + throw Exception("${file.path} is directory") + } + FileChannel.open( + file.toPath(), + StandardOpenOption.READ + ).use { channel -> + return readFromChannel(channel, null) + } + } + + /** + * Read [VectorSchemaRoot] from file by [path] and convert it to [DataFrame]. 
+ */ + fun fromFile(path: String): DataFrame { + return fromFile(File(path)) + } +} + +fun DataFrame.arrowWriter() = ArrowWriter(this) + +class ArrowWriter(val dataFrame: DataFrame) { + internal fun fromStringCol(column: StringCol, allocator: BufferAllocator): VarCharVector { + val fieldVector = VarCharVector(column.name, allocator) + fieldVector.allocateNew(column.length) + column.values.forEachIndexed { index, value -> + if (value == null) { + fieldVector.setNull(index) + } else { + fieldVector.setSafe(index, Text(value)) + } + } + fieldVector.valueCount = column.length + return fieldVector + } + + internal fun fromBooleanCol(column: BooleanCol, allocator: BufferAllocator): BitVector { + val fieldVector = BitVector(column.name, allocator) + fieldVector.allocateNew(column.length) + column.values.forEachIndexed { index, value -> + if (value == null) { + fieldVector.setNull(index) + } else { + fieldVector.setSafe(index, if (value) 1 else 0) + } + } + fieldVector.valueCount = column.length + return fieldVector + } + + internal fun fromIntCol(column: IntCol, allocator: BufferAllocator): IntVector { + val fieldVector = IntVector(column.name, allocator) + fieldVector.allocateNew(column.length) + column.values.forEachIndexed { index, value -> + if (value == null) { + fieldVector.setNull(index) + } else { + fieldVector.setSafe(index, value) + } + } + fieldVector.valueCount = column.length + return fieldVector + } + + internal fun fromLongCol(column: LongCol, allocator: BufferAllocator): BigIntVector { + val fieldVector = BigIntVector(column.name, allocator) + fieldVector.allocateNew(column.length) + column.values.forEachIndexed { index, value -> + if (value == null) { + fieldVector.setNull(index) + } else { + fieldVector.setSafe(index, value) + } + } + fieldVector.valueCount = column.length + return fieldVector + } + + internal fun fromDoubleCol(column: DoubleCol, allocator: BufferAllocator): Float8Vector { + val fieldVector = Float8Vector(column.name, allocator) + 
fieldVector.allocateNew(column.length) + column.values.forEachIndexed { index, value -> + if (value == null) { + fieldVector.setNull(index) + } else { + fieldVector.setSafe(index, value) + } + } + fieldVector.valueCount = column.length + return fieldVector + } + + internal fun fromAnyCol(column: AnyCol, allocator: BufferAllocator): VarCharVector { + val fieldVector = VarCharVector(column.name, allocator) + fieldVector.allocateNew(column.length) + column.values.forEachIndexed { index, value -> + if (value == null) { + fieldVector.setNull(index) + } else { + fieldVector.setSafe(index, Text(value.toString())) + } + } + fieldVector.valueCount = column.length + return fieldVector + } + + /** + * Internal low-level function. + * Use this function if you are working with [VectorSchemaRoot]s and [BufferAllocator]s directly in your project. + */ + fun allocateVectorSchemaRoot(allocator: BufferAllocator): VectorSchemaRoot { + val arrowVectors = dataFrame.cols.map { column -> + when (column) { + is StringCol -> fromStringCol(column, allocator) + is BooleanCol -> fromBooleanCol(column, allocator) + is IntCol -> fromIntCol(column, allocator) + is LongCol -> fromLongCol(column, allocator) + is DoubleCol -> fromDoubleCol(column, allocator) + is AnyCol -> fromAnyCol(column, allocator) + else -> { + throw Exception("Unknown column type ${column.javaClass.canonicalName}") + } + } + } + return VectorSchemaRoot(arrowVectors) + } + + /** + * Export [dataFrame] to [VectorSchemaRoot] and write it to any existing [channel]. + * Use this function if you want to manage channels yourself, make in-memory IPC sharing and so on + */ + fun writeToChannel(channel: WritableByteChannel) { + RootAllocator().use { allocator -> + this.allocateVectorSchemaRoot(allocator).use { vectorSchemaRoot -> + ArrowFileWriter(vectorSchemaRoot, null, channel).use { writer -> + writer.writeBatch(); + } + } + } + } + + /** + * Export [dataFrame] to [VectorSchemaRoot] and write it to new ByteArray. 
+ */ + fun toByteArray(): ByteArray { + ByteArrayOutputStream().use { byteArrayStream -> + Channels.newChannel(byteArrayStream).use { channel -> + writeToChannel(channel) + return byteArrayStream.toByteArray() + } + } + } + + /** + * Export [dataFrame] to [VectorSchemaRoot] and write it to new or existing [file]. + * Temporary file is created if [file] argument is null. + */ + fun toFile(file: File?): File { + val saveToFile = file ?: File.createTempFile("DataFrame", ".arrow") + + FileChannel.open( + saveToFile.toPath(), + StandardOpenOption.WRITE, + StandardOpenOption.CREATE + ).use { channel -> + channel.truncate(0) + writeToChannel(channel) + } + return saveToFile + } + + /** + * Export [dataFrame] to [VectorSchemaRoot] and write it to new or existing file by [path]. + * Temporary file is created if [path] argument is null. + */ + fun toFile(path: String?): File { + val saveToFile = if (path != null) { + File(path) + } else { + File.createTempFile("DataFrame", ".arrow") + } + return toFile(saveToFile) + } +} diff --git a/src/main/kotlin/krangl/ExcelIO.kt b/src/main/kotlin/krangl/ExcelIO.kt index c2c589ae..192d4dc2 100644 --- a/src/main/kotlin/krangl/ExcelIO.kt +++ b/src/main/kotlin/krangl/ExcelIO.kt @@ -27,7 +27,7 @@ fun DataFrame.Companion.readExcel( cellRange: CellRangeAddress? = null, colTypes: ColumnTypeSpec = GuessSpec(), trim: Boolean = false, - guessMax: Int = 100, + guessMax: Int = GUESS_MAX, na: String = MISSING_VALUE, stopAtBlankLine: Boolean = true, includeBlankLines: Boolean = false, @@ -55,7 +55,7 @@ fun DataFrame.Companion.readExcel( cellRange: CellRangeAddress? 
= null, colTypes: ColumnTypeSpec = GuessSpec(), trim_ws: Boolean = false, - guessMax: Int = 100, + guessMax: Int = GUESS_MAX, na: String = MISSING_VALUE, stopAtBlankLine: Boolean = true, includeBlankLines: Boolean = false, @@ -84,23 +84,17 @@ private fun readExcelSheet( includeBlankLines: Boolean ): DataFrame { var df = emptyDataFrame() - val rowIterator = xlSheet.rowIterator() + val cellRange = range ?: getDefaultCellAddress(xlSheet) - if (!rowIterator.hasNext()) - return df + val rowsFromTo = cellRange.firstRow to cellRange.lastRow - // Skip lines until starting row number - var currentRow = rowIterator.next() - while (currentRow.rowNum < cellRange.firstRow - 1) { - if (!rowIterator.hasNext()) - return df - else { - currentRow = rowIterator.next() - } + val headerRow = xlSheet.getRow(rowsFromTo.first) + if (headerRow == null) { + return df } - val cellIterator = currentRow.iterator() + val cellIterator = headerRow.iterator() // Get column names val columnResults = getExcelColumnNames(cellIterator, df, cellRange) @@ -108,19 +102,15 @@ private fun readExcelSheet( cellRange.lastColumn = columnResults.second // Stops at first empty column header //Get rows - while (rowIterator.hasNext() && currentRow.rowNum < cellRange.lastRow) { - currentRow = rowIterator.next() - val values = readExcelRow(currentRow, cellRange, trim, na) + for (rowNumber in rowsFromTo.first + 1 .. 
rowsFromTo.second) { + val currentRow = xlSheet.getRow(rowNumber) + val values = currentRow?.let { readExcelRow(currentRow, cellRange, trim, na) } ?: arrayOfNulls(df.ncol).asList() //Prevent Excel reading blank lines (whose contents have been cleared but the lines weren't deleted) - if (values.filterNotNull().isNotEmpty()) + if (values.filterNotNull().isNotEmpty() || includeBlankLines) { df = df.addRow(values) - else - if (stopAtBlankLine) - break //Stops reading on first blank line - else - if (includeBlankLines) - df = df.addRow(values) + } else + if (stopAtBlankLine) break //Stops reading on first blank line } return assignColumnTypes(df, colTypes, guessMax) } @@ -161,12 +151,11 @@ private fun readExcelRow( if(floor(numValue) == numValue && !isInfinite(numValue)) numValue.toLong() else numValue } CellType.STRING -> currentCell.stringCellValue - CellType.BLANK -> null + CellType.BLANK -> "" CellType.BOOLEAN -> currentCell.booleanCellValue CellType._NONE, CellType.ERROR, CellType.FORMULA -> dataFormatter.formatCellValue(currentCell) } } -// var currentValue = currentCell?.let { dataFormatter.formatCellValue(currentCell) } if (currentValue is String) { if (trim) { @@ -177,7 +166,6 @@ private fun readExcelRow( currentValue = null } - currentValue = (currentValue as String?)?.ifBlank { null } } rowValues.add(currentValue) @@ -210,7 +198,7 @@ private fun getExcelColumnNames( return Pair(df1, lastColumn) } -private fun assignColumnTypes(df: DataFrame, colTypes: ColumnTypeSpec, guessMax: Int = 100): DataFrame { +private fun assignColumnTypes(df: DataFrame, colTypes: ColumnTypeSpec, guessMax: Int = GUESS_MAX): DataFrame { val colList = mutableListOf() @@ -283,27 +271,28 @@ private fun DataFrame.createExcelDataRows(sheet: Sheet, headers: Boolean) { val nRow = sheet.createRow(rowIdx++) for ((columnIndex, cellValue) in dfRow.values.toMutableList().withIndex()) { + if (cellValue == null) { + continue + } val cell = nRow.createCell(columnIndex) when (cols[columnIndex]) { is 
BooleanCol -> { - cell.cellType = CellType.BOOLEAN - cellValue?.let { cell.setCellValue(it as Boolean) } + cell.setCellValue(cellValue as Boolean) } is DoubleCol -> { - cell.cellType = CellType.NUMERIC - cellValue?.let { cell.setCellValue(it as Double) } + cell.setCellValue(cellValue as Double) } is IntCol -> { - cell.cellType = CellType.NUMERIC - cellValue?.let { cell.setCellValue((it as Int).toDouble()) } + cell.setCellValue((cellValue as Int).toDouble()) + } + is LongCol -> { + cell.setCellValue((cellValue as Long).toDouble()) } -// is StringCol -> cell.cellType= CellType.STRING else -> { - cellValue?.let { cell.setCellValue(cellValue.toString()) } + cell.setCellValue(cellValue.toString()) } } -// cell.setCellValue(cell.toString()) } } } @@ -324,4 +313,4 @@ private fun DataFrame.createExcelHeaderRow( fun main() { DataFrame.readExcel("src/test/resources/krangl/data/ExcelReadExample.xlsx") -} \ No newline at end of file +} diff --git a/src/main/kotlin/krangl/JsonIO.kt b/src/main/kotlin/krangl/JsonIO.kt index e53ddc10..470fb606 100644 --- a/src/main/kotlin/krangl/JsonIO.kt +++ b/src/main/kotlin/krangl/JsonIO.kt @@ -25,16 +25,14 @@ fun DataFrame.Companion.fromJson(fileOrUrl: String): DataFrame { return fromJson(url) } -const val ARRAY_ROWS_TYPE_DETECTING = 5 - @Suppress("UNCHECKED_CAST") -fun DataFrame.Companion.fromJson(url: URL, typeDetectingRows: Int? = ARRAY_ROWS_TYPE_DETECTING): DataFrame = - fromJsonArray(Parser.default().parse(url.openStream()) as JsonArray, typeDetectingRows) +fun DataFrame.Companion.fromJson(url: URL, guessMax: Int? = GUESS_MAX): DataFrame = + fromJsonArray(Parser.default().parse(url.openStream()) as JsonArray, guessMax) const val ARRAY_COL_ID = "_id" @Suppress("UNCHECKED_CAST") -fun DataFrame.Companion.fromJsonString(jsonData: String, typeDetectingRows: Int? = ARRAY_ROWS_TYPE_DETECTING): DataFrame { +fun DataFrame.Companion.fromJsonString(jsonData: String, guessMax: Int? 
= GUESS_MAX): DataFrame { val parsed = Parser.default().parse(StringReader(jsonData)) // var deparseJson = deparseJson(parsed) @@ -49,7 +47,7 @@ fun DataFrame.Companion.fromJsonString(jsonData: String, typeDetectingRows: Int? val jsonColDFs = jsonCol.values().map { colData -> when (colData) { - is JsonArray<*> -> fromJsonArray(colData as JsonArray, typeDetectingRows) + is JsonArray<*> -> fromJsonArray(colData as JsonArray, guessMax) is JsonObject -> when { colData.values.first() is JsonArray<*> -> { dataFrameOf( @@ -84,25 +82,14 @@ fun DataFrame.Companion.fromJsonString(jsonData: String, typeDetectingRows: Int? return df } -//Can this be removed? -private fun deparseJson(parsed: Any?, typeDetectingRows: Int? = ARRAY_ROWS_TYPE_DETECTING): DataFrame { - @Suppress("UNCHECKED_CAST") - return when (parsed) { - is JsonArray<*> -> fromJsonArray(parsed as JsonArray, typeDetectingRows) - is JsonObject -> dataFrameOf(parsed.keys)(parsed.values) - else -> throw IllegalArgumentException("Can not parse json. 
" + INTERNAL_ERROR_MSG) - } -} - - -internal fun fromJsonArray(records: JsonArray, typeDetectingRows: Int?): DataFrame { +internal fun fromJsonArray(records: JsonArray, guessMax: Int?): DataFrame { val colNames = records .map { it.keys.toList() } .reduceRight { acc, right -> acc + right.minus(acc) } val cols = colNames.map { colName -> - val firstRows = if (typeDetectingRows is Int) { - records.take(typeDetectingRows) + val firstRows = if (guessMax is Int) { + records.take(guessMax) } else { records } @@ -174,5 +161,5 @@ fun DataFrame.toJsonString(prettyPrint: Boolean = false, asObject: Boolean = fal } fun main(args: Array) { - DataFrame.fromJson("https://raw.githubusercontent.com/vega/vega/master/test/data/movies.json") + DataFrame.fromJson("https://raw.githubusercontent.com/vega/vega/main/docs/data/movies.json") } diff --git a/src/main/kotlin/krangl/TableIO.kt b/src/main/kotlin/krangl/TableIO.kt index c0fe8166..cc5f8605 100644 --- a/src/main/kotlin/krangl/TableIO.kt +++ b/src/main/kotlin/krangl/TableIO.kt @@ -308,6 +308,8 @@ fun DataFrame.Companion.readFixedWidth( val MISSING_VALUE = "NA" +const val GUESS_MAX = 100 + // NA aware conversions internal fun String.naAsNull(): String? = if (this == MISSING_VALUE) null else this @@ -324,7 +326,7 @@ internal fun String?.nullAsNA(): String = this ?: MISSING_VALUE // } internal fun String?.cellValueAsBoolean(): Boolean? { - if (this == null) return null + if (this == null || this == "") return null var cellValue: String? = toUpperCase() @@ -377,7 +379,7 @@ internal fun dataColFactory( // TODO add missing value support with user defined string (e.g. 
NA here) here -internal fun dataColFactory(colName: String, colType: ColType, records: Array<*>, guessMax: Int = 100): DataCol = +internal fun dataColFactory(colName: String, colType: ColType, records: Array<*>, guessMax: Int = GUESS_MAX): DataCol = when (colType) { // see https://github.com/holgerbrandl/krangl/issues/10 ColType.Int -> try { @@ -445,7 +447,7 @@ internal fun peekCol(colIndex: Int, records: List, peekSize: Int = 10 internal fun peekCol(records: Array<*>, peekSize: Int = 100) = records .asSequence() - .map { it.toString() } + .map { it?.toString() } .filterNotNull() .take(peekSize) .toList() @@ -623,4 +625,4 @@ val flightsData by lazy { // consider to use progress bar here } -// todo support Read and write data using Tablesaw’s “.saw” format --> use dedicated artifact to minimize dependcies \ No newline at end of file +// todo support Read and write data using Tablesaw’s “.saw” format --> use dedicated artifact to minimize dependcies diff --git a/src/test/kotlin/krangl/test/ArrowTests.kt b/src/test/kotlin/krangl/test/ArrowTests.kt new file mode 100644 index 00000000..0df1dda4 --- /dev/null +++ b/src/test/kotlin/krangl/test/ArrowTests.kt @@ -0,0 +1,41 @@ +package krangl.test + +import io.kotest.matchers.shouldBe +import krangl.DataFrame +import krangl.arrowReader +import krangl.arrowWriter +import krangl.fromJsonString +import org.junit.Test + +class ArrowTests { + @Test + fun savingToArrow() { + val df1 = DataFrame.fromJsonString( + """ + { + "cars": { + "Nissan": [ + {"model":"Sentra", "doors":4, "weight":1, }, + {"model":"Maxima", "doors":4, "weight":1.3}, + {"model":"Leaf", "doors":4, "electrical":true}, + {"model":"Skyline", "doors":2, "electrical":false} + ], + "Ford": [ + {"model":"Taurus", "doors":4, "weight":2, "electrical":false}, + {"model":"Escort", "doors":4, "seats":5, "weight":1} + ], + "Tesla": [ + {"electrical":true} + ] + } + } + """ + ) + val data = df1.arrowWriter().toByteArray() + val df2 = 
DataFrame.arrowReader().fromByteArray(data) + + df2.shouldBe(df1) + //Save to file for test reading from another language (Python or R) + //df1.arrowWriter().toFile("test.arrow") + } +} diff --git a/src/test/kotlin/krangl/test/ExcelTests.kt b/src/test/kotlin/krangl/test/ExcelTests.kt index b0a8eb81..d2be0a61 100644 --- a/src/test/kotlin/krangl/test/ExcelTests.kt +++ b/src/test/kotlin/krangl/test/ExcelTests.kt @@ -2,19 +2,27 @@ package krangl.test import io.kotest.matchers.shouldBe import krangl.* +import org.apache.poi.ss.usermodel.CellType import org.apache.poi.ss.util.CellRangeAddress import org.apache.poi.util.LocaleUtil +import org.apache.poi.xssf.streaming.SXSSFWorkbook +import org.apache.poi.xssf.usermodel.XSSFWorkbook import org.junit.Test +import java.io.FileInputStream import java.util.* class ExcelTests { + private val testReadingPath = "src/test/resources/krangl/data/ExcelReadExample.xlsx" + private val testWritingPath = "src/test/resources/krangl/data/ExcelWriteResult.xlsx" + + @Test fun `readExcel - should read excel file`() { val df = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet" ) @@ -28,12 +36,12 @@ class ExcelTests { @Test fun `readExcel - sheet by name should match sheet by index`() { val nameDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet" ) val indexDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, 0 ) @@ -43,7 +51,7 @@ class ExcelTests { @Test fun `readExcel - out of range test`() { val headerHigherThanContentDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet", CellRangeAddress.valueOf("A105:A110") ) @@ -53,19 +61,19 @@ class ExcelTests { @Test fun `readExcel - range test`() { val df = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet" ) // 
Test sheet by index + cell range val cellRangeTestDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, sheet = 1, cellRange = CellRangeAddress.valueOf("E5:J105"), trim = true ) // Test defaulted cellRange's correctness on sheet with empty rows/cols val defaultCellRangeTestDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, sheet = 1, trim = true ) @@ -76,11 +84,11 @@ class ExcelTests { @Test fun `readExcel - trim_ws should trim white space`() { val df = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet" ) val trimmedDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, sheet = 1, trim = true ) @@ -90,7 +98,7 @@ class ExcelTests { @Test fun `readExcel - colTypes should work`() { val df = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet", colTypes = NamedColumnSpec("Activities" to ColType.Int) ) @@ -101,7 +109,7 @@ class ExcelTests { @Test fun `readExcel - should stop at first blank line`() { val shouldStopAtBlankDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, sheet = 2, trim = true, cellRange = CellRangeAddress.valueOf("E3:J10") ) @@ -111,7 +119,7 @@ class ExcelTests { @Test fun `readExcel - should continue past blank line`() { val shouldContinueAtBlankDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, sheet = 2, trim = true, cellRange = CellRangeAddress.valueOf("E3:J10"), stopAtBlankLine = false ) @@ -121,7 +129,7 @@ class ExcelTests { @Test fun `readExcel - should include blank lines`() { val shouldContinueAtBlankDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, sheet = 2, trim = true, cellRange = CellRangeAddress.valueOf("E3:J10"), @@ 
-133,39 +141,39 @@ class ExcelTests { } @Test - fun `readExcel - should read bigint value`() { + fun `readExcel - should read bigint value`() { val df = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet" ) - df["Activities"][1] shouldBe "432178937489174" + df["Activities"][1] shouldBe 432178937489174 } @Test fun `writeExcel - should write to excel`() { val df = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelReadExample.xlsx", + testReadingPath, "FirstSheet" ) df.writeExcel( - "src/test/resources/krangl/data/ExcelWriteResult.xlsx", + testWritingPath, "FirstSheet", headers = true, eraseFile = true, boldHeaders = false ) df.writeExcel( - "src/test/resources/krangl/data/ExcelWriteResult.xlsx", + testWritingPath, "SecondSheet", headers = true, eraseFile = false, boldHeaders = true ) df.writeExcel( - "src/test/resources/krangl/data/ExcelWriteResult.xlsx", + testWritingPath, "ThirdSheet", headers = false, eraseFile = false, @@ -173,11 +181,18 @@ class ExcelTests { ) val writtenDF = DataFrame.readExcel( - "src/test/resources/krangl/data/ExcelWriteResult.xlsx", + testWritingPath, "FirstSheet" ) writtenDF shouldBe df + + val writtenBook = XSSFWorkbook(FileInputStream(testWritingPath)) + val longValueCell = writtenBook.getSheet("FirstSheet").getRow(2).getCell(4) + longValueCell.cellType.shouldBe(CellType.NUMERIC) + longValueCell.numericCellValue.shouldBe(432178937489174.0) + val emptyValueCell = writtenBook.getSheet("FirstSheet").getRow(6).getCell(4) + emptyValueCell shouldBe null } @Test @@ -192,7 +207,7 @@ class ExcelTests { df.print(maxWidth = 1000) df.schema() - df[1][4] shouldBe null + df[1][4] shouldBe "" df[3][1] shouldBe null df[5][3] shouldBe null @@ -219,4 +234,25 @@ class ExcelTests { LocaleUtil.setUserLocale(defaultLocale) } -} \ No newline at end of file + + @Test + fun `it should distinguish empty strings and nulls`() { + val df1 = dataFrameOf(listOf( + mapOf("col" to "NotEmptyString1", 
"col2" to 1), + mapOf("col" to "", "col2" to 2), + mapOf("col" to null, "col2" to 3), + mapOf("col" to "NotEmptyString2", "col2" to 4), + )) + df1.writeExcel(testWritingPath, "test", eraseFile = true) + DataFrame.readExcel(testWritingPath, "test") shouldBe df1 + + val df2 = dataFrameOf(listOf( + mapOf("col" to "NotEmptyString1"), + mapOf("col" to ""), + mapOf("col" to null), + mapOf("col" to "NotEmptyString2"), + )) + df2.writeExcel("testNull.xlsx", "test", eraseFile = true) + DataFrame.readExcel("testNull.xlsx", "test", stopAtBlankLine = false, includeBlankLines = true) shouldBe df2 + } +} diff --git a/src/test/resources/krangl/data/ExcelReadExample.xlsx b/src/test/resources/krangl/data/ExcelReadExample.xlsx index dfd139f1..e1455791 100644 Binary files a/src/test/resources/krangl/data/ExcelReadExample.xlsx and b/src/test/resources/krangl/data/ExcelReadExample.xlsx differ