diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala
index c1365d1b5ae1c..269a8b80c2b77 100644
--- a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala
+++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala
@@ -197,19 +197,21 @@ private[sql] object AvroUtils extends Logging {
 
     def hasNextRow: Boolean = {
       while (!completed && currentRow.isEmpty) {
-        if (fileReader.pastSync(stopPosition)) {
+        // In the case of empty blocks in an Avro file, `blockRemaining` could still read as 0 so
+        // `fileReader.hasNext()` returns false but advances the cursor to the next block, so we
+        // need to call `fileReader.hasNext()` again to correctly report if the next record
+        // exists.
+        val moreData =
+          (fileReader.hasNext || fileReader.hasNext) && !fileReader.pastSync(stopPosition)
+        if (!moreData) {
           fileReader.close()
           completed = true
           currentRow = None
-        } else if (fileReader.hasNext()) {
+        } else {
           val record = fileReader.next()
           // the row must be deserialized in hasNextRow, because AvroDeserializer#deserialize
           // potentially filters rows
           currentRow = deserializer.deserialize(record).asInstanceOf[Option[InternalRow]]
-        } else {
-          // In this case, `fileReader.hasNext()` returns false but we are not past sync point yet.
-          // This means empty blocks, we need to continue reading the file in case there are non
-          // empty blocks or we are past sync point.
         }
       }
       currentRow.isDefined
diff --git a/connector/avro/src/test/resources/empty_file.avro b/connector/avro/src/test/resources/empty_file.avro
new file mode 100644
index 0000000000000..9c7e15352149b
Binary files /dev/null and b/connector/avro/src/test/resources/empty_file.avro differ
diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
index 1065a332cfc2a..3db95835e2f8b 100644
--- a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
+++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
@@ -2744,6 +2744,16 @@ abstract class AvroSuite
       }
     }
   }
+
+  test("SPARK-46990: read an empty file where pastSync returns false at EOF") {
+    for (maxPartitionBytes <- Seq(100, 100000, 100000000)) {
+      withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> s"$maxPartitionBytes") {
+        val file = getResourceAvroFilePath("empty_file.avro")
+        val df = spark.read.format("avro").load(file)
+        assert(df.count() == 0)
+      }
+    }
+  }
 }
 
 class AvroV1Suite extends AvroSuite {
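Note (not part of the patch above): the sketch below illustrates the same double `hasNext()` guard against the plain Avro `DataFileReader` API, outside of Spark. The file path is a placeholder, and `stopPosition` is simply the file length here, standing in for the end of the Spark file split; this is an illustrative reconstruction of the pattern the patch adds, not code from the repository.

// Illustrative sketch only: reads all records from a local Avro file using the
// "call hasNext() twice" guard, so that an empty block (a block containing zero
// records) does not terminate the read early.
import java.io.File

import org.apache.avro.file.DataFileReader
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}

object EmptyBlockReadSketch {
  def main(args: Array[String]): Unit = {
    val path = new File("/tmp/empty_file.avro")   // placeholder path
    val stopPosition = path.length()              // whole file, analogous to a single split
    val reader = new DataFileReader[GenericRecord](path, new GenericDatumReader[GenericRecord]())
    try {
      var done = false
      while (!done) {
        // On an empty block the first hasNext() returns false but moves the cursor to the
        // next block; the second call reports whether a record actually follows.
        val moreData = (reader.hasNext || reader.hasNext) && !reader.pastSync(stopPosition)
        if (!moreData) {
          done = true
        } else {
          println(reader.next())
        }
      }
    } finally {
      reader.close()
    }
  }
}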