logicalclocks · SirOibaf · Jan 28, 2021 · Jan 26, 2021 · Jan 26, 2021 · Jan 26, 2021
diff --git a/python/hsfs/core/tfdata_engine.py b/python/hsfs/core/tfdata_engine.py
@@ -76,9 +76,7 @@ def __init__(
         self._training_dataset_format = self._training_dataset.data_format
 
         self._input_files = self._get_training_dataset_files(
-            self._training_dataset.location,
-            self._split,
-            filter_empty=True if self._training_dataset_format == "csv" else False,
+            self._training_dataset.location, self._split
         )
 
         if self._feature_names is None:
@@ -380,9 +378,7 @@ def _get_tf_dataset(self, input_files, cycle_length):
 
         return dataset, tfrecord_feature_description
 
-    def _get_training_dataset_files(
-        self, training_dataset_location, split, filter_empty=False
-    ):
+    def _get_training_dataset_files(self, training_dataset_location, split):
         """
         returns list of absolute path of training input files
         :param training_dataset_location: training_dataset_location
@@ -395,7 +391,7 @@ def _get_training_dataset_files(
 
         if training_dataset_location.startswith("hopsfs"):
             input_files = self._get_hopsfs_dataset_files(
-                training_dataset_location, split, filter_empty
+                training_dataset_location, split
             )
         elif training_dataset_location.startswith("s3"):
             input_files = self._get_s3_dataset_files(training_dataset_location, split)
@@ -405,7 +401,7 @@ def _get_training_dataset_files(
         return input_files
 
     @staticmethod
-    def _get_hopsfs_dataset_files(training_dataset_location, split, filter_empty):
+    def _get_hopsfs_dataset_files(training_dataset_location, split):
         path = training_dataset_location.replace("hopsfs", "hdfs")
         if split is None:
             path = hdfs.path.abspath(path)
@@ -417,19 +413,12 @@ def _get_hopsfs_dataset_files(training_dataset_location, split, filter_empty):
         all_list = hdfs.ls(path, recursive=True)
 
         # Remove directories and spark '_SUCCESS'
-        include_file = True
         for file in all_list:
             # remove empty file if any
-            if filter_empty:
-                _file_size = hdfs.path.getsize(file)
-                if _file_size == 0:
-                    include_file = False
-                else:
-                    include_file = True
             if (
                 not hdfs.path.isdir(file)
                 and not file.endswith("_SUCCESS")
-                and include_file
+                and hdfs.path.getsize(file) >= 1
             ):
                 input_files.append(file)