Skip to content

Commit

Permalink
feat: Enable Arrow-based columnar data transfers when to pandas in sp…
Browse files Browse the repository at this point in the history
…arksource retrieval job

Signed-off-by: tanlocnguyen <tanlocnguyen296@gmail.com>
  • Loading branch information
ElliotNguyen68 committed Mar 9, 2024
1 parent e6fc736 commit 4bfaff4
Showing 1 changed file with 7 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,11 @@ def to_spark_df(self) -> pyspark.sql.DataFrame:

def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame:
"""Return dataset as Pandas DataFrame synchronously"""
spark_session = get_spark_session_or_start_new_with_repoconfig(
self._config.offline_store
)
spark_session.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
spark_session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
return self.to_spark_df().toPandas()

def _to_arrow_internal(self, timeout: Optional[int] = None) -> pyarrow.Table:
Expand Down Expand Up @@ -496,7 +501,8 @@ def _get_entity_df_event_timestamp_range(


def _get_entity_schema(
spark_session: SparkSession, entity_df: Union[pandas.DataFrame, str]
spark_session: SparkSession,
entity_df: Union[pandas.DataFrame, str],
) -> Dict[str, np.dtype]:
if isinstance(entity_df, pd.DataFrame):
return dict(zip(entity_df.columns, entity_df.dtypes))
Expand Down

0 comments on commit 4bfaff4

Please sign in to comment.