diff --git a/docs/README.data.md b/docs/README.data.md index 4aea01c..c486461 100644 --- a/docs/README.data.md +++ b/docs/README.data.md @@ -73,7 +73,7 @@ contains the response variable. #### How to create a Feature Selection Spark Data Frame (FSDF) ```python -from fsspark.context import init_spark, stop_spark_session +from fsspark.config.context import init_spark, stop_spark_session from fsspark.fs.core import FSDataFrame from fsspark.utils.io import import_table_as_psdf @@ -81,12 +81,12 @@ from fsspark.utils.io import import_table_as_psdf init_spark() # Import data -psdf = import_table_as_psdf('data.tsv.bgz', - sep='\t', +psdf = import_table_as_psdf('data.tsv.bgz', + sep='\t', n_partitions=5) # Create FSDataFrame -fsdf = FSDataFrame(psdf, - sample_col='sample_id', +fsdf = FSDataFrame(psdf, + sample_col='sample_id', label_col='response') # Stop spark stop_spark_session() diff --git a/docs/README.methods.md b/docs/README.methods.md index 55b0a03..7b9caa2 100644 --- a/docs/README.methods.md +++ b/docs/README.methods.md @@ -64,7 +64,7 @@ multivariate correlation filter and Random Forest classification. 
""" -from fsspark.context import init_spark, stop_spark_session +from fsspark.config.context import init_spark, stop_spark_session from fsspark.fs.core import FSDataFrame from fsspark.fs.ml import cv_rf_classification, get_accuracy, get_predictions from fsspark.fs.multivariate import multivariate_filter diff --git a/fsspark/config/__init__.py b/fsspark/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fsspark/context.py b/fsspark/config/context.py similarity index 88% rename from fsspark/context.py rename to fsspark/config/context.py index 8f0aaa8..5b8a9cf 100644 --- a/fsspark/context.py +++ b/fsspark/config/context.py @@ -2,9 +2,9 @@ import pyspark from pyspark.sql import SparkSession -from fsspark.global_settings import (SPARK_EXTRA_SETTINGS, - PYARROW_SETTINGS, - PANDAS_ON_SPARK_API_SETTINGS) +from fsspark.config.global_settings import (SPARK_EXTRA_SETTINGS, + PYARROW_SETTINGS, + PANDAS_ON_SPARK_API_SETTINGS) os.environ['PYARROW_IGNORE_TIMEZONE'] = "1" diff --git a/fsspark/global_settings.py b/fsspark/config/global_settings.py similarity index 84% rename from fsspark/global_settings.py rename to fsspark/config/global_settings.py index 7b5929e..53675ee 100644 --- a/fsspark/global_settings.py +++ b/fsspark/config/global_settings.py @@ -3,10 +3,10 @@ # spark settings to test this module locally. 
SPARK_EXTRA_SETTINGS = {'spark.executor.memory': '8g', - 'spark.driver.memory': '20g', + 'spark.driver.memory': '16g', "spark.memory.offHeap.enabled": 'true', - "spark.memory.offHeap.size": '2g', - "spark.sql.pivotMaxValues": '60000', + "spark.memory.offHeap.size": '4g', + "spark.sql.pivotMaxValues": '100000', "spark.network.timeout": '100000', "spark.sql.session.timeZone": "UTC" } diff --git a/fsspark/pipeline/fs_corr_rf.py b/fsspark/pipeline/fs_corr_rf.py index 49fe715..1d6f369 100644 --- a/fsspark/pipeline/fs_corr_rf.py +++ b/fsspark/pipeline/fs_corr_rf.py @@ -6,7 +6,7 @@ """ -from fsspark.context import init_spark, stop_spark_session +from fsspark.config.context import init_spark, stop_spark_session from fsspark.fs.core import FSDataFrame from fsspark.fs.ml import cv_rf_classification, get_accuracy, get_predictions, get_feature_scores from fsspark.fs.multivariate import multivariate_filter diff --git a/fsspark/tests/test_FSDataFrame.py b/fsspark/tests/test_FSDataFrame.py index b0040ad..2376b99 100644 --- a/fsspark/tests/test_FSDataFrame.py +++ b/fsspark/tests/test_FSDataFrame.py @@ -1,9 +1,9 @@ import unittest -from fsspark.context import init_spark, stop_spark_session +from fsspark.config.context import init_spark, stop_spark_session from fsspark.fs.core import FSDataFrame from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.utils.io import import_table, import_table_as_psdf +from fsspark.utils.io import import_table_as_psdf class FSDataFrameTest(unittest.TestCase): diff --git a/fsspark/tests/test_import_export.py b/fsspark/tests/test_import_export.py index 5ddce64..57b0c5b 100644 --- a/fsspark/tests/test_import_export.py +++ b/fsspark/tests/test_import_export.py @@ -3,7 +3,7 @@ import pyspark import pyspark.pandas as ps -from fsspark.context import init_spark, stop_spark_session +from fsspark.config.context import init_spark, stop_spark_session from fsspark.utils.datasets import get_tnbc_data_path from fsspark.utils.io import import_table, 
import_table_as_psdf diff --git a/fsspark/utils/io.py b/fsspark/utils/io.py index adeca8e..59321a5 100644 --- a/fsspark/utils/io.py +++ b/fsspark/utils/io.py @@ -3,7 +3,7 @@ import pyspark.pandas import pyspark.sql -from fsspark.context import PANDAS_ON_SPARK_API_SETTINGS +from fsspark.config.context import PANDAS_ON_SPARK_API_SETTINGS warnings.filterwarnings("ignore")