From 129cfe41a45be6e609e8697a5d2a23fccb07faa9 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 5 Sep 2024 16:41:39 -0600 Subject: [PATCH 1/3] add long conversions --- flasc/flasc_dataframe.py | 137 ++++++++++++++++++++++++++-------- tests/flasc_dataframe_test.py | 117 +++++++++++++++++++++++------ 2 files changed, 199 insertions(+), 55 deletions(-) diff --git a/flasc/flasc_dataframe.py b/flasc/flasc_dataframe.py index f22862f4..c7a73c5b 100644 --- a/flasc/flasc_dataframe.py +++ b/flasc/flasc_dataframe.py @@ -1,4 +1,5 @@ """FLASC DataFrame module.""" + from pandas import DataFrame @@ -34,7 +35,7 @@ class FlascDataFrame(DataFrame): # Attributes to pickle must be in this list _metadata = ["name_map", "_user_format"] - def __init__(self, *args, name_map=None, **kwargs): + def __init__(self, *args, name_map=None, in_flasc_format=True, user_format="wide", **kwargs): """Initialize the FlascDataFrame class, a subclass of pandas.DataFrame. Args: @@ -42,12 +43,13 @@ def __init__(self, *args, name_map=None, **kwargs): name_map (dict): Dictionary of column names to map from the user format to the FLASC format, where the key string is the user format and the value string is the FLASC equivalent. Defaults to None. + in_flasc_format (bool): Whether the data is in FLASC format. Defaults to True. + user_format (str): The format that the user expects the data to be in. Must be one of + 'long', 'semiwide', or 'wide'. Defaults to 'wide'. **kwargs: keyword arguments to pass to the DataFrame constructor """ super().__init__(*args, **kwargs) - self._user_format = "wide" # or "long" or "semiwide" - # check that name_map dictionary is valid if name_map is not None: if not isinstance(name_map, dict): @@ -55,8 +57,26 @@ def __init__(self, *args, name_map=None, **kwargs): if not all(isinstance(k, str) and isinstance(v, str) for k, v in name_map.items()): raise ValueError("name_map must be a dictionary of strings") self.name_map = name_map - # Apply the name_map - self.convert_to_flasc_format(inplace=True) # Do we want to do this here? + + # Save the reversed name_map (to go to user_format) + self._name_map_to_user = ( + {v: k for k, v in name_map.items()} if name_map is not None else None + ) + + # Set the format + self._in_flasc_format = in_flasc_format + + # Save the user format + if user_format not in ["long", "semiwide", "wide"]: + raise ValueError("user_format must be one of 'long', 'semiwide', 'wide'") + self._user_format = user_format + + # I think we should not convert to allow to stay in user format + # # Convert to flasc format if not already + # if not in_flasc_format: + # self.convert_to_flasc_format(inplace=True) + # else: + # self._in_flasc_format = True @property def _constructor(self): @@ -95,55 +115,110 @@ def convert_to_user_format(self, inplace=False): """Convert the DataFrame to the format that the user expects, given the name_map.""" # Convert the format if self._user_format == "long": - self._convert_wide_to_long() # Should this be assigned to something? + df_user = self._convert_wide_to_long() elif self._user_format == "semiwide": - self._convert_wide_to_semiwide() # Should this be assigned to something? + df_user = self._convert_wide_to_semiwide() elif self._user_format == "wide": - pass - - # Set the flag - self._in_flasc_format = False - - # Convert column names and return - if self.name_map is not None: - return self.rename(columns={v: k for k, v in self.name_map.items()}, inplace=inplace) + df_user = self.copy() + + # In wide to wide conversion, only need to rename the columns + if self.name_map is not None: + df_user.rename(self._name_map_to_user, inplace=inplace) + + # Assign to self or return + if inplace: + self.__init__( + df_user, + name_map=self.name_map, + in_flasc_format=False, + user_format=self._user_format, + ) else: - return None if inplace else self.copy() + # Force in flasc format to False + df_user._in_flasc_format = False + + return df_user def convert_to_flasc_format(self, inplace=False): """Convert the DataFrame to the format that FLASC expects.""" # Convert the format if self._user_format == "long": - self._convert_long_to_wide() # Should this be assigned to something? + df_flasc = self._convert_long_to_wide() # Should this be assigned to something? elif self._user_format == "semiwide": - self._convert_semiwide_to_wide() # Should this be assigned to something? + df_flasc = self._convert_semiwide_to_wide() # Should this be assigned to something? elif self._user_format == "wide": - pass - - # Set the flag - self._in_flasc_format = True - - # Convert column names and return - if self.name_map is not None: - return self.rename(columns=self.name_map, inplace=inplace) + df_flasc = self.copy() + + # In wide to wide conversion, only need to rename the columns + if self.name_map is not None: + df_flasc.rename(columns=self.name_map, inplace=inplace) + + # Assign to self or return + if inplace: + self.__init__( + df_flasc, + name_map=self.name_map, + in_flasc_format=True, + user_format=self._user_format, + ) else: - return None if inplace else self.copy() + # Force in flasc format to True + df_flasc._in_flasc_format = True + + return df_flasc def _convert_long_to_wide(self): """Convert a long format DataFrame to a wide format DataFrame.""" - # raise NotImplementedError("TO DO") - pass + # Start by converting the variable names + df_wide = self.copy() + if df_wide.name_map is not None: + df_wide["variable"] = df_wide["variable"].map(df_wide.name_map) + + # Pivot the table so the variable column becomes the column names with time + # kept as the first column and value as the values + df_wide = df_wide.pivot(index="time", columns="variable", values="value").reset_index() + + # Remove the name + df_wide.columns.name = None + + # Reset the index to make the time column a regular column + return FlascDataFrame( + df_wide, + name_map=self.name_map, + in_flasc_format=self._in_flasc_format, + user_format=self._user_format, + ) def _convert_semiwide_to_wide(self): """Convert a semiwide format DataFrame to a wide format DataFrame.""" raise NotImplementedError("TO DO") def _convert_wide_to_long(self): - """Convert a wide format DataFrame to a long format DataFrame.""" + """Convert a wide format DataFrame to a long format DataFrame. + + Returns: + FlascDataFrame: Long format FlascDataFrame + + """ if "time" not in self.columns: raise ValueError("Column 'time' must be present in the DataFrame") - return self.melt(id_vars="time", var_name="variable", value_name="value") + df_long = self.melt(id_vars="time", var_name="variable", value_name="value").sort_values( + ["time", "variable"] + ) + + if self.name_map is not None: + df_long["variable"] = df_long["variable"].map(self._name_map_to_user) + + # Reset index for cleanliness + df_long = df_long.reset_index(drop=True) + + return FlascDataFrame( + df_long, + name_map=self.name_map, + in_flasc_format=self._in_flasc_format, + user_format=self._user_format, + ) def _convert_wide_to_semiwide(self): """Convert a wide format DataFrame to a semiwide format DataFrame.""" diff --git a/tests/flasc_dataframe_test.py b/tests/flasc_dataframe_test.py index c0a8baf7..573779c8 100644 --- a/tests/flasc_dataframe_test.py +++ b/tests/flasc_dataframe_test.py @@ -5,16 +5,45 @@ from flasc.flasc_dataframe import FlascDataFrame -test_data_dict = {"time": [0, 10, 20], "a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - -test_name_map = {"a": "AA"} +# Define dataframes in each format that relate through the test name map +test_wide_dict = { + "time": [0, 10, 20], + "pow_000": [0, 100, 200], + "ws_000": [8, 8, 8], + "pow_001": [50, 150, 250], + "ws_001": [9, 9, 9], +} + +test_name_map = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001", "T2WS": "ws_001"} + + +test_semi_wide_dict = { + "time": [0, 0, 10, 10, 20, 20], + "turbine_id": [0, 1, 0, 1, 0, 1], + "pow": [0, 50, 100, 150, 200, 250], + "ws": [8, 9, 8, 9, 8, 9], +} + +test_long_dict = { + "time": [0, 0, 0, 0, 10, 10, 10, 10, 20, 20, 20, 20], + "variable": ["T1PWR", "T2PWR", "T1WS", "T2WS"] * 3, + "value": [0, 50, 8, 9, 100, 150, 8, 9, 200, 250, 8, 9], +} + +test_wide_user_dict = { + "time": [0, 10, 20], + "T1PWR": [0, 100, 200], + "T1WS": [8, 8, 8], + "T2PWR": [50, 150, 250], + "T2WS": [9, 9, 9], +} def test_type(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) assert isinstance(df, FlascDataFrame) - df2 = df.drop(columns="c") # Modifies the dataframe, returns a copy + df2 = df.drop(columns="ws_001") # Modifies the dataframe, returns a copy assert isinstance(df2, FlascDataFrame) # Assert df is a pandas DataFrame @@ -22,10 +51,10 @@ def test_type(): def test__metadata(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) df._user_format = "long" df._in_flasc_format = False - df2 = df.drop(columns="c") # Modifies the dataframe, returns a copy + df2 = df.drop(columns="ws_001") # Modifies the dataframe, returns a copy assert hasattr(df2, "name_map") assert df2.name_map == test_name_map assert hasattr(df2, "_user_format") @@ -37,7 +66,7 @@ def test__metadata(): def test_printout(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) df._in_flasc_format = True print(df) print("\n") @@ -48,7 +77,7 @@ def test_printout(): def test_check_flasc_format(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) # Should not raise an error df.check_flasc_format() @@ -61,19 +90,63 @@ def test_check_flasc_format(): def test_convert_to_long_format(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) - df._user_format = "long" # Should be detected internally - df.convert_to_user_format(inplace=True) # Should not pass + df_wide = FlascDataFrame(test_wide_dict, name_map=test_name_map) + df_long_test = pd.DataFrame(test_long_dict) + + # Test conversion with return + df_wide._user_format = "long" # Should be detected internally + df_wide_copy = df_wide.copy() + df_long = df_wide.convert_to_user_format(inplace=False) + + # Test df_long is not in flasc format + assert not df_long._in_flasc_format + + # Test returned frame is matched to expected value + pd.testing.assert_frame_equal(df_long, df_long_test) + + # Test original frame is unchanged + pd.testing.assert_frame_equal(df_wide, df_wide_copy) + + # Now test in place conversion + df_wide.convert_to_user_format(inplace=True) + pd.testing.assert_frame_equal(df_wide, df_long_test) + + # Assert not in flasc format + assert not df_wide._in_flasc_format + + # Now test the back conversion + df_back_to_wide = df_wide.convert_to_flasc_format(inplace=False) + + # Resort the columns to match + df_back_to_wide = df_back_to_wide[df_wide_copy.columns] + + pd.testing.assert_frame_equal(df_back_to_wide, df_wide_copy) + + # Assert is in flasc format + assert df_back_to_wide._in_flasc_format + + # Test in place version + df_wide.convert_to_flasc_format(inplace=True) + + # Sort columns to match + df_wide = df_wide[df_wide_copy.columns] + + pd.testing.assert_frame_equal(df_wide, df_wide_copy) # Check operation not allowed if no "time" column - df.convert_to_flasc_format(inplace=True) - df.drop(columns="time", inplace=True) + df_wide.drop(columns="time", inplace=True) with pytest.raises(ValueError): - df.convert_to_user_format(inplace=True) + df_wide.convert_to_user_format(inplace=True) + + +def test_convert_to_wide_format(): + # Test wide to wide conversion + + pass def test_pickle(): - df = FlascDataFrame(test_data_dict) + df = FlascDataFrame(test_wide_dict) df.name_map = test_name_map df.to_pickle("test_pickle.pkl") @@ -85,7 +158,7 @@ def test_pickle(): def test_feather(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) df.to_feather("test_feather.ftr") df2 = pd.read_feather("test_feather.ftr") @@ -98,7 +171,7 @@ def test_feather(): def test_csv(): - df = FlascDataFrame(test_data_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) df.to_csv("test_csv.csv") df2 = pd.read_csv("test_csv.csv") @@ -112,15 +185,11 @@ def test_csv(): def test_n_turbines(): # Currently, n_turbines based only on number of pow columns - name_map = {"a": "pow_000", "b": "pow_001", "c": "ws_000"} - df = FlascDataFrame(test_data_dict, name_map=name_map) + df = FlascDataFrame(test_wide_dict, name_map=test_name_map) assert df.n_turbines == 2 - name_map = {"a": "pow_000", "b": "ws_000", "c": "ws_001"} - df = FlascDataFrame(test_data_dict, name_map=name_map) - assert df.n_turbines == 1 - # Check n_turbines not valid if not in flasc format + df._user_format = "long" df.convert_to_user_format(inplace=True) with pytest.raises(ValueError): df.n_turbines From 28f9ff1f4816ab053956311b8a43fd441b26d0cf Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 9 Sep 2024 15:47:04 -0600 Subject: [PATCH 2/3] Update to long/wide only --- flasc/flasc_dataframe.py | 235 ++++++++++++++++++---------------- tests/flasc_dataframe_test.py | 197 ++++++++++++++++++---------- 2 files changed, 250 insertions(+), 182 deletions(-) diff --git a/flasc/flasc_dataframe.py b/flasc/flasc_dataframe.py index c7a73c5b..fbf12752 100644 --- a/flasc/flasc_dataframe.py +++ b/flasc/flasc_dataframe.py @@ -7,76 +7,76 @@ class FlascDataFrame(DataFrame): """Subclass of pandas.DataFrame for working with FLASC data. - I think it makes most sense to store it as FLASC expects it: - - with the correct column names - - in wide format + Stores data in preferred Flasc format, or user format, with option to convert between the two. - Then, can offer a transformation to export as the user would like it, for them to work on it - further. How, then, would I revert it back to the needed format - - - Two possible types of data we should try to handle: - 1. Semiwide: - - One column for time stamp - - One column for turbine id - - Many data channel columns - 2. Long: - - One column for time stamp - - One column for variable name - - One column for value - - FLASC format is wide, i.e. - - One column for time stamp - - One column for each channel for each turbine - - Want handling to go between long and wide and semiwide and wide. + Want handling to go between long and wide. """ # Attributes to pickle must be in this list - _metadata = ["name_map", "_user_format"] - - def __init__(self, *args, name_map=None, in_flasc_format=True, user_format="wide", **kwargs): + _metadata = [ + "channel_name_map", + "_channel_name_map_to_user", + "_user_format", + "_long_data_columns", + ] + + def __init__(self, *args, channel_name_map=None, long_data_columns=None, **kwargs): """Initialize the FlascDataFrame class, a subclass of pandas.DataFrame. Args: *args: arguments to pass to the DataFrame constructor - name_map (dict): Dictionary of column names to map from the user format to the FLASC - format, where the key string is the user format and the value string is the FLASC - equivalent. Defaults to None. - in_flasc_format (bool): Whether the data is in FLASC format. Defaults to True. - user_format (str): The format that the user expects the data to be in. Must be one of - 'long', 'semiwide', or 'wide'. Defaults to 'wide'. + channel_name_map (dict): Dictionary of column names to map from the user format to the + FLASC format, where the key string is the user format and the value string is the + FLASC equivalent. Defaults to None. + long_data_columns (dict): Dictionary of column names for long format data. Defaults to + {"variable_column": "variable", "value_column": "value"}. If + not provided, user data format assumed to be wide. **kwargs: keyword arguments to pass to the DataFrame constructor """ super().__init__(*args, **kwargs) + # Check that the time column is present + if "time" not in self.columns: + raise ValueError("Column 'time' must be present in the DataFrame") + # check that name_map dictionary is valid - if name_map is not None: - if not isinstance(name_map, dict): - raise ValueError("name_map must be a dictionary") - if not all(isinstance(k, str) and isinstance(v, str) for k, v in name_map.items()): - raise ValueError("name_map must be a dictionary of strings") - self.name_map = name_map + if channel_name_map is not None: + if not isinstance(channel_name_map, dict): + raise ValueError("channel_name_map must be a dictionary") + if not all( + isinstance(k, str) and isinstance(v, str) for k, v in channel_name_map.items() + ): + raise ValueError("channel_name_map must be a dictionary of strings") + self.channel_name_map = channel_name_map # Save the reversed name_map (to go to user_format) - self._name_map_to_user = ( - {v: k for k, v in name_map.items()} if name_map is not None else None + self._channel_name_map_to_user = ( + {v: k for k, v in channel_name_map.items()} if channel_name_map is not None else None ) - # Set the format - self._in_flasc_format = in_flasc_format - - # Save the user format - if user_format not in ["long", "semiwide", "wide"]: - raise ValueError("user_format must be one of 'long', 'semiwide', 'wide'") - self._user_format = user_format + # Determine the user format + if long_data_columns is None: + self._user_format = "wide" + self._long_data_columns = None + else: + self._user_format = "long" + + # Confirm the long_data_columns is a dictionary with the correct keys + if not isinstance(long_data_columns, dict): + raise ValueError("long_data_columns must be a dictionary") + if not all(col in long_data_columns for col in ["variable_column", "value_column"]): + raise ValueError( + "long_data_columns must contain keys 'variable_column', " "and 'value_column'" + ) + self._long_data_columns = long_data_columns - # I think we should not convert to allow to stay in user format - # # Convert to flasc format if not already - # if not in_flasc_format: - # self.convert_to_flasc_format(inplace=True) - # else: - # self._in_flasc_format = True + @property + def in_flasc_format(self): + """Return True if the data is in FLASC format, False otherwise.""" + if ("time" in self.columns) and ("pow_000" in self.columns): + return True + else: + return False @property def _constructor(self): @@ -84,10 +84,10 @@ def _constructor(self): def __str__(self): """Printout when calling print(df).""" - if self._in_flasc_format: + if self.in_flasc_format: return "FlascDataFrame in FLASC format\n" + super().__str__() else: - return "FlascDataFrame in user format\n" + super().__str__() + return f"FlascDataFrame in user ({self._user_format}) format\n" + super().__str__() @property def n_turbines(self): @@ -101,7 +101,7 @@ def n_turbines(self): def check_flasc_format(self): """Raise an error if the data is not in FLASC format.""" - if not self._in_flasc_format: + if not self.in_flasc_format: raise ValueError( ( "Data must be in FLASC format to perform this operation." @@ -112,71 +112,101 @@ def check_flasc_format(self): pass def convert_to_user_format(self, inplace=False): - """Convert the DataFrame to the format that the user expects, given the name_map.""" + """Convert the DataFrame to the format that the user expects, given the channel_name_map. + + Args: + inplace (bool): If True, modify the DataFrame in place. + If False, return a new DataFrame. + + Returns: + FlascDataFrame: FlascDataFrame in user format if inplace is False, None otherwise. + + """ + # Check if already in user format + if not self.in_flasc_format: + if inplace: + return + else: + return self.copy() + # Convert the format if self._user_format == "long": df_user = self._convert_wide_to_long() - elif self._user_format == "semiwide": - df_user = self._convert_wide_to_semiwide() elif self._user_format == "wide": df_user = self.copy() # In wide to wide conversion, only need to rename the columns - if self.name_map is not None: - df_user.rename(self._name_map_to_user, inplace=inplace) + if self.channel_name_map is not None: + df_user.rename(columns=self._channel_name_map_to_user, inplace=True) # Assign to self or return if inplace: self.__init__( df_user, - name_map=self.name_map, - in_flasc_format=False, - user_format=self._user_format, + channel_name_map=self.channel_name_map, + long_data_columns=self._long_data_columns, ) else: - # Force in flasc format to False - df_user._in_flasc_format = False - return df_user def convert_to_flasc_format(self, inplace=False): - """Convert the DataFrame to the format that FLASC expects.""" + """Convert the DataFrame to the format that FLASC expects. + + Args: + inplace (bool): If True, modify the DataFrame in place. If False, + return a new DataFrame. + + Returns: + FlascDataFrame: FlascDataFrame in FLASC format if inplace is False, None otherwise + + """ + # Check if already in flasc format + if self.in_flasc_format: + if inplace: + return + else: + return self.copy() + # Convert the format if self._user_format == "long": df_flasc = self._convert_long_to_wide() # Should this be assigned to something? - elif self._user_format == "semiwide": - df_flasc = self._convert_semiwide_to_wide() # Should this be assigned to something? elif self._user_format == "wide": df_flasc = self.copy() # In wide to wide conversion, only need to rename the columns - if self.name_map is not None: - df_flasc.rename(columns=self.name_map, inplace=inplace) + if self.channel_name_map is not None: + df_flasc.rename(columns=self.channel_name_map, inplace=True) # Assign to self or return if inplace: self.__init__( df_flasc, - name_map=self.name_map, - in_flasc_format=True, - user_format=self._user_format, + channel_name_map=self.channel_name_map, + long_data_columns=self._long_data_columns, ) else: - # Force in flasc format to True - df_flasc._in_flasc_format = True - return df_flasc def _convert_long_to_wide(self): - """Convert a long format DataFrame to a wide format DataFrame.""" + """Convert a long format DataFrame to a wide format DataFrame. + + Returns: + FlascDataFrame: Wide format FlascDataFrame + """ # Start by converting the variable names df_wide = self.copy() - if df_wide.name_map is not None: - df_wide["variable"] = df_wide["variable"].map(df_wide.name_map) + if df_wide.channel_name_map is not None: + df_wide[self._long_data_columns["variable_column"]] = df_wide[ + self._long_data_columns["variable_column"] + ].map(df_wide.channel_name_map) # Pivot the table so the variable column becomes the column names with time # kept as the first column and value as the values - df_wide = df_wide.pivot(index="time", columns="variable", values="value").reset_index() + df_wide = df_wide.pivot( + index="time", + columns=self._long_data_columns["variable_column"], + values=self._long_data_columns["value_column"], + ).reset_index() # Remove the name df_wide.columns.name = None @@ -184,15 +214,10 @@ def _convert_long_to_wide(self): # Reset the index to make the time column a regular column return FlascDataFrame( df_wide, - name_map=self.name_map, - in_flasc_format=self._in_flasc_format, - user_format=self._user_format, + channel_name_map=self.channel_name_map, + long_data_columns=self._long_data_columns, ) - def _convert_semiwide_to_wide(self): - """Convert a semiwide format DataFrame to a wide format DataFrame.""" - raise NotImplementedError("TO DO") - def _convert_wide_to_long(self): """Convert a wide format DataFrame to a long format DataFrame. @@ -200,38 +225,26 @@ def _convert_wide_to_long(self): FlascDataFrame: Long format FlascDataFrame """ - if "time" not in self.columns: - raise ValueError("Column 'time' must be present in the DataFrame") - - df_long = self.melt(id_vars="time", var_name="variable", value_name="value").sort_values( - ["time", "variable"] - ) + df_long = self.melt( + id_vars="time", + var_name=self._long_data_columns["variable_column"], + value_name=self._long_data_columns["value_column"], + ).sort_values(["time", self._long_data_columns["variable_column"]]) - if self.name_map is not None: - df_long["variable"] = df_long["variable"].map(self._name_map_to_user) + if self.channel_name_map is not None: + df_long[self._long_data_columns["variable_column"]] = df_long[ + self._long_data_columns["variable_column"] + ].map(self._channel_name_map_to_user) # Reset index for cleanliness df_long = df_long.reset_index(drop=True) return FlascDataFrame( df_long, - name_map=self.name_map, - in_flasc_format=self._in_flasc_format, - user_format=self._user_format, + channel_name_map=self.channel_name_map, + long_data_columns=self._long_data_columns, ) - def _convert_wide_to_semiwide(self): - """Convert a wide format DataFrame to a semiwide format DataFrame.""" - if "time" not in self.columns: - raise ValueError("Column 'time' must be present in the DataFrame") - - raise NotImplementedError("TO DO") - # Should have columns: - # time - # turbine_id (as specified by the user) - # variable - # value - def to_feather(self, path, **kwargs): """Raise warning about lost information and save to feather format.""" print( diff --git a/tests/flasc_dataframe_test.py b/tests/flasc_dataframe_test.py index 573779c8..028f18eb 100644 --- a/tests/flasc_dataframe_test.py +++ b/tests/flasc_dataframe_test.py @@ -14,16 +14,10 @@ "ws_001": [9, 9, 9], } -test_name_map = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001", "T2WS": "ws_001"} +test_channel_name_map = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001", "T2WS": "ws_001"} +test_long_columns = {"variable_column": "variable", "value_column": "value"} -test_semi_wide_dict = { - "time": [0, 0, 10, 10, 20, 20], - "turbine_id": [0, 1, 0, 1, 0, 1], - "pow": [0, 50, 100, 150, 200, 250], - "ws": [8, 9, 8, 9, 8, 9], -} - test_long_dict = { "time": [0, 0, 0, 0, 10, 10, 10, 10, 20, 20, 20, 20], "variable": ["T1PWR", "T2PWR", "T1WS", "T2WS"] * 3, @@ -40,7 +34,7 @@ def test_type(): - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) assert isinstance(df, FlascDataFrame) df2 = df.drop(columns="ws_001") # Modifies the dataframe, returns a copy @@ -51,145 +45,206 @@ def test_type(): def test__metadata(): - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) - df._user_format = "long" - df._in_flasc_format = False + df = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) df2 = df.drop(columns="ws_001") # Modifies the dataframe, returns a copy - assert hasattr(df2, "name_map") - assert df2.name_map == test_name_map + assert hasattr(df2, "channel_name_map") + assert df2.channel_name_map == test_channel_name_map assert hasattr(df2, "_user_format") assert df2._user_format == "long" - assert hasattr(df2, "_in_flasc_format") - assert df2._in_flasc_format == True # Resets, since "_in_flasc_format" not in _metadata. - # May want to add "_in_flasc_format" to _metadata in future, but this - # demonstrates functionality + assert hasattr(df2, "in_flasc_format") + assert df2.in_flasc_format == True def test_printout(): - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) - df._in_flasc_format = True - print(df) - print("\n") - df._in_flasc_format = False + df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) + # df._in_flasc_format = True print(df) print("\n") - print(df.head()) # In FLASC format, presumably because .head() returns a reinstantiated copy? + # df._in_flasc_format = False + # print(df) + # print("\n") + + +def test_time_required(): + # Check that the time column is present + with pytest.raises(ValueError): + FlascDataFrame( + {"pow_000": [0, 100, 200], "ws_000": [8, 8, 8]}, channel_name_map=test_channel_name_map + ) def test_check_flasc_format(): - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) # Should not raise an error df.check_flasc_format() # Convert to non-flasc format; should now raise an error - df._user_format = "long" df.convert_to_user_format(inplace=True) with pytest.raises(ValueError): df.check_flasc_format() -def test_convert_to_long_format(): - df_wide = FlascDataFrame(test_wide_dict, name_map=test_name_map) - df_long_test = pd.DataFrame(test_long_dict) +def test_convert_flasc_wide_to_user_wide(): + df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) + df_wide_user = FlascDataFrame(test_wide_user_dict, channel_name_map=test_channel_name_map) - # Test conversion with return - df_wide._user_format = "long" # Should be detected internally - df_wide_copy = df_wide.copy() - df_long = df_wide.convert_to_user_format(inplace=False) + pd.testing.assert_frame_equal(df_wide_flasc.convert_to_user_format(), df_wide_user) - # Test df_long is not in flasc format - assert not df_long._in_flasc_format - # Test returned frame is matched to expected value - pd.testing.assert_frame_equal(df_long, df_long_test) +def test_convert_user_wide_to_flasc_wide(): + df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) + df_wide_user = FlascDataFrame(test_wide_user_dict, channel_name_map=test_channel_name_map) - # Test original frame is unchanged - pd.testing.assert_frame_equal(df_wide, df_wide_copy) + pd.testing.assert_frame_equal(df_wide_user.convert_to_flasc_format(), df_wide_flasc) - # Now test in place conversion - df_wide.convert_to_user_format(inplace=True) - pd.testing.assert_frame_equal(df_wide, df_long_test) - # Assert not in flasc format - assert not df_wide._in_flasc_format +def test_convert_flasc_wide_in_place(): + df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) + df_wide_user = FlascDataFrame(test_wide_user_dict, channel_name_map=test_channel_name_map) - # Now test the back conversion - df_back_to_wide = df_wide.convert_to_flasc_format(inplace=False) + df_wide_flasc.convert_to_user_format(inplace=True) + pd.testing.assert_frame_equal(df_wide_flasc, df_wide_user) - # Resort the columns to match - df_back_to_wide = df_back_to_wide[df_wide_copy.columns] - pd.testing.assert_frame_equal(df_back_to_wide, df_wide_copy) +def test_convert_user_wide_in_place(): + df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) + df_wide_user = FlascDataFrame(test_wide_user_dict, channel_name_map=test_channel_name_map) - # Assert is in flasc format - assert df_back_to_wide._in_flasc_format + df_wide_user.convert_to_flasc_format(inplace=True) + pd.testing.assert_frame_equal(df_wide_user, df_wide_flasc) - # Test in place version - df_wide.convert_to_flasc_format(inplace=True) - # Sort columns to match - df_wide = df_wide[df_wide_copy.columns] +def test_convert_flasc_wide_back_and_forth(): + df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) + df_wide_flasc_copy = df_wide_flasc.copy() - pd.testing.assert_frame_equal(df_wide, df_wide_copy) + df_wide_flasc.convert_to_user_format(inplace=True) + df_wide_flasc.convert_to_flasc_format(inplace=True) + + pd.testing.assert_frame_equal(df_wide_flasc, df_wide_flasc_copy) + + +def test_convert_long_column_names(): + long_col_names = {"variable_column": "VA", "value_column": "VB"} + df_wide_flasc = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=long_col_names + ) + df_long = df_wide_flasc.convert_to_user_format() + + # Check that df_long has 3 columns named "VA", "VB", and "time" + assert "VA" in df_long.columns + assert "VB" in df_long.columns + assert "time" in df_long.columns + + +def test_convert_flasc_to_user_long(): + df_wide_flasc = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + df_long = FlascDataFrame( + test_long_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + + pd.testing.assert_frame_equal(df_wide_flasc.convert_to_user_format(), df_long) + + +def test_convert_user_long_to_flasc(): + df_wide_flasc = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + df_long = FlascDataFrame( + test_long_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + + # Note that the column order is different so fix that + pd.testing.assert_frame_equal( + df_long.convert_to_flasc_format()[df_wide_flasc.columns], df_wide_flasc + ) + + +def test_convert_flasc_long_in_place(): + df_wide_flasc = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + df_long = FlascDataFrame( + test_long_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + + df_wide_flasc.convert_to_user_format(inplace=True) + pd.testing.assert_frame_equal(df_wide_flasc, df_long) + + +def test_convert_user_long_in_place(): + df_wide_flasc = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + df_long = FlascDataFrame( + test_long_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + + df_long.convert_to_flasc_format(inplace=True) + pd.testing.assert_frame_equal(df_long[df_wide_flasc.columns], df_wide_flasc) - # Check operation not allowed if no "time" column - df_wide.drop(columns="time", inplace=True) - with pytest.raises(ValueError): - df_wide.convert_to_user_format(inplace=True) +def test_convert_flasc_long_back_and_forth(): + df_wide_flasc = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns + ) + df_wide_flasc_copy = df_wide_flasc.copy() -def test_convert_to_wide_format(): - # Test wide to wide conversion + df_wide_flasc.convert_to_user_format(inplace=True) + df_wide_flasc.convert_to_flasc_format(inplace=True) - pass + pd.testing.assert_frame_equal(df_wide_flasc[df_wide_flasc_copy.columns], df_wide_flasc_copy) def test_pickle(): df = FlascDataFrame(test_wide_dict) - df.name_map = test_name_map + df.channel_name_map = test_channel_name_map df.to_pickle("test_pickle.pkl") df2 = pd.read_pickle("test_pickle.pkl") assert isinstance(df2, FlascDataFrame) - assert df2.name_map == test_name_map + assert df2.channel_name_map == test_channel_name_map os.remove("test_pickle.pkl") def test_feather(): - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) df.to_feather("test_feather.ftr") df2 = pd.read_feather("test_feather.ftr") # Loaded DataFrame is a pandas DataFrame, not a FlascDataFrame assert not isinstance(df2, FlascDataFrame) assert isinstance(df2, pd.DataFrame) - assert not hasattr(df2, "name_map") + assert not hasattr(df2, "channel_name_map") os.remove("test_feather.ftr") def test_csv(): - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) df.to_csv("test_csv.csv") df2 = pd.read_csv("test_csv.csv") # Loaded DataFrame is a pandas DataFrame, not a FlascDataFrame assert not isinstance(df2, FlascDataFrame) assert isinstance(df2, pd.DataFrame) - assert not hasattr(df2, "name_map") + assert not hasattr(df2, "channel_name_map") os.remove("test_csv.csv") def test_n_turbines(): # Currently, n_turbines based only on number of pow columns - df = FlascDataFrame(test_wide_dict, name_map=test_name_map) + df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) assert df.n_turbines == 2 # Check n_turbines not valid if not in flasc format - df._user_format = "long" df.convert_to_user_format(inplace=True) with pytest.raises(ValueError): df.n_turbines From efb641237a79da48fac530cd9084d06cab0e3000 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 10 Sep 2024 13:30:31 -0600 Subject: [PATCH 3/3] Ensure incomplete map cases well-handled --- flasc/flasc_dataframe.py | 66 ++++++++++----------- tests/flasc_dataframe_test.py | 107 ++++++++++++++++++++++++++++++++-- 2 files changed, 132 insertions(+), 41 deletions(-) diff --git a/flasc/flasc_dataframe.py b/flasc/flasc_dataframe.py index fbf12752..4501f246 100644 --- a/flasc/flasc_dataframe.py +++ b/flasc/flasc_dataframe.py @@ -129,15 +129,16 @@ def convert_to_user_format(self, inplace=False): else: return self.copy() - # Convert the format - if self._user_format == "long": - df_user = self._convert_wide_to_long() - elif self._user_format == "wide": - df_user = self.copy() + # Make a copy of self + df_user = self.copy() + + # Rename the channel columns to user-specified names + if self.channel_name_map is not None: + df_user.rename(columns=self._channel_name_map_to_user, inplace=True) - # In wide to wide conversion, only need to rename the columns - if self.channel_name_map is not None: - df_user.rename(columns=self._channel_name_map_to_user, inplace=True) + # Convert the format to long if _user_format is long + if self._user_format == "long": + df_user = self._convert_wide_to_long(df_user) # Assign to self or return if inplace: @@ -167,15 +168,16 @@ def convert_to_flasc_format(self, inplace=False): else: return self.copy() - # Convert the format + # Make a copy of self + df_flasc = self.copy() + + # Convert back from long if necessary if self._user_format == "long": - df_flasc = self._convert_long_to_wide() # Should this be assigned to something? - elif self._user_format == "wide": - df_flasc = self.copy() + df_flasc = self._convert_long_to_wide(df_flasc) - # In wide to wide conversion, only need to rename the columns - if self.channel_name_map is not None: - df_flasc.rename(columns=self.channel_name_map, inplace=True) + # Rename the channel columns to flasc-naming convention + if self.channel_name_map is not None: + df_flasc.rename(columns=self.channel_name_map, inplace=True) # Assign to self or return if inplace: @@ -187,60 +189,54 @@ def convert_to_flasc_format(self, inplace=False): else: return df_flasc - def _convert_long_to_wide(self): + def _convert_long_to_wide(self, df_): """Convert a long format DataFrame to a wide format DataFrame. + Args: + df_ (FlascDataFrame): Long format FlascDataFrame + Returns: FlascDataFrame: Wide format FlascDataFrame """ - # Start by converting the variable names - df_wide = self.copy() - if df_wide.channel_name_map is not None: - df_wide[self._long_data_columns["variable_column"]] = df_wide[ - self._long_data_columns["variable_column"] - ].map(df_wide.channel_name_map) - # Pivot the table so the variable column becomes the column names with time # kept as the first column and value as the values - df_wide = df_wide.pivot( + df_ = df_.pivot( index="time", columns=self._long_data_columns["variable_column"], values=self._long_data_columns["value_column"], ).reset_index() # Remove the name - df_wide.columns.name = None + df_.columns.name = None # Reset the index to make the time column a regular column return FlascDataFrame( - df_wide, + df_, channel_name_map=self.channel_name_map, long_data_columns=self._long_data_columns, ) - def _convert_wide_to_long(self): + def _convert_wide_to_long(self, df_): """Convert a wide format DataFrame to a long format DataFrame. + Args: + df_ (FlascDataFrame): Wide format FlascDataFrame + Returns: FlascDataFrame: Long format FlascDataFrame """ - df_long = self.melt( + df_ = df_.melt( id_vars="time", var_name=self._long_data_columns["variable_column"], value_name=self._long_data_columns["value_column"], ).sort_values(["time", self._long_data_columns["variable_column"]]) - if self.channel_name_map is not None: - df_long[self._long_data_columns["variable_column"]] = df_long[ - self._long_data_columns["variable_column"] - ].map(self._channel_name_map_to_user) - # Reset index for cleanliness - df_long = df_long.reset_index(drop=True) + df_ = df_.reset_index(drop=True) return FlascDataFrame( - df_long, + df_, channel_name_map=self.channel_name_map, long_data_columns=self._long_data_columns, ) diff --git a/tests/flasc_dataframe_test.py b/tests/flasc_dataframe_test.py index 028f18eb..25e880c0 100644 --- a/tests/flasc_dataframe_test.py +++ b/tests/flasc_dataframe_test.py @@ -15,6 +15,7 @@ } test_channel_name_map = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001", "T2WS": "ws_001"} +test_channel_name_map_incomplete = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001"} test_long_columns = {"variable_column": "variable", "value_column": "value"} @@ -24,6 +25,12 @@ "value": [0, 50, 8, 9, 100, 150, 8, 9, 200, 250, 8, 9], } +test_long_dict_incomplete = { + "time": [0, 0, 0, 0, 10, 10, 10, 10, 20, 20, 20, 20], + "variable": ["T1PWR", "T2PWR", "T1WS", "ws_001"] * 3, + "value": [0, 50, 8, 9, 100, 150, 8, 9, 200, 250, 8, 9], +} + test_wide_user_dict = { "time": [0, 10, 20], "T1PWR": [0, 100, 200], @@ -32,6 +39,31 @@ "T2WS": [9, 9, 9], } +test_wide_user_dict_incomplete = { + "time": [0, 10, 20], + "T1PWR": [0, 100, 200], + "T1WS": [8, 8, 8], + "T2PWR": [50, 150, 250], + "ws_001": [9, 9, 9], +} + + +def assert_equal_except_row_col_order(df1, df2): + # Sort the columns + df_1_c = df1.sort_index(axis=1) + df_2_c = df2.sort_index(axis=1) + + # If "variable" is a column, sort by ['time', 'variable'] + if "variable" in df_1_c.columns: + df_1_c = df_1_c.sort_values(by=["time", "variable"]).reset_index(drop=True) + df_2_c = df_2_c.sort_values(by=["time", "variable"]).reset_index(drop=True) + + else: + df_1_c = df_1_c.sort_values(by=["time"]).reset_index(drop=True) + df_2_c = df_2_c.sort_values(by=["time"]).reset_index(drop=True) + + pd.testing.assert_frame_equal(df_1_c, df_2_c) + def test_type(): df = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) @@ -94,6 +126,20 @@ def test_convert_flasc_wide_to_user_wide(): pd.testing.assert_frame_equal(df_wide_flasc.convert_to_user_format(), df_wide_user) +def test_convert_flasc_wide_to_user_wide_incomplete(): + # Test incomplete channel name map + df_wide_flasc_incomplete = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map_incomplete + ) + df_wide_user_incomplete = FlascDataFrame( + test_wide_user_dict_incomplete, channel_name_map=test_channel_name_map_incomplete + ) + + pd.testing.assert_frame_equal( + df_wide_flasc_incomplete.convert_to_user_format(), df_wide_user_incomplete + ) + + def test_convert_user_wide_to_flasc_wide(): df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) df_wide_user = FlascDataFrame(test_wide_user_dict, channel_name_map=test_channel_name_map) @@ -101,6 +147,20 @@ def test_convert_user_wide_to_flasc_wide(): pd.testing.assert_frame_equal(df_wide_user.convert_to_flasc_format(), df_wide_flasc) +def test_convert_user_wide_to_flasc_wide_incomplete(): + # Test incomplete channel name map + df_wide_flasc_incomplete = FlascDataFrame( + test_wide_dict, channel_name_map=test_channel_name_map_incomplete + ) + df_wide_user_incomplete = FlascDataFrame( + test_wide_user_dict_incomplete, channel_name_map=test_channel_name_map_incomplete + ) + + pd.testing.assert_frame_equal( + df_wide_user_incomplete.convert_to_flasc_format(), df_wide_flasc_incomplete + ) + + def test_convert_flasc_wide_in_place(): df_wide_flasc = FlascDataFrame(test_wide_dict, channel_name_map=test_channel_name_map) df_wide_user = FlascDataFrame(test_wide_user_dict, channel_name_map=test_channel_name_map) @@ -148,7 +208,25 @@ def test_convert_flasc_to_user_long(): test_long_dict, channel_name_map=test_channel_name_map, long_data_columns=test_long_columns ) - pd.testing.assert_frame_equal(df_wide_flasc.convert_to_user_format(), df_long) + assert_equal_except_row_col_order(df_wide_flasc.convert_to_user_format(), df_long) + + +def test_convert_flasc_to_user_long_incomplete(): + # Test incomplete channel name map + df_wide_flasc_incomplete = FlascDataFrame( + test_wide_dict, + channel_name_map=test_channel_name_map_incomplete, + long_data_columns=test_long_columns, + ) + df_long_incomplete = FlascDataFrame( + test_long_dict_incomplete, + channel_name_map=test_channel_name_map_incomplete, + long_data_columns=test_long_columns, + ) + + assert_equal_except_row_col_order( + df_wide_flasc_incomplete.convert_to_user_format(), df_long_incomplete + ) def test_convert_user_long_to_flasc(): @@ -160,8 +238,25 @@ def test_convert_user_long_to_flasc(): ) # Note that the column order is different so fix that - pd.testing.assert_frame_equal( - df_long.convert_to_flasc_format()[df_wide_flasc.columns], df_wide_flasc + assert_equal_except_row_col_order(df_long.convert_to_flasc_format(), df_wide_flasc) + + +def test_convert_user_long_to_flasc_incomplete(): + # Test incomplete channel name map + df_wide_flasc_incomplete = FlascDataFrame( + test_wide_dict, + channel_name_map=test_channel_name_map_incomplete, + long_data_columns=test_long_columns, + ) + df_long_incomplete = FlascDataFrame( + test_long_dict_incomplete, + channel_name_map=test_channel_name_map_incomplete, + long_data_columns=test_long_columns, + ) + + # Note that the column order is different so fix that + assert_equal_except_row_col_order( + df_long_incomplete.convert_to_flasc_format(), df_wide_flasc_incomplete ) @@ -174,7 +269,7 @@ def test_convert_flasc_long_in_place(): ) df_wide_flasc.convert_to_user_format(inplace=True) - pd.testing.assert_frame_equal(df_wide_flasc, df_long) + assert_equal_except_row_col_order(df_wide_flasc, df_long) def test_convert_user_long_in_place(): @@ -186,7 +281,7 @@ def test_convert_user_long_in_place(): ) df_long.convert_to_flasc_format(inplace=True) - pd.testing.assert_frame_equal(df_long[df_wide_flasc.columns], df_wide_flasc) + assert_equal_except_row_col_order(df_long, df_wide_flasc) def test_convert_flasc_long_back_and_forth(): @@ -198,7 +293,7 @@ def test_convert_flasc_long_back_and_forth(): df_wide_flasc.convert_to_user_format(inplace=True) df_wide_flasc.convert_to_flasc_format(inplace=True) - pd.testing.assert_frame_equal(df_wide_flasc[df_wide_flasc_copy.columns], df_wide_flasc_copy) + assert_equal_except_row_col_order(df_wide_flasc, df_wide_flasc_copy) def test_pickle():