Merge pull request #809 from VisLab/develop

Added initial version of the sequence map
hed-standard · Dec 14, 2023 · c25cf46 · c25cf46
2 parents 18626f2 + 7ba9438
commit c25cf46
Show file tree

Hide file tree

Showing 4 changed files with 175 additions and 3 deletions.
diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py
@@ -59,7 +59,9 @@ def make_template(self, additional_cols=None, show_counts=True):
 
         Parameters:
             additional_cols (list or None): Optional list of additional columns to append to the returned dataframe.
-            show_counts (bool): If true, number of times each key combination appears is in first column
+            show_counts (bool): If True, the number of times each key combination appears is in the first column
+                                and the rows are sorted in descending order by key_counts.
+            
         Returns:
             DataFrame:  A dataframe containing the template.
 
@@ -79,6 +81,7 @@ def make_template(self, additional_cols=None, show_counts=True):
             df[additional_cols] = 'n/a'
         if show_counts:
             df.insert(0, 'key_counts', self._get_counts())
+            df.sort_values(by=['key_counts'], inplace=True, ignore_index=True, ascending=False)
         return df
 
     def _get_counts(self):
@@ -142,7 +145,7 @@ def resort(self):
         for index, row in self.col_map.iterrows():
             key_hash = get_row_hash(row, self.key_cols)
             self.map_dict[key_hash] = index
-
+            
     def update(self, data, allow_missing=True):
         """ Update the existing map with information from data.
 

diff --git a/hed/tools/analysis/sequence_map.py b/hed/tools/analysis/sequence_map.py
@@ -0,0 +1,118 @@
+""" A map containing the number of times a particular sequence of values appears in a column of an event file. """
+
+
+import pandas as pd
+from hed.tools.util.data_util import get_key_hash
+
+
+class SequenceMap:
+    """ A map of the unique sequences of column values of a particular length that appear in an event file.
+
+    Attributes:
+        
+        name (str):       An optional name of this remap for identification purposes.
+
+    Notes: This mapping converts all columns in the mapping to strings.
+    The remapping does not support other types of columns.
+
+    """
+    def __init__(self, codes=None, name=''):
+        """ Information for setting up the maps.
+
+        Parameters:
+            codes (list or None): If None use all codes, otherwise only include listed codes in the map.
+            name (str):          Name associated with this remap (usually a pathname of the events file).
+
+        """
+
+        self.codes = codes
+        self.name = name
+        self.node_counts = {}
+        self.edges = {}     # map of keys to n-element sequences
+        self.edge_counts = {}  # Keeps a running count of the number of times a key appears in the data
+
+    @property
+
+    def __str__(self):
+        node_counts = [f"{value}({str(count)})" for value, count in self.node_counts.items()]
+        node_str = (" ").join(node_counts)
+        return node_str
+        # temp_list = [f"{self.name} counts for key [{str(self.key_cols)}]:"]
+        # for index, row in self.col_map.iterrows():
+        #     key_hash = get_row_hash(row, self.columns)
+        #     temp_list.append(f"{str(list(row.values))}:\t{self.count_dict[key_hash]}")
+        # return "\n".join(temp_list)
+
+    def dot_str(self, group_spec={}):
+        base = 'digraph g { \n'
+        node_list = [f"{node};" for node in self.codes if node not in self.node_counts]
+        if node_list:
+            base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n"
+        if group_spec:
+            for group, spec in group_spec.items():
+                group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
+                if group_list:
+                    spec_color = spec["color"]
+                    if spec_color[0] == '#':
+                        spec_color = f'"{spec_color}"'
+                    base = base + 'subgraph cluster_' + group + '{\n' + f'bgcolor={spec_color};\n' + \
+                           '\n'.join(group_list) + '\n}\n'
+        edge_list = [f"{value[0]} -> {value[1]} [label={str(self.edge_counts[key])}];" 
+                     for key, value in self.edges.items()]
+        dot_str = base +  ("\n").join(edge_list) + "}\n"
+        return dot_str
+
+    # def resort(self):
+    #     """ Sort the col_map in place by the key columns. """
+    #     self.col_map.sort_values(by=self.key_cols, inplace=True, ignore_index=True)
+    #     for index, row in self.col_map.iterrows():
+    #         key_hash = get_row_hash(row, self.key_cols)
+    #         self.map_dict[key_hash] = index
+
+    def update(self, data):
+        """ Update the existing map with information from data.
+
+        Parameters:
+            data (Series):     Series of column values; it is converted to strings with quotes removed by prep.
+
+        Notes:
+            - If self.codes is set, only values that appear in self.codes are retained.
+            - Updates node_counts for each retained value and edges/edge_counts for two-element sequences.
+
+        """
+        filtered = self.prep(data)
+        if self.codes:
+            mask = filtered.isin(self.codes)
+            filtered = filtered[mask]
+        for index, value in filtered.items():
+            if value not in self.node_counts:
+                self.node_counts[value] = 1
+            else:
+                self.node_counts[value] = self.node_counts[value] + 1
+            if index + 1 >= len(filtered):
+                break
+            key_list = filtered[index:index+2].tolist()
+            key = get_key_hash(key_list)
+            if key in self.edges:
+                self.edge_counts[key] = self.edge_counts[key] + 1
+            else:
+                self.edges[key] = key_list
+                self.edge_counts[key] = 1
+
+    @staticmethod
+    def prep(data):
+        """ Convert the values to strings and remove single and double quotes.
+
+        Parameters:
+            data (Series):   Series to process by converting to strings and removing quotes.
+
+        Returns: Series
+        Notes:
+            - The input is not modified; a new Series is returned.
+        """
+
+        filtered = data.astype(str)
+        filtered.fillna('n/a').astype(str)
+        filtered = filtered.str.replace('"', '')
+        filtered = filtered.str.replace("'", "")
+        return filtered
diff --git a/tests/tools/analysis/test_key_map.py b/tests/tools/analysis/test_key_map.py
@@ -59,8 +59,9 @@ def test_make_template(self):
         df1 = t_map.make_template(show_counts=False)
         self.assertIsInstance(df1, pd.DataFrame, "make_template should return a DataFrame")
         self.assertEqual(len(df1.columns), 1, "make_template should return 1 column single key, no additional columns")
-        df2 = t_map.make_template()
+        df2 = t_map.make_template(show_counts=True)
         self.assertEqual(len(df2.columns), 2, "make_template returns an extra column for counts")
+
         t_map2 = KeyMap(['event_type', 'type'])
         t_map2.update(self.stern_test1_path)
         df3 = t_map2.make_template()

diff --git a/tests/tools/analysis/test_sequence_map.py b/tests/tools/analysis/test_sequence_map.py
@@ -0,0 +1,50 @@
+import unittest
+import os
+import pandas as pd
+from hed.errors.exceptions import HedFileError
+from hed.tools.analysis.sequence_map import SequenceMap
+from hed.tools.util.data_util import get_new_dataframe
+from hed.tools.util.io_util import get_file_list
+
+
+class Test(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # curation_base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data/remodel_tests')
+        base_path = ''
+        cls.events_path = os.path.realpath(base_path + '/sub-01/ses-01/eeg/sub-01_ses-01_task-DriveRandomSound_run-1_events.tsv')
+
+
+    def test_constructor(self):
+        codes1 = ['1111', '1112', '1121', '1122', '1131', '1132', '1141', 
+                 '1142', '1311', '1312', '1321', '1322',  
+                  '4210', '4220', '4230', '4311', '4312']
+
+        smap1 = SequenceMap(codes=codes1)
+        self.assertIsInstance(smap1, SequenceMap)
+        # df = get_new_dataframe(self.events_path)
+        # data = df['value']
+        # smap1.update(data)
+        # #print(f"{smap1.__str__}")
+        # print("to here")
+
+    def test_update(self):
+        codes1 = ['1111', '1121', '1131', '1141', '1311', '1321',
+                  '4210', '4220', '4230', '4311']
+        codes1 = ['1111', '1121', '1131', '1141', '1311', '4311']
+        #codes1 = ['1111', '1121', '1131', '1141', '1311']
+        smap1 = SequenceMap(codes=codes1)
+        self.assertIsInstance(smap1, SequenceMap)
+        # df = get_new_dataframe(self.events_path)
+        # data = df['value']
+        # smap1.update(data)
+        # print(f"{smap1.dot_str()}")
+        # group_spec = {"stimulus": {"color": "#FFAAAA", "nodes": ["1111", "1121", "1131", "1141", "1311"]}}
+        # print(f"{smap1.dot_str(group_spec=group_spec)}")
+        # 
+    def test_str(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()