Misc improvements (#180)
olejandro authored Feb 13, 2024
1 parent 8d066d0 commit 3720c7e
Showing 4 changed files with 61 additions and 69 deletions.
4 changes: 2 additions & 2 deletions xl2times/config/times_mapping.txt
@@ -2,7 +2,7 @@ ALL_REG[ALL_REG] = AllRegions(Region)
 ALL_TS[ALL_TS] = TimeSlices(TS)
 B[DATAYEAR,VALUE] = TimePeriods(Year,B)
 COM[COM] = Commodities(Commodity)
-COM_DESC[REG,COM,TEXT] = Commodities(Region,Commodity,CommDesc)
+COM_DESC[REG,COM,TEXT] = Commodities(Region,Commodity,Description)
 COM_GMAP[REG,COM_GRP,COM] = CommodityGroupMap(Region,CommodityGroup,Commodity)
 COM_GRP[COM_GRP] = CommodityGroups(CommodityGroup)
 COM_LIM[REG,COM,BD] = Commodities(Region,Commodity,LimType)
@@ -22,7 +22,7 @@ NRG_TMAP[REG,NRG_TYPE,COM] = Commodities(Region,Ctype,Commodity)
 PASTYEAR[DATAYEAR,TEXT] = PastYears(Year,Year)
 PRC[PRC] = Processes(Process)
 PRC_ACTUNT[REG,PRC,COM_GRP,UNITS] = Processes(Region,Process,PrimaryCG,Tact)
-PRC_DESC[REG,PRC,TEXT] = Processes(Region,Process,TechDesc)
+PRC_DESC[REG,PRC,TEXT] = Processes(Region,Process,Description)
 PRC_DSCNCAP[REG,PRC] = Attributes(Region,Process, Attribute:NCAP_DISC)
 PRC_MAP[REG,PRC_GRP,PRC] = Processes(Region,Sets,Process)
 PRC_TSL[REG,PRC,TSLVL] = Processes(Region,Process,Tslvl)
4 changes: 2 additions & 2 deletions xl2times/config/veda-tags.json
@@ -82,7 +82,7 @@
       "aliases": [
         "description"
       ],
-      "use_name": "commdesc",
+      "use_name": "description",
       "row_ignore_symbol": [
         "\\I:",
         "*"
@@ -241,7 +241,7 @@
       "aliases": [
         "description"
       ],
-      "use_name": "techdesc",
+      "use_name": "description",
       "row_ignore_symbol": [
         "\\I:",
         "*"
111 changes: 54 additions & 57 deletions xl2times/transforms.py
@@ -306,7 +306,7 @@ def process_flexible_import_tables(
         ),
         "region": model.internal_regions,
         "currency": utils.single_column(tables, datatypes.Tag.currencies, "currency"),
-        "other_indexes": {"INPUT", "OUTPUT"},
+        "other_indexes": {"INPUT", "OUTPUT", "DEMO", "DEMI"},
     }
 
     def get_colname(value):
@@ -1220,7 +1220,7 @@ def expand_pcg_from_suffix(df):
         "sets",
         "region",
         "process",
-        "techdesc",
+        "description",
         "tact",
         "tcap",
         "tslvl",
@@ -1478,7 +1478,7 @@ def generate_dummy_processes(
 
     process_declarations = pd.DataFrame(
         dummy_processes,
-        columns=["sets", "process", "techdesc", "tact", "tcap", "primarycg"],
+        columns=["sets", "process", "description", "tact", "tcap", "primarycg"],
     )
 
     tables.append(
@@ -1492,7 +1492,7 @@
         )
     )
 
-    process_data_specs = process_declarations[["process", "techdesc"]].copy()
+    process_data_specs = process_declarations[["process", "description"]].copy()
     # Use this as default activity cost for dummy processes
     # TODO: Should this be included in settings instead?
     process_data_specs["ACTCOST"] = 1111
@@ -1830,7 +1830,7 @@ def get_matching_processes(row, dictionary):
     ]:
         if row[col] is not None:
             matching_processes = intersect(
-                matching_processes, filter_by_pattern(dictionary[key], row[col])
+                matching_processes, filter_by_pattern(dictionary[key], row[col].upper())
             )
     if matching_processes is not None and any(matching_processes.duplicated()):
         raise ValueError("duplicated")
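
Note: the .upper() calls here and in get_matching_commodities below make wildcard matching case-insensitive: the lookup tables are now indexed in upper case by df_indexed_by_col (added further down in this file), so patterns must be uppercased to match. A minimal sketch of the mechanism, using an invented lookup table rather than the repo's own data:

import pandas as pd

# Hypothetical lookup table whose index is uppercased, as df_indexed_by_col now does
processes = pd.DataFrame(
    {"process": ["coal_pp1", "coal_pp2"]}, index=["COAL_PP1", "COAL_PP2"]
).rename_axis("index")

# A lower-case user pattern still matches once it is uppercased as well
regexp = "^" + "coal_*".upper().replace("*", ".*") + "$"  # "^COAL_.*$"
print(processes.filter(regex=regexp, axis="index"))  # returns both rows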
@@ -1846,70 +1846,67 @@ def get_matching_commodities(row, dictionary):
     ]:
         if row[col] is not None:
             matching_commodities = intersect(
-                matching_commodities, filter_by_pattern(dictionary[key], row[col])
+                matching_commodities,
+                filter_by_pattern(dictionary[key], row[col].upper()),
             )
     return matching_commodities
 
 
+def df_indexed_by_col(df, col):
+    # Set df index using an existing column; make the index uppercase
+    df = df.dropna().drop_duplicates()
+    index = df[col].str.upper()
+    df = df.set_index(index).rename_axis("index")
+
+    if len(df.columns) > 1:
+        df = df.drop(columns=col)
+    return df
+
+
 def generate_topology_dictionary(
     tables: Dict[str, DataFrame], model: datatypes.TimesModel
 ) -> Dict[str, DataFrame]:
     # We need to be able to fetch processes based on any combination of name, description, set, comm-in, or comm-out
     # So we construct tables whose indices are names, etc. and use pd.filter
 
     dictionary = dict()
+    pros = model.processes
+    coms = model.commodities
+    pros_and_coms = tables[datatypes.Tag.fi_t]
 
-    dictionary["processes_by_name"] = (
-        model.processes[["process"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("process", drop=False)
-        .rename_axis("index")
-    )
-    dictionary["processes_by_desc"] = (
-        model.processes[["process", "techdesc"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("techdesc")
-    )
-    dictionary["processes_by_sets"] = (
-        model.processes[["process", "sets"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("sets")
-    )
-    processes_and_commodities = tables[datatypes.Tag.fi_t]
-    dictionary["processes_by_comm_in"] = (
-        processes_and_commodities[["process", "commodity-in"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("commodity-in")
-    )
-    dictionary["processes_by_comm_out"] = (
-        processes_and_commodities[["process", "commodity-out"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("commodity-out")
-    )
-    dictionary["commodities_by_name"] = (
-        model.commodities[["commodity"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("commodity", drop=False)
-        .rename_axis("index")
-    )
-    dictionary["commodities_by_desc"] = (
-        model.commodities[["commodity", "commdesc"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("commdesc")
-    )
-    dictionary["commodities_by_sets"] = (
-        model.commodities[["commodity", "csets"]]
-        .dropna()
-        .drop_duplicates()
-        .set_index("csets")
-    )
+    dict_info = [
+        {"key": "processes_by_name", "df": pros[["process"]], "col": "process"},
+        {
+            "key": "processes_by_desc",
+            "df": pros[["process", "description"]],
+            "col": "description",
+        },
+        {"key": "processes_by_sets", "df": pros[["process", "sets"]], "col": "sets"},
+        {
+            "key": "processes_by_comm_in",
+            "df": pros_and_coms[["process", "commodity-in"]],
+            "col": "commodity-in",
+        },
+        {
+            "key": "processes_by_comm_out",
+            "df": pros_and_coms[["process", "commodity-out"]],
+            "col": "commodity-out",
+        },
+        {"key": "commodities_by_name", "df": coms[["commodity"]], "col": "commodity"},
+        {
+            "key": "commodities_by_desc",
+            "df": coms[["commodity", "description"]],
+            "col": "description",
+        },
+        {
+            "key": "commodities_by_sets",
+            "df": coms[["commodity", "csets"]],
+            "col": "csets",
+        },
+    ]
+
+    for entry in dict_info:
+        dictionary[entry["key"]] = df_indexed_by_col(entry["df"], entry["col"])
 
     return dictionary

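Note: the refactor above collapses eight near-identical construction blocks into one helper plus a data-driven loop, uppercasing every index on the way in. A toy example of what df_indexed_by_col produces; the frame below is invented for illustration:

import pandas as pd
from xl2times.transforms import df_indexed_by_col  # the helper added above

df = pd.DataFrame(
    {"process": ["coal_pp", "gas_pp"], "description": ["Coal plant", "Gas plant"]}
)
print(df_indexed_by_col(df, "description"))
#             process
# index
# COAL PLANT  coal_pp
# GAS PLANT    gas_pp

A single-column frame (e.g. the one behind processes_by_name) keeps its only column, since the helper drops the index column only when other columns remain.
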
11 changes: 3 additions & 8 deletions xl2times/utils.py
@@ -186,22 +186,17 @@ def create_regexp(pattern):
     pattern = remove_negative_patterns(pattern)
     if len(pattern) == 0:
         return re.compile(pattern)  # matches everything
-    # escape special characters
-    # Backslash must come first
-    special = "\\.|^$+()[]{}"
-    for c in special:
-        pattern = pattern.replace(c, "\\" + c)
     # Handle VEDA wildcards
-    pattern = pattern.replace("*", ".*").replace("?", ".").replace(",", "|")
+    pattern = pattern.replace("*", ".*").replace("?", ".").replace(",", r"$|^")
     # Do not match substrings
-    pattern = "^" + pattern + "$"
+    pattern = rf"^{pattern}$"
     return re.compile(pattern)
 
 
 def create_negative_regexp(pattern):
     pattern = remove_positive_patterns(pattern)
     if len(pattern) == 0:
-        pattern = "^$"  # matches nothing
+        pattern = r"^$"  # matches nothing
     return create_regexp(pattern)


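Note on the comma handling in create_regexp: commas separate alternates in VEDA patterns, and "|" has the lowest precedence in a regex, so with the old "," -> "|" replacement the leading "^" bound only the first alternate and the trailing "$" only the last; other alternates could match substrings under the re.search semantics applied downstream (e.g. DataFrame.filter). Replacing "," with r"$|^" anchors every alternate individually. A small self-contained check, with patterns invented for illustration:

import re

def to_regexp(pattern, comma_repl):
    # Mimics create_regexp's wildcard translation with a pluggable comma replacement
    pattern = pattern.replace("*", ".*").replace("?", ".").replace(",", comma_repl)
    return re.compile("^" + pattern + "$")

old = to_regexp("COAL*,GAS*", "|")     # ^COAL.*|GAS.*$  -- '|' splits the anchors
new = to_regexp("COAL*,GAS*", r"$|^")  # ^COAL.*$|^GAS.*$ -- each alternate anchored

print(bool(old.search("XXGASXX")))   # True: 'GAS.*$' matches mid-string
print(bool(new.search("XXGASXX")))   # False
print(bool(new.search("GAS_CCGT")))  # True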