Update function remove_valence

GGNoWayBack · Mar 5, 2024 · 269e284 · 269e284
1 parent c77148c
commit 269e284
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 12 deletions.
diff --git a/cathodedataextractor/nlp/cner.py b/cathodedataextractor/nlp/cner.py
@@ -82,16 +82,34 @@ def normalize(text):
         # Remove valence
         def remove_valence(cem, sign='+', start=0):
             idx = cem.find(sign, start)
-            if idx > -1:
-                window = cem[idx - 1: idx + 2]
-                if window and window.index(sign):
-                    if window[-1] in ['x', 'y', 'z', 'δ']:
-                        return remove_valence(cem, start=idx + 2)
-                    elif window[0].isdigit():
-                        return remove_valence(cem[:idx - 1] + cem[idx + 1:], start=idx - 1)
-                    else:
-                        return remove_valence(cem[:idx] + cem[idx + 1:], start=idx)
-            return cem
+            if idx == -1:
+                oxygen_idx = cem.find("O", start)
+                while oxygen_idx > -1:
+                    valence_l = valence_r = None
+                    for i in range(oxygen_idx + 1, len(cem)):
+                        if valence_l is None and cem[i].isdigit():  # 1
+                            valence_l = i
+                        elif cem[i] == "-":  # 2
+                            valence_r = i
+                        elif cem[i] == " ":  # 1 2 3
+                            continue
+                        elif valence_r is not None and cem[i].isdigit():  # 3
+                            valence_r = i
+                            break
+                        else:
+                            valence_r = None
+                            break
+                    if valence_l and valence_r:
+                        cem = cem[:valence_l] + cem[valence_r:]
+                    oxygen_idx = cem.find("O", oxygen_idx + 1)
+                return cem
+            window = cem[idx - 1: idx + 2]
+            if window[-1] in VAR:
+                return remove_valence(cem, start=idx + 2)
+            elif window[0].isdigit():
+                return remove_valence(cem[:idx - 1] + cem[idx + 1:], start=idx - 1)
+            else:
+                return remove_valence(cem[:idx] + cem[idx + 1:], start=idx)
 
         cem = remove_valence(cem)
 

diff --git a/cathodedataextractor/nlp/modi_cde_nlp.py b/cathodedataextractor/nlp/modi_cde_nlp.py
@@ -6,7 +6,7 @@
 """
 # Split on plus surrounded by any letter or number provided no brackets
 change_cem = cem.SPECIALS.index('^([^\(\)]+\w)\+(\w[^\(\)]+)$')
-cem.SPECIALS[change_cem] = '^([^\(\)]+\w)\+([^xyx][^\(\)]+)'
+cem.SPECIALS[change_cem] = '^([^()]+ \w)\+([^xyx][^()]+)'
 
 """
 nlp/tokenize

diff --git a/cathodedataextractor/parse/regex_functions.py b/cathodedataextractor/parse/regex_functions.py
@@ -6,7 +6,7 @@
 
 OPEN, CLOSE = {'(', '[', '{'}, {')', ']', '}'}
 OP_CL = {'(': ')', '[': ']', '{': '}'}
-MIN_LENGTH = 8  # The threshold determines whether the return value is meaningful
+MIN_LENGTH = 11  # The threshold determines whether the return value is meaningful
 
 
 @lru_cache(None)

diff --git a/cathodedataextractor/parse/regex_pattern.py b/cathodedataextractor/parse/regex_pattern.py
@@ -8,6 +8,8 @@
 
 ATTRIBUTE_PROMPT = ['voltage', 'mAhg-1', 'V', 'capacit', 'mAg-1', 'C', 'Ag-1', 'mAg-1']
 
+VAR = {'x', 'y', 'z', 'δ'}
+
 ELEMENTS = ["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K",
             "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
             "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I",