Skip to content

Commit

Permalink
update func~ remove_valence
Browse files Browse the repository at this point in the history
  • Loading branch information
GGNoWayBack committed Mar 5, 2024
1 parent c77148c commit 269e284
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 12 deletions.
38 changes: 28 additions & 10 deletions cathodedataextractor/nlp/cner.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,34 @@ def normalize(text):
# Remove valence
def remove_valence(cem, sign='+', start=0):
    """Strip valence annotations from a chemical-entity string.

    Two passes are made over ``cem``:

    1. Every occurrence of ``sign`` at or after ``start`` is examined:
       * if the character right after the sign is one of ``VAR``
         (a variable such as ``x``/``y``/``z``/``δ``), the sign is part of
         an expression like ``1+x`` and is kept;
       * otherwise, if the character right before the sign is a digit, both
         the digit and the sign are dropped (``Mn4+`` -> ``Mn``);
       * otherwise only the sign is dropped (``Na+`` -> ``Na``).
    2. From the position where pass 1 stopped, every ``O`` followed by
       ``<digit> - <digit>`` (spaces allowed between the parts) has the
       first digit and the minus removed (``O2-2`` -> ``O2``).

    Args:
        cem: The chemical-entity string to clean.
        sign: Single-character valence sign to look for. It is honoured for
            the whole scan.
        start: Index at which scanning for ``sign`` begins.

    Returns:
        The string with the valence annotations removed.
    """
    pos = start
    while True:
        # An empty sign would match everywhere; treat it as "nothing to do".
        idx = cem.find(sign, pos) if sign else -1
        if idx == -1:
            break
        # A sign at index 0 has no preceding character. Slicing with
        # ``idx - 1`` would wrap around to the end of the string, so the
        # neighbours are read explicitly instead.
        before = cem[idx - 1] if idx > 0 else ''
        after = cem[idx + 1:idx + 2]
        if after and after in VAR:
            # Sign belongs to a variable expression: keep it, move past it.
            pos = idx + 2
        elif before.isdigit():
            # "<digit><sign>": drop both characters.
            cem = cem[:idx - 1] + cem[idx + 1:]
            pos = idx - 1
        else:
            # Bare sign: drop only the sign.
            cem = cem[:idx] + cem[idx + 1:]
            pos = idx

    oxygen_idx = cem.find("O", pos)
    while oxygen_idx > -1:
        span = _oxygen_valence_span(cem, oxygen_idx)
        if span is not None:
            left, right = span
            cem = cem[:left] + cem[right:]
        oxygen_idx = cem.find("O", oxygen_idx + 1)
    return cem


def _oxygen_valence_span(cem, oxygen_idx):
    """Locate a ``<digit> - <digit>`` valence pattern right after an oxygen.

    Returns ``(left, right)`` such that ``cem[:left] + cem[right:]`` removes
    the first digit and the minus while keeping the closing digit, or
    ``None`` when the complete pattern is not present.
    """
    left = None
    seen_minus = False
    for i in range(oxygen_idx + 1, len(cem)):
        ch = cem[i]
        if ch == " ":
            # Spaces may appear between any parts of the pattern.
            continue
        if left is None:
            # The first non-space character must be the valence digit; a
            # minus seen before it would yield left > right and duplicate
            # characters when spliced.
            if not ch.isdigit():
                return None
            left = i
        elif ch == "-":
            seen_minus = True
        elif seen_minus and ch.isdigit():
            return left, i
        else:
            return None
    # End of string reached before the closing digit: not a match.
    return None

cem = remove_valence(cem)

Expand Down
2 changes: 1 addition & 1 deletion cathodedataextractor/nlp/modi_cde_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""
# Split on plus surrounded by any letter or number provided no brackets
change_cem = cem.SPECIALS.index('^([^\(\)]+\w)\+(\w[^\(\)]+)$')
cem.SPECIALS[change_cem] = '^([^\(\)]+\w)\+([^xyx][^\(\)]+)'
cem.SPECIALS[change_cem] = '^([^()]+ \w)\+([^xyx][^()]+)'

"""
nlp/tokenize
Expand Down
2 changes: 1 addition & 1 deletion cathodedataextractor/parse/regex_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

OPEN, CLOSE = {'(', '[', '{'}, {')', ']', '}'}
OP_CL = {'(': ')', '[': ']', '{': '}'}
MIN_LENGTH = 8 # The threshold determines whether the return value is meaningful
MIN_LENGTH = 11 # The threshold determines whether the return value is meaningful


@lru_cache(None)
Expand Down
2 changes: 2 additions & 0 deletions cathodedataextractor/parse/regex_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

ATTRIBUTE_PROMPT = ['voltage', 'mAhg-1', 'V', 'capacit', 'mAg-1', 'C', 'Ag-1', 'mAg-1']

VAR = {'x', 'y', 'z', 'δ'}

ELEMENTS = ["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K",
"Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
"Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I",
Expand Down

0 comments on commit 269e284

Please sign in to comment.