Skip to content

Commit

Permalink
Remove Spacy completely from negation #39 #11
Browse files Browse the repository at this point in the history
  • Loading branch information
woodthom2 committed Jul 19, 2024
1 parent cda1335 commit 1fd9b12
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 70 deletions.
162 changes: 94 additions & 68 deletions src/harmony/matching/negator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,118 +25,124 @@
'''

import spacy
import re

nlp = spacy.blank("en")
re_word = re.compile(r'(?i)(\S+)')


def get_change_en(doc) -> dict:
def tokenise(text):
    """
    Split a string into whitespace-delimited tokens.
    :param text: the string to split.
    :return: a list of regex match objects, one per run of non-whitespace characters, in order of
    appearance. Match objects (rather than plain strings) are returned so that callers can read each
    token's start/end offsets in the original text.
    """
    return list(re_word.finditer(text))


# Adverbs of frequency/degree which are swapped for "never" to flip the polarity of a sentence.
_EN_FREQUENCY_WORDS = frozenset({
    "always", "rather", "really", "very", "totally", "utterly", "absolutely", "completely",
    "frequently", "often", "sometimes", "generally", "usually",
})

# Team Cheemu: negative contractions whose positive form is not simply the text before "n't".
_EN_IRREGULAR_CONTRACTIONS = {"can't": "can", "won't": "will", "shan't": "shall", "cannot": "can"}

# Negation words that are deleted outright to make the sentence positive.
_EN_NEGATION_WORDS = frozenset({"never", "not", "don't"})

# Auxiliaries that may carry a regular "n't" contraction ("isn't" -> "is", "doesn't" -> "does").
_EN_CONTRACTION_STEMS = frozenset({
    "is", "are", "was", "were", "has", "have", "had", "does", "did",
    "would", "could", "should", "might", "must", "need",
})

# Verbs after which "not" is inserted when the sentence contains no explicit negation or adverb.
_EN_AUXILIARIES = frozenset({"is", "are", "am", "was", "were", "has", "have", "had"})


def get_change_en(token_texts_lower: list) -> dict:
    """
    Identify how to change an English sentence from positive to negative or vice versa.
    :param token_texts_lower: the lower-cased, whitespace-delimited tokens of the sentence, in order.
    :return: a dictionary mapping a token index to an ``(operation, text)`` tuple, where the operation
    is one of "replace", "insert_after" or "insert_before". When nothing in the sentence is
    recognised, the fallback is to insert "never" before the first token.
    """
    for token_idx, raw_token in enumerate(token_texts_lower):
        # Treat the typographic apostrophe like the ASCII one so that "can’t" is recognised too.
        token = raw_token.replace("\u2019", "'")
        if token in _EN_FREQUENCY_WORDS:
            return {token_idx: ("replace", "never")}
        if token in _EN_IRREGULAR_CONTRACTIONS:
            return {token_idx: ("replace", _EN_IRREGULAR_CONTRACTIONS[token])}
        if token in _EN_NEGATION_WORDS:
            return {token_idx: ("replace", "")}
        # Regular contractions arrive as a single token, so strip the "n't" suffix here;
        # otherwise e.g. "isn't" would drop through to the "never" fallback below.
        if token.endswith("n't") and token[:-3] in _EN_CONTRACTION_STEMS:
            return {token_idx: ("replace", token[:-3])}

    # No negation or frequency adverb found: negate every be/have auxiliary with "not".
    result = {
        token_idx: ("insert_after", "not")
        for token_idx, token in enumerate(token_texts_lower)
        if token in _EN_AUXILIARIES
    }
    if result:
        return result
    return {0: ("insert_before", "never")}


def get_change_pt(doc) -> dict:
def get_change_pt(token_texts_lower: list) -> dict:
    """
    Identify how to change a Portuguese sentence from positive to negative or vice versa.
    :param token_texts_lower: the lower-cased, whitespace-delimited tokens of the sentence, in order.
    :return: a dictionary mapping a token index to an ``(operation, text)`` tuple. The first
    frequency/degree adverb found is replaced by "nunca", or the first negation word found is removed;
    if neither occurs, "não" is inserted before the first token.
    """
    frequency_words = {"sempre", "bastante", "realmente", "muito", "totalmente", "absolutamente",
                       "completamente", "frequentemente", "vezes", "geralmente"}
    negation_words = {"nunca", "jamais", "nem", "não"}

    for token_idx, token_text_lower in enumerate(token_texts_lower):
        if token_text_lower in frequency_words:
            return {token_idx: ("replace", "nunca")}
        if token_text_lower in negation_words:
            return {token_idx: ("replace", "")}
    return {0: ("insert_before", "não")}


def get_change_es(doc) -> dict:
def get_change_es(token_texts_lower: list) -> dict:
    """
    # Team Cheemu: Identify how to change a Spanish sentence from positive to negative or vice versa.
    :param token_texts_lower: the lower-cased, whitespace-delimited tokens of the sentence, in order.
    :return: a dictionary mapping a token index to an ``(operation, text)`` tuple. The first
    frequency/degree adverb found is replaced by "nunca", or the first negation word found is removed;
    if neither occurs, "no" is inserted before the first token.
    """
    # "frequentemente" (with "qu") is kept alongside "frecuentemente" so that inputs which matched
    # before still match.
    frequency_words = {"siempre", "bastante", "realmente", "muy", "mucho", "totalmente",
                       "absolutamente", "completamente", "frecuentemente", "frequentemente", "veces"}
    negation_words = {"nunca", "jamás", "ni", "no"}

    for token_idx, token_text_lower in enumerate(token_texts_lower):
        if token_text_lower in frequency_words:
            return {token_idx: ("replace", "nunca")}
        if token_text_lower in negation_words:
            return {token_idx: ("replace", "")}
    return {0: ("insert_before", "no")}


def get_change_it(doc) -> dict:
# Each entry is stored as a list of words so that multi-word expressions such as "a volte" can be
# matched against consecutive tokens (a single whitespace-delimited token never contains a space).
_IT_FREQUENCY_PHRASES = [phrase.split() for phrase in (
    "sempre", "abbastanza", "realmente", "davvero", "veramente", "molto", "molta", "molti",
    "molte", "totalmente", "assolutamente", "completamente",
    "frequentemente", "qualche volta", "a volte", "ogni tanto",
)]

_IT_NEGATION_WORDS = frozenset({"mai", "né", "non", "nessuno", "nulla", "niente"})

_IT_VERB_PHRASES = [phrase.split() for phrase in (
    "è", "sono", "ero", "erano", "avevano", "avevo", "ho avuto", "sono stato", "sono stata",
    "sono stati", "siamo stati", "sono state",
)]


def _it_phrase_length(tokens, start, phrases):
    """Return the number of tokens in the longest phrase that begins at tokens[start], or 0 if none does."""
    longest = 0
    for words in phrases:
        size = len(words)
        if size > longest and list(tokens[start:start + size]) == words:
            longest = size
    return longest


def get_change_it(token_texts_lower: list) -> dict:
    """
    # Team Cheemu: Identify how to change an Italian sentence from positive to negative or vice versa.
    :param token_texts_lower: the lower-cased, whitespace-delimited tokens of the sentence, in order.
    :return: a dictionary mapping a token index to an ``(operation, text)`` tuple. The first
    frequency/degree expression found is replaced by "mai" (for a multi-word expression the first
    word becomes "mai" and the remaining words are replaced by the empty string), or the first
    negation word found is removed. Otherwise "non" is inserted before every recognised verb form,
    or before the first token if there is none.
    """
    for token_idx, token_text_lower in enumerate(token_texts_lower):
        phrase_length = _it_phrase_length(token_texts_lower, token_idx, _IT_FREQUENCY_PHRASES)
        if phrase_length:
            changes = {token_idx: ("replace", "mai")}
            # Blank out the remaining words of a multi-word expression.
            for extra_idx in range(token_idx + 1, token_idx + phrase_length):
                changes[extra_idx] = ("replace", "")
            return changes
        if token_text_lower in _IT_NEGATION_WORDS:
            return {token_idx: ("replace", "")}

    # No negation or frequency expression found: put "non" in front of each recognised verb form.
    result = {
        token_idx: ("insert_before", "non")
        for token_idx in range(len(token_texts_lower))
        if _it_phrase_length(token_texts_lower, token_idx, _IT_VERB_PHRASES)
    }
    if result:
        return result
    return {0: ("insert_before", "non")}


def get_change_de(doc) -> dict:
def get_change_de(token_texts_lower: list) -> dict:
"""
# Team Cheemu: Identify how to change a German sentence from positive to negative or vice versa.
:param doc:
:return:
"""
for tok in doc:
if tok.text.lower() in {"immer", "ziemlich", "wirklich", "sehr", "viel", "total", "absolut",
for token_idx, token_text_lower in enumerate(token_texts_lower):
if token_text_lower in {"immer", "ziemlich", "wirklich", "sehr", "viel", "total", "absolut",
"vollständig",
"häufig", "manchmal"}:
return {tok.i: ("replace", "nie")}
if tok.text.lower() in {"nie", "niemals", "weder", "nicht"}:
return {tok.i: ("replace", "")}
return {token_idx: ("replace", "nie")}
if token_text_lower in {"nie", "niemals", "weder", "nicht"}:
return {token_idx: ("replace", "")}
result = {}
if len(result) > 0:
return result
Expand All @@ -146,20 +152,20 @@ def get_change_de(doc) -> dict:
# If we had time: add functionality to handle German word order using spaCy


def get_change_fr(doc) -> dict:
def get_change_fr(token_texts_lower: list) -> dict:
"""
# Team Cheemu: Identify how to change a French sentence from positive to negative or vice versa.
:param doc:
:return:
"""
for tok in doc:
if tok.text.lower() in {"toujours", "assez", "vraiment", "très", "beaucoup de", "totalement", "absolumment",
for token_idx, token_text_lower in enumerate(token_texts_lower):
if token_text_lower in {"toujours", "assez", "vraiment", "très", "beaucoup de", "totalement", "absolumment",
"complètement", "plus", "trop de", "plein de",
"souvent", "de temps en temps"}:
return {tok.i: ("replace", "nie")}
if tok.text.lower() in {"personne", "jamais", "ni", "rien", "pas", "non", "ne", "n'", "nulle", "aucun",
return {token_idx: ("replace", "nie")}
if token_text_lower in {"personne", "jamais", "ni", "rien", "pas", "non", "ne", "n'", "nulle", "aucun",
"aucune", "guère"}:
return {tok.i: ("replace", "")}
return {token_idx: ("replace", "")}
result = {}
if len(result) > 0:
return result
Expand All @@ -175,31 +181,51 @@ def negate(text: str, language: str) -> str:
"en" for English, "pt" for Portuguese, "es" for Spanish, "it" for Italian, "de" for German, "fr" for French.
:return: the sentence negated
"""
doc = nlp(text)
tokens = tokenise(text)
token_texts = [token.group() for token in tokens]
token_texts_lower = [token.group().lower() for token in tokens]

if language == "pt":
changes = get_change_pt(doc)
changes = get_change_pt(token_texts_lower)
elif language == "es":
changes = get_change_es(doc)
changes = get_change_es(token_texts_lower)
elif language == "it":
changes = get_change_it(doc)
changes = get_change_it(token_texts_lower)
elif language == "fr":
changes = get_change_fr(doc)
changes = get_change_fr(token_texts_lower)
elif language == "de":
changes = get_change_de(doc)
changes = get_change_de(token_texts_lower)
else:
changes = get_change_en(doc)
changes = get_change_en(token_texts_lower)

text = ""
for tok in doc:
this_token_text = tok.text
if tok.i in changes:
change_operation, change_text = changes[tok.i]
for token_idx, match in reversed(list(enumerate(tokens))):
if token_idx in changes:
change_operation, change_text = changes[token_idx]
if change_operation == "replace":
this_token_text = change_text
prefix = text[:match.start()]
suffix = text[match.end():]
if prefix.endswith(" ") and suffix.startswith(" ") and change_text == "":
prefix = prefix[:-1]
text = prefix + change_text + suffix
elif change_operation == "insert_after":
this_token_text += " " + change_text
prefix = text[:match.end()]
suffix = text[match.end():]
if prefix != "" and not prefix.endswith(" "):
prefix += " "
if suffix != "" and not suffix.startswith(" "):
suffix = " " + suffix
text = prefix + change_text + suffix
elif change_operation == "insert_before":
this_token_text = change_text + " " + this_token_text
text += this_token_text + tok.whitespace_
prefix = text[:match.start()]
suffix = text[match.start():]
if prefix != "" and not prefix.endswith(" "):
prefix += " "
if suffix != "" and not suffix.startswith(" "):
suffix = " " + suffix
text = prefix + change_text + suffix
return text


if __name__ == "__main__":
    # Manual smoke check: print the negation of a sample English sentence.
    text = "I never feel depressed"
    negated_text = negate(text, "en")
    print(negated_text)
4 changes: 2 additions & 2 deletions tests/test_negator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class TestNegation(unittest.TestCase):
def test_simple_example(self):
    # Negating an English sentence that contains "never" should drop that word
    # without leaving a doubled space behind.
    sentence = "I never feel depressed"
    print(negate(sentence, "en"))
    expected = "I feel depressed"
    self.assertEqual(expected, negate(sentence, "en"))

def test_simple_example_neg(self):
text = "I feel depressed"
Expand Down Expand Up @@ -70,7 +70,7 @@ def test_simple_example_es(self):

def test_simple_example_de(self):
    # Negating a German sentence that contains "nicht" should drop that word
    # without leaving a doubled space behind.
    sentence = "Ich fühle mich nicht deprimiert"
    expected = "Ich fühle mich deprimiert"
    self.assertEqual(expected, negate(sentence, "de"))

def test_simple_example_de_neg(self):
text = "Ich fühle mich deprimiert"
Expand Down

0 comments on commit 1fd9b12

Please sign in to comment.