Merge pull request #247 from clulab/kwalcock/ghana

Recover the data from Ghana
clulab · Mar 21, 2024 · 53b25b8 · 53b25b8
2 parents a7df3c5 + da4922d
commit 53b25b8
Show file tree

Hide file tree

Showing 10 changed files with 659 additions and 14 deletions.
diff --git a/belief_pipeline/tpi_main.py b/belief_pipeline/tpi_main.py
@@ -20,15 +20,15 @@ def get_in_and_out() -> Tuple[str, str]:
     belief_model_name: str = "maxaalexeeva/belief-classifier_mturk_unmarked-trigger_bert-base-cased_2023-4-26-0-34"
     sentiment_model_name: str = "hriaz/finetuned_beliefs_sentiment_classifier_experiment1"
     locations_file_name: str = "./belief_pipeline/GH.tsv"    
-    input_file_name: str = "../corpora/ghana-regulations/ghana-regulations.tsv"
-    output_file_name: str = "../corpora/ghana-regulations/ghana-regulations-2.tsv"
+    input_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4.tsv"
+    output_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4a.tsv"
     # input_file_name, output_file_name = get_in_and_out()
     pipeline = Pipeline(
         TpiInputStage(input_file_name),
         [
             TpiResolutionStage(),
             TpiBeliefStage(belief_model_name),
-            # TpiSentimentStage(sentiment_model_name),
+            TpiSentimentStage(sentiment_model_name),
             TpiLocationStage(locations_file_name)
         ],
         PandasOutputStage(output_file_name)

diff --git a/belief_pipeline/vector_main.py b/belief_pipeline/vector_main.py
@@ -15,8 +15,9 @@ def get_in_and_out() -> Tuple[str, str]:
 
 if __name__ == "__main__":
     vector_model_name: str = "all-MiniLM-L6-v2"
-    input_file_name: str = "../corpora/uganda-local/uganda-2.tsv"
-    output_file_name: str = "../corpora/uganda-local/uganda-2-vectors.tsv"
+    input_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4a.tsv"
+    output_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4b.tsv"
+
     # input_file_name, output_file_name = get_in_and_out()
     pipeline = Pipeline(
         VectorInputStage(input_file_name),

diff --git a/...apps/elasticsearch/Step2InputEidos1.scala → ...s/elasticsearch/Step2InputEidos1App.scala b/...apps/elasticsearch/Step2InputEidos1.scala → ...s/elasticsearch/Step2InputEidos1App.scala
@@ -13,7 +13,7 @@ import org.json4s.jackson.JsonMethods
 import java.io.File
 import scala.util.Using
 
-object Step2InputEidos1 extends App with Logging {
+object Step2InputEidos1App extends App with Logging {
   implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
   val contextWindow = 3
   val baseDirectory = "../corpora/uganda-local"

diff --git a/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos1GhanaApp.scala b/src/main/scala/org/clulab/habitus/apps/elasticsearch/Step2InputEidos1GhanaApp.scala
@@ -0,0 +1,144 @@
+package org.clulab.habitus.apps.elasticsearch
+
+import ai.lum.common.FileUtils._
+import org.clulab.habitus.apps.utils.{AttributeCounts, JsonRecord}
+import org.clulab.processors.{Document, Sentence}
+import org.clulab.utils.Sourcer
+import org.clulab.wm.eidos.document.AnnotatedDocument
+import org.clulab.wm.eidos.serialization.jsonld.JLDDeserializer
+import org.clulab.wm.eidoscommon.utils.{FileEditor, FileUtils, Logging, TsvReader, TsvWriter}
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods
+
+import java.io.File
+import scala.util.Using
+
+object Step2InputEidos1GhanaApp extends App with Logging {
+  // Implicit json4s formats required by the extract/extractOpt calls in jsonFileToRecord.
+  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
+  // Number of sentences taken on each side of a sentence when its context is assembled.
+  val contextWindow = 3
+  // TSV file listing the URLs to process; getDatasetUrls reads its first column.
+  val datasetFilename = "../corpora/ghana-elasticsearch/dataset55k.tsv"
+  // Directory that is searched recursively for *.json files.
+  // NOTE(review): this is an absolute path into one user's home directory -- presumably
+  // it has to be adjusted before the app can be run on another machine.
+  val baseDirectory = "/home/kwa/data/Corpora/habitus-project/corpora/multimix"
+  // TSV file that this app writes, one row per sentence.
+  val outputFileName = "../corpora/ghana-elasticsearch/ghana-elasticsearch.tsv"
+  // Shared deserializer used by jsonldFileToAnnotatedDocument.
+  val deserializer = new JLDDeserializer()
+
+  /** Reads the distinct URLs from the first column of the dataset file.
+    *
+    * The first line of the file is dropped (presumably a header -- confirm against the data).
+    *
+    * @return the set of URLs found in `datasetFilename`
+    */
+  def getDatasetUrls(): Set[String] = {
+    // TODO: Also get terms from here instead of from directory names.
+    Using.resource(Sourcer.sourceFromFilename(datasetFilename)) { source =>
+      val tsvReader = new TsvReader()
+      val dataLines = source.getLines.drop(1)
+
+      dataLines.map { line =>
+        // Exactly one column is requested; anything else is a MatchError.
+        tsvReader.readln(line, 1) match {
+          case Array(url) => url
+        }
+      }.toSet
+    }
+  }
+
+  /** Derives the companion file of a json file by switching its extension to "jsonld".
+    *
+    * @param jsonFile the json file
+    * @return the file produced by FileEditor after setExt("jsonld")
+    */
+  def jsonFileToJsonld(jsonFile: File): File = {
+    val fileEditor = FileEditor(jsonFile)
+
+    fileEditor.setExt("jsonld").get
+  }
+
+  /** Reads a json file and builds a minimal record that carries only the document's url.
+    *
+    * @param jsonFile the json file to parse; it must contain a string-valued "url" field
+    * @return a JsonRecord with the url filled in and every other field left empty
+    */
+  def jsonFileToRecord(jsonFile: File): JsonRecord = {
+    val json = FileUtils.getTextFromFile(jsonFile)
+    val jValue = JsonMethods.parse(json)
+    val url = (jValue \ "url").extract[String]
+
+    // Only the url is used by this app.  The remaining fields (presumably title, dateline,
+    // byline, and text) are left empty in order to save space, so they are no longer
+    // extracted at all.  This also means that a file lacking "text" does not fail here.
+    JsonRecord(url, None, None, None, "")
+  }
+
+  /** Deserializes a jsonld file and returns the first annotated document it contains.
+    *
+    * @param jsonldFile the jsonld file to read
+    * @return the head of the deserialized corpus; an empty corpus results in an exception
+    */
+  def jsonldFileToAnnotatedDocument(jsonldFile: File): AnnotatedDocument = {
+    val jsonld = FileUtils.getTextFromFile(jsonldFile)
+
+    deserializer.deserialize(jsonld).head
+  }
+
+  /** Flattens raw text onto a single line that is safe to store in a TSV cell.
+    *
+    * Line breaks, tabs, the unicode line and paragraph separators, and all other
+    * control characters are turned into spaces, and the result is trimmed.
+    *
+    * @param rawText the text as it was sliced from the document
+    * @return the cleaned text
+    */
+  def rawTextToCleanText(rawText: String): String = {
+    // The two-character line break comes first so that it yields one space rather than two.
+    val separators = Seq("\r\n", "\n", "\r", "\t", "\u2028", "\u2029")
+    val withoutSeparators = separators.foldLeft(rawText.trim) { (text, separator) =>
+      text.replace(separator, " ")
+    }
+    // Whatever control characters remain (anything below the space character) are blanked.
+    val withoutControls = withoutSeparators.map { c => if (c < ' ') ' ' else c }
+
+    withoutControls.trim
+  }
+
+  /** Cuts the text of a sentence out of its document and cleans it.
+    *
+    * @param document the document, whose text must be present
+    * @param sentence a sentence of the document; its first start offset and last end
+    *                 offset delimit the span that is taken
+    * @return the span after it has passed through rawTextToCleanText
+    */
+  def getSentenceText(document: Document, sentence: Sentence): String = {
+    val documentText = document.text.get
+    val start = sentence.startOffsets.head
+    val end = sentence.endOffsets.last
+
+    rawTextToCleanText(documentText.slice(start, end))
+  }
+
+  /** Prints the four counts of attributeCounts, followed by an empty string, to the writer.
+    *
+    * @param attributeCounts the counts to print
+    * @param tsvWriter the writer that receives the values
+    */
+  def attributeCountsToTsvWriter(attributeCounts: AttributeCounts, tsvWriter: TsvWriter): Unit = {
+    val increases = attributeCounts.increaseCount.toString
+    val decreases = attributeCounts.decreaseCount.toString
+    val positiveChanges = attributeCounts.posChangeCount.toString
+    val negativeChanges = attributeCounts.negChangeCount.toString
+
+    tsvWriter.print(increases, decreases, positiveChanges, negativeChanges, "")
+  }
+
+  // The URLs that are to be processed, as listed in the dataset file.
+  val datasetUrls: Set[String] = getDatasetUrls()
+  // Every json file under baseDirectory that has a jsonld companion, paired with the
+  // url recorded inside of it.  When several files share a url, only the first is kept.
+  val jsonFilesAndUrls: Seq[(File, String)] = new File(baseDirectory)
+      .listFilesByWildcard("*.json", recursive = true)
+      .toVector
+      .filter(jsonFileToJsonld(_).exists)
+      .map { jsonFile => (jsonFile, jsonFileToRecord(jsonFile).url) }
+      .groupBy { case (_, url) => url }
+      .map { case (_, filesAndUrls) => filesAndUrls.head }
+      .toSeq
+
+  // Writes the output TSV: a header and then, for each dataset URL, one row per sentence
+  // holding the cleaned sentence, its surrounding context, and the previous sentence.
+  Using.resource(FileUtils.printWriterFromFile(outputFileName)) { printWriter =>
+    val tsvWriter = new TsvWriter(printWriter)
+    // Index the files by url once so that each lookup below is constant time instead of
+    // a linear scan.  The urls in jsonFilesAndUrls are already distinct.
+    val urlToJsonFile: Map[String, File] = jsonFilesAndUrls
+        .map { case (jsonFile, url) => url -> jsonFile }
+        .toMap
+
+    tsvWriter.println("url", "sentenceIndex", "sentence", "context", "prevSentence")
+    datasetUrls.zipWithIndex.foreach { case (url, index) =>
+      urlToJsonFile.get(url) match {
+        case None =>
+          // A dataset url without a document on disk is reported and skipped so that
+          // it does not abort the remainder of the run.
+          logger.error(s"No json file with jsonld was found for url $url")
+        case Some(jsonFile) =>
+          println(s"$index ${jsonFile.getPath}")
+          try {
+            val jsonldFile = jsonFileToJsonld(jsonFile)
+            val annotatedDocument = jsonldFileToAnnotatedDocument(jsonldFile)
+            val document = annotatedDocument.document
+            val sentences = document.sentences
+
+            sentences.zipWithIndex.foreach { case (sentence, sentenceIndex) =>
+              val cleanText = getSentenceText(document, sentence)
+              // slice clamps out-of-range bounds, so the window shrinks near either end.
+              val context = sentences
+                  .slice(sentenceIndex - contextWindow, sentenceIndex + contextWindow + 1)
+                  .map(getSentenceText(document, _))
+                  .mkString(" ")
+              // The first sentence has no predecessor and gets the empty string.
+              val prevSentenceText = sentences
+                  .lift(sentenceIndex - 1)
+                  .map(getSentenceText(document, _))
+                  .getOrElse("")
+
+              tsvWriter.println(url, sentenceIndex.toString, cleanText, context, prevSentenceText)
+            }
+          }
+          catch {
+            // Only non-fatal problems are logged and skipped; fatal errors propagate.
+            case scala.util.control.NonFatal(throwable) =>
+              logger.error(s"Exception for file $jsonFile", throwable)
+          }
+      }
+    }
+  }
+}
diff --git a/...apps/elasticsearch/Step2InputEidos2.scala → ...s/elasticsearch/Step2InputEidos2App.scala b/...apps/elasticsearch/Step2InputEidos2.scala → ...s/elasticsearch/Step2InputEidos2App.scala
@@ -19,7 +19,7 @@ import java.io.File
 import java.net.URL
 import scala.util.{Try, Using}
 
-object Step2InputEidos2 extends App with Logging {
+object Step2InputEidos2App extends App with Logging {
 
   case class LocalTsvRecord(
     sentenceIndex: Int,
@@ -263,7 +263,7 @@ object Step2InputEidos2 extends App with Logging {
       val contextLocations = parseLocations(contextLocationsString)
       val vector = normalize(parseVector(vectorString))
 
-      (url, sentenceIndex) -> new LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector)
+      (url, sentenceIndex) -> LocalTsvRecord(sentenceIndex, sentence, belief, sentimentScoreOpt, sentenceLocations, contextLocations, vector)
     }.toMap
   }
   val restClient = Elasticsearch.mkRestClient(url, credentialsFilename)