Feat: Visualization tool (#107)

* add stair_case graph to show number of words vs. delays (ms). Connected visualize.py via argparse. Now user may add --visualize in the terminal to have the graph output to the output folder * add visualize.py * add a buffer in front of audio file to show delay * modify visual.ipynb to include both staircase and waveform graph in 1 .png file. Add ability to read multiple dictionaries from instances.log * add ability to run --score-only with --visualize * add unit test for visualize and update .gitignore * untrack the python notebook used for prototyping * auto-generates output/visual directory when visual folder is not created * used black . to format everything * edit according to Xutai's suggestions * add visualize unit test to git workflow * fix black formatting * fix remaining file * come on black * black is weird. removed white space * add install matplotlib to workflow * idk man black is not blacking * replace ... with pass * pip==24.0 * returned to ... for dataloader * nvm pass is better for both python 3.7 and 3.8 black formatter * check for empty config.yaml file, if empty, system exits * change whisper to openai-whisper * using only python=3.8 * add editdistance for pip install in setup.py * remove creating an output directory * fix path issue * add matplotlib for pip * put 3.7 and 3.8 for python version * correct error on matplotlib * change to only 3.8 for github workflow * moved whisper dependencies to main.yml * add speech_to_text documentation * formatting changes * add line space to correct formatting * add nltk download
facebookresearch · Aug 14, 2024 · 7b45f68 · 7b45f68
1 parent bcb6d85
commit 7b45f68
Show file tree

Hide file tree

Showing 10 changed files with 442 additions and 4 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8]
+        python-version: [3.8]
 
     steps:
       - uses: actions/checkout@v2
@@ -32,8 +32,10 @@ jobs:
           pip install huggingface-hub
           pip install fairseq
           pip install sentencepiece
+          pip install openai-whisper editdistance
           pip install -e .
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+          python -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')"
       - name: Lint with black
         run: black --check --diff .
       - name: Lint with flake8
@@ -49,3 +51,4 @@ jobs:
           pytest simuleval/test/test_evaluator.py
           pytest simuleval/test/test_remote_evaluation.py
           pytest simuleval/test/test_s2s.py
+          pytest simuleval/test/test_visualize.py
diff --git a/docs/tutorials/speech_to_text.rst b/docs/tutorials/speech_to_text.rst
@@ -1,2 +1,31 @@
 Speech-to-Text
-==============
+==============
+
+Whisper Agent
+-----------------
+Use whisper to evaluate custom audio for speech to text transcription.
+First, change directory to :code:`speech_to_text`:
+
+.. code-block:: bash
+
+    cd examples/speech_to_text
+
+Then, run the example code:
+
+.. code-block:: bash
+
+    simuleval \
+        --agent whisper_waitk.py \
+        --source-segment-size 500 \
+        --waitk-lagging 3 \
+        --source source.txt --target reference/transcript.txt \
+        --output output --quality-metrics WER --visualize
+
+The optional :code:`--visualize` flag generates N graphs in the :code:`speech_to_text/output/visual` directory, where N is the number of source audio files provided. An example graph can be seen `here <https://github.com/facebookresearch/SimulEval/pull/107>`_.
+
+|
+In addition, visualization supports the :code:`--score-only` option, which reads data from :code:`instances.log` without running inference; this saves time if you only want the scores.
+
+.. code-block:: bash
+    
+    simuleval --score-only --output output --visualize
diff --git a/examples/speech_to_text/counter_in_tgt_lang_agent.py b/examples/speech_to_text/counter_in_tgt_lang_agent.py
@@ -27,6 +27,7 @@ def policy(self, states: Optional[AgentStates] = None):
             length_in_seconds = 0
         else:
             length_in_seconds = round(len(states.source) / states.source_sample_rate)
+
         if not states.source_finished and length_in_seconds < self.wait_seconds:
             return ReadAction()
 

diff --git a/examples/speech_to_text/reference/transcript.txt b/examples/speech_to_text/reference/transcript.txt
@@ -1 +1 @@
-This is a synthesized audio file to test your simultaneous speech to text and to speech to speach translation system.
+This is a synthesized audio file to test your simultaneous speech to text and to speech to speach translation system.
diff --git a/setup.py b/setup.py
@@ -40,6 +40,7 @@
         "bitarray==2.6.0",
         "yt-dlp",
         "pydub",
+        "matplotlib",
     ],
     classifiers=[
         "Programming Language :: Python :: 3",

diff --git a/simuleval/cli.py b/simuleval/cli.py
@@ -54,6 +54,7 @@ def main():
 
     # build evaluator
     evaluator = build_evaluator(args)
+
     # evaluate system
     evaluator(system)
 
@@ -78,6 +79,7 @@ def evaluate(
 
     # build evaluator
     evaluator = build_evaluator(args)
+
     # evaluate system
     evaluator(system)
 

diff --git a/simuleval/evaluator/evaluator.py b/simuleval/evaluator/evaluator.py
@@ -23,6 +23,7 @@
 from .scorers import get_scorer_class
 from .scorers.latency_scorer import LatencyScorer
 from .scorers.quality_scorer import QualityScorer
+from ..utils.visualize import Visualize
 
 try:
     import sentencepiece
@@ -83,6 +84,7 @@ def __init__(
         self.source_segment_size = getattr(args, "source_segment_size", 1)
         self.source_type = getattr(args, "source_type", None)
         self.target_type = getattr(args, "target_type", None)
+        self.visualize = args.visualize
 
         self.target_spm_model = None
         if args.eval_latency_unit == "spm":
@@ -109,7 +111,7 @@ def __init__(
             os.makedirs(self.output, exist_ok=True)
             with open(self.output / "config.yaml", "w") as f:
                 yaml.dump(
-                    {"source_type": self.source_type, "target_type": self.source_type},
+                    {"source_type": self.source_type, "target_type": self.target_type},
                     f,
                     default_flow_style=False,
                 )
@@ -220,6 +222,8 @@ def results(self):
             new_scores[name] = [value]
 
         df = pandas.DataFrame(new_scores)
+        if self.output and self.visualize:
+            self.make_visual()
         return df
 
     def dump_results(self) -> None:
@@ -241,6 +245,17 @@ def is_finished(self, instance) -> bool:
             return instance.source_finished_reading
         return instance.finish_prediction
 
+    def make_visual(self):
+        with open(self.output / "instances.log", "r") as file:
+            for line in file:
+                # Load data & index
+                data = json.loads(line)
+                index = data.get("index", 0)
+
+                # Create object & graph
+                visualize = Visualize(data, index, self.output)
+                visualize.make_graph()
+
     def __call__(self, system):
         with (
             open(self.output / "instances.log", "a")
@@ -276,6 +291,8 @@ def __call__(self, system):
         if not self.no_scoring:
             self.dump_results()
             self.dump_metrics()
+        if self.output and self.visualize:
+            self.make_visual()
 
     @classmethod
     def from_args(cls, args):

diff --git a/simuleval/options.py b/simuleval/options.py
@@ -235,6 +235,12 @@ def general_parser(
     dtype_arg_group.add_argument(
         "--fp16", action="store_true", default=False, help="Use fp16."
     )
+    parser.add_argument(
+        "--visualize",
+        action="store_true",
+        default=False,
+        help="Create visualization graphs",
+    )
 
     return parser
 

diff --git a/simuleval/test/test_visualize.py b/simuleval/test/test_visualize.py
@@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import tempfile
+from pathlib import Path
+import simuleval.cli as cli
+import shutil
+import json
+
+ROOT_PATH = Path(__file__).parents[2]
+
+
+def test_visualize(root_path=ROOT_PATH):
+    args_path = Path.joinpath(root_path, "examples", "speech_to_text")
+    os.chdir(args_path)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        cli.sys.argv[1:] = [
+            "--agent",
+            os.path.join(root_path, "examples", "speech_to_text", "whisper_waitk.py"),
+            "--source-segment-size",
+            "500",
+            "--waitk-lagging",
+            "3",
+            "--source",
+            os.path.join(root_path, "examples", "speech_to_text", "source.txt"),
+            "--target",
+            os.path.join(
+                root_path, "examples", "speech_to_text", "reference/transcript.txt"
+            ),
+            "--output",
+            "output",
+            "--quality-metrics",
+            "WER",
+            "--visualize",
+        ]
+        cli.main()
+
+        visual_folder_path = os.path.join("output", "visual")
+        source_path = os.path.join(
+            root_path, "examples", "speech_to_text", "source.txt"
+        )
+        source_length = 0
+
+        with open(source_path, "r") as f:
+            source_length = len(f.readlines())
+        images = list(Path(visual_folder_path).glob("*.png"))
+        assert len(images) == source_length
+        shutil.rmtree("output")
+
+
+def test_visualize_score_only(root_path=ROOT_PATH):
+    args_path = Path.joinpath(root_path, "examples", "speech_to_text")
+    os.chdir(args_path)
+
+    # Create sample instances.log and config.yaml in output directory
+    output = Path("output")
+    output.mkdir()
+    os.chdir(output)
+    with open("config.yaml", "w") as config:
+        config.write("source_type: speech\n")
+        config.write("target_type: speech")
+    with open("instances.log", "w") as instances:
+        json.dump(
+            {
+                "index": 0,
+                "prediction": "This is a synthesized audio file to test your simultaneous speech, to speak to speech, to speak translation system.",
+                "delays": [
+                    1500.0,
+                    2000.0,
+                    2500.0,
+                    3000.0,
+                    3500.0,
+                    4000.0,
+                    4500.0,
+                    5000.0,
+                    5500.0,
+                    6000.0,
+                    6500.0,
+                    6849.886621315192,
+                    6849.886621315192,
+                    6849.886621315192,
+                    6849.886621315192,
+                    6849.886621315192,
+                    6849.886621315192,
+                    6849.886621315192,
+                    6849.886621315192,
+                ],
+                "elapsed": [
+                    1947.3278522491455,
+                    2592.338800430298,
+                    3256.8109035491943,
+                    3900.0539779663086,
+                    4561.986684799194,
+                    5216.205835342407,
+                    5874.6888637542725,
+                    6526.906728744507,
+                    7193.655729293823,
+                    7852.792739868164,
+                    8539.628744125366,
+                    9043.279374916267,
+                    9043.279374916267,
+                    9043.279374916267,
+                    9043.279374916267,
+                    9043.279374916267,
+                    9043.279374916267,
+                    9043.279374916267,
+                    9043.279374916267,
+                ],
+                "prediction_length": 19,
+                "reference": "This is a synthesized audio file to test your simultaneous speech to text and to speech to speach translation system.",
+                "source": [
+                    "test.wav",
+                    "samplerate: 22050 Hz",
+                    "channels: 1",
+                    "duration: 6.850 s",
+                    "format: WAV (Microsoft) [WAV]",
+                    "subtype: Signed 16 bit PCM [PCM_16]",
+                ],
+                "source_length": 6849.886621315192,
+            },
+            instances,
+        )
+
+    os.chdir(args_path)
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        cli.sys.argv[1:] = ["--score-only", "--output", "output", "--visualize"]
+        cli.main()
+
+        visual_folder_path = os.path.join("output", "visual")
+        source_path = os.path.join(
+            root_path, "examples", "speech_to_text", "source.txt"
+        )
+        source_length = 0
+
+        with open(source_path, "r") as f:
+            source_length = len(f.readlines())
+        images = list(Path(visual_folder_path).glob("*.png"))
+        assert len(images) == source_length
+        shutil.rmtree("output")
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		This is a synthesized audio file to test your simultaneous speech to text and to speech to speach translation system.
		This is a synthesized audio file to test your simultaneous speech to text and to speech to speach translation system.