From 0247a72c46b38392aaa87df3202c0acd2e1e657c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Sat, 8 Oct 2022 21:20:47 +0200
Subject: [PATCH 1/3] Check 4 columns in coqui format

---
 TTS/tts/datasets/formatters.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index 8b3603f4b8..ca43dcddbb 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -15,6 +15,14 @@
 
 def coqui(root_path, meta_file, ignored_speakers=None):
     """Interal dataset formatter."""
+    filepath = os.path.join(root_path, meta_file)
+    # ensure there are 4 columns for every line
+    with open(filepath, "r") as f:
+        lines = f.readlines()
+    for idx, line in enumerate(lines):
+        if len(line.split("|")) != 4:
+            print(f" > Missing column in line {idx + 1} -> {line.strip()}")
+    # load metadata
     metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
     assert all(x in metadata.columns for x in ["audio_file", "text"])
     speaker_name = None if "speaker_name" in metadata.columns else "coqui"

From 29f76f138e1f2589d4f03fa62e0b8a3d409f2fc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Sat, 8 Oct 2022 21:28:23 +0200
Subject: [PATCH 2/3] Fix encoding

---
 TTS/tts/datasets/formatters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index ca43dcddbb..1acced9d1f 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -17,7 +17,7 @@ def coqui(root_path, meta_file, ignored_speakers=None):
     """Interal dataset formatter."""
     filepath = os.path.join(root_path, meta_file)
     # ensure there are 4 columns for every line
-    with open(filepath, "r") as f:
+    with open(filepath, "r", encoding="utf8") as f:
         lines = f.readlines()
     for idx, line in enumerate(lines):
         if len(line.split("|")) != 4:

From 7399e417bb37f43da9e7651a390ab67b787d935d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Mon, 10 Oct 2022 10:57:53 +0200
Subject: [PATCH 3/3] Fixup

---
 TTS/tts/datasets/formatters.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index 1acced9d1f..f15ef96e8d 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -19,8 +19,9 @@ def coqui(root_path, meta_file, ignored_speakers=None):
     # ensure there are 4 columns for every line
     with open(filepath, "r", encoding="utf8") as f:
         lines = f.readlines()
-    for idx, line in enumerate(lines):
-        if len(line.split("|")) != 4:
+    num_cols = len(lines[0].split("|"))  # take the first row as reference
+    for idx, line in enumerate(lines[1:]):
+        if len(line.split("|")) != num_cols:
             print(f" > Missing column in line {idx + 1} -> {line.strip()}")
     # load metadata
     metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")