From 78e923b717f25c95d839c596789df55ed2a76d9c Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Tue, 9 Mar 2021 13:57:31 +0000
Subject: [PATCH 01/49] DataStore: small fix in SQL query, add to-do comment

---
 nanocompore/DataStore.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py
index cd3e9cb..010edba 100644
--- a/nanocompore/DataStore.py
+++ b/nanocompore/DataStore.py
@@ -43,11 +43,12 @@ class DataStore(object):
                           "FOREIGN KEY(readid) REFERENCES reads(id)"
                           ")"
                           )
+    # TODO: 'sequence' is stored redundantly - move it to a separate table
 
     create_samples_query = ("CREATE TABLE IF NOT EXISTS samples ("
                             "id INTEGER NOT NULL PRIMARY KEY,"
-                            "name VARCHAR NOT NULL UNIQUE,"
+                            "name VARCHAR NOT NULL UNIQUE"
                             ")"
                             )
 
@@ -59,7 +60,6 @@ class DataStore(object):
                                 ")"
                                 )
 
-
    def __init__(self, db_path:str):
        self.__db_path=db_path
        db_is_new = not os.path.exists(self.__db_path)

From e5d7621d95e45415a84f3fd18a8849c93d1de725 Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Tue, 9 Mar 2021 14:00:27 +0000
Subject: [PATCH 02/49] Eventalign_collapse: add flag for writing to DB (or TSV file), fix error in DataStore call ('add_read' renamed to 'store_read')

---
 nanocompore/Eventalign_collapse.py | 35 +++++++++++++++++-------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py
index c273fe6..39f1251 100644
--- a/nanocompore/Eventalign_collapse.py
+++ b/nanocompore/Eventalign_collapse.py
@@ -32,15 +32,16 @@
 #~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~#
 class Eventalign_collapse ():
 
-    def __init__ (self,
-        eventalign_fn:str,
-        sample_name:str,
-        outpath:str="./",
-        outprefix:str="out",
-        overwrite:bool = False,
-        n_lines:int=None,
-        nthreads:int = 3,
-        progress:bool = False):
+    def __init__(self,
+                 eventalign_fn:str,
+                 sample_name:str,
+                 outpath:str="./",
+                 outprefix:str="out",
+                 write_db:bool = True,
+                 overwrite:bool = False,
+                 n_lines:int=None,
+                 nthreads:int = 3,
+                 progress:bool = False):
         """
         Collapse the nanopolish eventalign events at kmer level
         * eventalign_fn
@@ -51,6 +52,8 @@ def __init__ (self,
             Path to the output folder (will be created if it does not exist yet)
         * outprefix
             text outprefix for all the files generated
+        * write_db
+            Write output to database? (Otherwise to TSV file.)
         * overwrite
             If the output directory already exists, the standard behaviour is to raise an error to prevent overwriting existing data
             This option ignores the error and overwrites data if they have the same outpath and outprefix.
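The `write_db` flag added above selects between the new SQLite output and the legacy TSV files. A minimal usage sketch of the resulting interface (the eventalign path and sample name are illustrative placeholders, not part of the patch):

    from nanocompore.Eventalign_collapse import Eventalign_collapse

    collapser = Eventalign_collapse(eventalign_fn="eventalign_reads.tsv",  # placeholder path
                                    sample_name="WT_1",                    # placeholder name
                                    outpath="./results",
                                    outprefix="out",
                                    write_db=True)   # False falls back to the TSV writer
    collapser()  # the instance is callable and runs the split/process/write pipeline

With `write_db=True` the output lands in `<outpath>/<outprefix>_nanocompore.db`, written by the `__write_output_to_db` worker shown below.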
@@ -74,6 +77,7 @@ def __init__ (self, self.__sample_name = sample_name self.__outpath = outpath self.__outprefix = outprefix + self.__write_db = write_db self.__eventalign_fn = eventalign_fn self.__n_lines = n_lines self.__nthreads = nthreads - 2 # subtract 1 for reading and 1 for writing @@ -99,10 +103,11 @@ def __call__(self): ps_list.append (mp.Process (target=self.__split_reads, args=(in_q, error_q))) for i in range (self.__nthreads): ps_list.append (mp.Process (target=self.__process_read, args=(in_q, out_q, error_q))) - ps_list.append (mp.Process (target=self.__write_output_to_db, args=(out_q, error_q))) - - - # TODO: Check that sample_name does not exist already in DB + if self.__write_db: + ps_list.append (mp.Process (target=self.__write_output_to_db, args=(out_q, error_q))) + # TODO: Check that sample_name does not exist already in DB + else: + ps_list.append(mp.Process(target=self.__write_output, args=(out_q, error_q))) # Start processes and monitor error queue try: @@ -233,13 +238,13 @@ def __write_output_to_db (self, out_q, error_q): n_reads = 0 try: - with DataStore(db_path=os.path.join(self.__outpath, self.__outprefix+"nanocompore.db")) as datastore, tqdm (unit=" reads") as pbar: + with DataStore(db_path=os.path.join(self.__outpath, self.__outprefix+"_nanocompore.db")) as datastore, tqdm (unit=" reads") as pbar: # Iterate over out queue until nthread poison pills are found for _ in range (self.__nthreads): for read in iter (out_q.get, None): logger.debug(f"Written {read.read_id}") n_reads+=1 - datastore.add_read(read) + datastore.store_read(read) pbar.update(1) except Exception: logger.error("Error adding read to DB") From d7edde9407f791af444f86b5b201cdbb039c4df9 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 10 Mar 2021 14:02:25 +0000 Subject: [PATCH 03/49] better log message in 'DataStore.__init__' --- nanocompore/DataStore.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 63e5860..0d527ee 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -63,7 +63,10 @@ class DataStore(object): def __init__(self, db_path:str): self.__db_path=db_path db_is_new = not os.path.exists(self.__db_path) - logger.debug(f"DB file doesn't exist: {db_is_new}") + if db_is_new: + logger.info("Creating new database") + else: + logger.info("Using existing database") if db_is_new: self.__init_db() def __enter__(self): @@ -77,7 +80,7 @@ def __exit__(self,exc_type, exc_value, traceback): def __open_db_connection(self): try: logger.debug("Connecting to DB") - self.__connection = lite.connect(self.__db_path); + self.__connection = lite.connect(self.__db_path) self.__cursor = self.__connection.cursor() except: logger.error("Error connecting to database") From afe005c04739ba3b0f59eb3a32127a1cec3f55c7 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Thu, 11 Mar 2021 17:42:18 +0000 Subject: [PATCH 04/49] coding style (added spaces for readability) --- nanocompore/Eventalign_collapse.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py index 3bd570f..826f941 100644 --- a/nanocompore/Eventalign_collapse.py +++ b/nanocompore/Eventalign_collapse.py @@ -246,12 +246,12 @@ def __write_output_to_db (self, out_q, error_q): # Iterate over out queue until nthread poison pills are found for _ in range (self.__nthreads): for read in iter (out_q.get, None): - n_reads+=1 + n_reads += 1 
datastore.store_read(read) pbar.update(1) except Exception: logger.error("Error adding read to DB") - error_q.put (NanocomporeError(traceback.format_exc())) + error_q.put(NanocomporeError(traceback.format_exc())) finally: logger.info ("Output reads written:{}".format(n_reads)) @@ -301,10 +301,10 @@ def __write_output (self, out_q, error_q): # Write kmer data matching data field order for kmer in kmer_res_l: - n_kmers+=1 - data_str = "\t".join([str(kmer[f]) for f in data_header_list])+"\n" + n_kmers += 1 + data_str = "\t".join([str(kmer[f]) for f in data_header_list]) + "\n" data_fp.write(data_str) - byte_len+=len(data_str) + byte_len += len(data_str) # Add byte read_res_d["byte_offset"] = byte_offset @@ -313,7 +313,7 @@ def __write_output (self, out_q, error_q): idx_fp.write("{}\n".format(idx_str)) # Update pbar - byte_offset+=byte_len + byte_offset += byte_len pbar.update(1) # Flag last line @@ -380,13 +380,13 @@ def add_event (self, event_d): @property def kmers_status (self): d = OrderedDict() - d["kmers"] = self.ref_end-self.ref_start+1 + d["kmers"] = self.ref_end - self.ref_start + 1 d["missing_kmers"] = d["kmers"] - len(self.kmer_l) - d["NNNNN_kmers"]=0 - d["mismatch_kmers"]=0 - d["valid_kmers"]=0 + d["NNNNN_kmers"] = 0 + d["mismatch_kmers"] = 0 + d["valid_kmers"] = 0 for k in self.kmer_l: - d[k.status+"_kmers"]+=1 + d[k.status + "_kmers"] += 1 return d def get_read_results (self): @@ -469,7 +469,5 @@ def get_results(self): d["mismatch_dwell_time"] = self.mismatch_dwell_time d["status"] = self.status d["median"] = statistics.median(self.sample_list) - d["mad"] = statistics.median([ abs( i-d["median"] ) for i in self.sample_list]) - + d["mad"] = statistics.median([abs(i - d["median"]) for i in self.sample_list]) return d - From 7f60da1c0c2f32f3e91d238a312782658a175293 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Thu, 11 Mar 2021 18:25:38 +0000 Subject: [PATCH 05/49] store read-level kmer stats in database (needed for whitelisting) --- nanocompore/DataStore.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 0d527ee..9738aff 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -22,9 +22,13 @@ class DataStore(object): "numevents INT NOT NULL," "numsignals INT NOT NULL," "dwelltime REAL NOT NULL," + "kmers INT NOT NULL," + "missing_kmers INT NOT NULL," + "NNNNN_kmers INT NOT NULL," + "mismatch_kmers INT NOT NULL," + "valid_kmers INT NOT NULL," "FOREIGN KEY(sampleid) REFERENCES samples(id)" - "FOREIGN KEY(transcriptid) REFERENCES transcripts(id)," - "UNIQUE(id, name)" + "FOREIGN KEY(transcriptid) REFERENCES transcripts(id)" ")" ) @@ -119,10 +123,11 @@ def store_read(self, read): """ tx_id = self.get_transcript_id_by_name(read.ref_id, create_if_not_exists=True) sample_id = self.get_sample_id_by_name(read.sample_name, create_if_not_exists=True) + values = (read.read_id, sample_id, tx_id, read.ref_start, read.ref_end, + read.n_events, read.n_signals, read.dwell_time) + tuple(read.kmers_status.values()) try: - self.__cursor.execute("INSERT INTO reads VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?)", - (read.read_id, sample_id, tx_id, read.ref_start, read.ref_end, - read.n_events, read.n_signals, read.dwell_time)) + self.__cursor.execute("INSERT INTO reads VALUES(NULL" + ", ?" 
* len(values) + ")", + values) read_id = self.__cursor.lastrowid except Exception: logger.error("Error inserting read into DB") From da7eb4b8bcd5b2968d575b6306ec2f2f4bbe0ab7 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 12 Mar 2021 19:18:59 +0000 Subject: [PATCH 06/49] Whitelist: read data from SQLite, filter reads during query --- nanocompore/Whitelist.py | 241 ++++++++++++++++++--------------------- 1 file changed, 114 insertions(+), 127 deletions(-) diff --git a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py index d666e30..b3e0f78 100755 --- a/nanocompore/Whitelist.py +++ b/nanocompore/Whitelist.py @@ -6,6 +6,7 @@ import logging from loguru import logger import random +import sqlite3 # Third party import numpy as np @@ -24,22 +25,26 @@ class Whitelist(object): #~~~~~~~~~~~~~~MAGIC METHODS~~~~~~~~~~~~~~# def __init__(self, - eventalign_fn_dict, - fasta_fn, - min_coverage = 10, - min_ref_length = 100, - downsample_high_coverage = False, - max_invalid_kmers_freq = 0.1, - max_NNNNN_freq = 0.1, - max_mismatching_freq = 0.1, - max_missing_freq = 0.1, - select_ref_id = [], - exclude_ref_id = []): + db_path, + sample_dict, + fasta_fn, + min_coverage = 10, + min_ref_length = 100, + downsample_high_coverage = False, + max_invalid_kmers_freq = 0.1, + max_NNNNN_freq = 0.1, + max_mismatching_freq = 0.1, + max_missing_freq = 0.1, + select_ref_id = [], + exclude_ref_id = []): """ - ######################################################### - * eventalign_fn_dict - Multilevel dictionnary indicating the condition_label, sample_label and file name of the eventalign_collapse output - example d = {"S1": {"R1":"path1.tsv", "R2":"path2.tsv"}, "S2": {"R1":"path3.tsv", "R2":"path4.tsv"}} + Generate a whitelist of reads that fulfill filtering criteria + Args: + * db_path + Path to the SQLite database file with event-aligned read/kmer data + * sample_dict + Dictionary containing lists of (unique) sample names, grouped by condition + example d = {"control": ["C1", "C2"], "treatment": ["T1", "T2"]} * fasta_fn Path to a fasta file corresponding to the reference used for read alignemnt * min_coverage @@ -63,53 +68,91 @@ def __init__(self, if given, refid in the list will be excluded from the analysis """ - # Check index files - self.__filter_invalid_kmers = True - for sample_dict in eventalign_fn_dict.values(): - for fn in sample_dict.values(): - idx_fn = fn+".idx" - if not access_file(idx_fn): - raise NanocomporeError("Cannot access eventalign_collapse index file {}".format(idx_fn)) - # Check header line and set a flag to skip filter if the index file does not contain kmer status information - with open(idx_fn, "r") as fp: - header = fp.readline().rstrip().split("\t") - if not all_values_in (("ref_id", "read_id", "byte_offset", "byte_len"), header): - raise NanocomporeError("The index file {} does not contain the require header fields".format(idx_fn)) - if not all_values_in (("kmers", "NNNNN_kmers", "mismatch_kmers", "missing_kmers"), header): - self.__filter_invalid_kmers = False - logger.debug("Invalid kmer information not available in index file") - - self.__eventalign_fn_dict = eventalign_fn_dict - - # Get number of samples - n = 0 - for sample_dict in self.__eventalign_fn_dict.values(): - for sample_lab in sample_dict.keys(): - n+=1 - self.__n_samples = n - - # Test is Fasta can be opened + # Check that sample names are unique and create look-up dict. 
of conditions + cond_dict = {} + for cond, samples in sample_dict.items(): + for sample in samples: + if sample in cond_dict: + raise NanocomporeError(f"Sample name '{sample}' is not unique") + cond_dict[sample] = cond + + # Get sample names and IDs from DB + self.__db_path = db_path + self.__open_db_connection() + db_samples = {} + try: + self.__cursor.execute("SELECT * FROM samples") + for row in self.__cursor: + db_samples[row["id"]] = row["name"] + except Exception: + logger.error("Error reading sample names from DB") + raise Exception + # Check that requested samples are in DB + for samples in sample_dict.values(): + for sample in samples: + if sample not in db_samples.values(): + raise NanocomporeError(f"Sample '{sample}' not present in DB") + + # Test if Fasta can be opened try: with Fasta(fasta_fn): self._fasta_fn = fasta_fn except IOError: raise NanocomporeError("The fasta file cannot be opened") - # Create reference index for both files - logger.info("Reading eventalign index files") - ref_reads = self.__read_eventalign_index( - eventalign_fn_dict = eventalign_fn_dict, - max_invalid_kmers_freq = max_invalid_kmers_freq, - max_NNNNN_freq = max_NNNNN_freq, - max_mismatching_freq = max_mismatching_freq, - max_missing_freq = max_missing_freq, - select_ref_id = select_ref_id, - exclude_ref_id = exclude_ref_id) + # Set up filters by adding conditions for DB query + select = ["reads.id AS readid", "sampleid", "transcriptid", "transcripts.name AS transcriptname"] + where = [] + # Get reads only from a subset of samples? + if len(cond_dict) != len(db_samples): + where = ["sampleid IN (%s)" % ", ".join(map(str, db_samples))] + + if select_ref_id: + select.append("reads.name AS readname") + where.append("readname IN ('%s')" % "', '".join(select_ref_id)) + elif exclude_ref_id: + select.append("reads.name AS readname") + where.append("readname NOT IN ('%s')" % "', '".join(exclude_ref_id)) + + if max_invalid_kmers_freq is not None: + if max_invalid_kmers_freq < 1.0: + select.append("1.0 - CAST(valid_kmers AS REAL) / kmers AS invalid_freq") + where.append(f"invalid_freq <= {max_invalid_kmers_freq}") + else: + if max_NNNNN_freq < 1.0: + select.append("CAST(NNNNN_kmers AS REAL) / kmers AS NNNNN_freq") + where.append(f"NNNNN_freq <= {max_NNNNN_freq}") + if max_mismatching_freq < 1.0: + select.append("CAST(mismatch_kmers AS REAL) / kmers AS mismatch_freq") + where.append(f"mismatch_freq <= {max_mismatching_freq}") + if max_missing_freq < 1.0: + select.append("CAST(missing_kmers AS REAL) / kmers AS missing_freq") + where.append(f"missing_freq <= {max_missing_freq}") + + query = "SELECT %s FROM reads LEFT JOIN transcripts ON transcriptid = transcripts.id" % \ + ", ".join(select) + if where: + query += " WHERE %s" % " AND ".join(where) + + # dict. 
structure: transcript -> condition -> sample -> list of reads + ref_reads = {} + logger.info("Querying reads from DB") + try: + self.__cursor.execute(query) + for row in self.__cursor: + read_id = row["readid"] + sample_id = row["sampleid"] + condition = cond_dict[db_samples[sample_id]] + ref_id = row["transcriptname"] + ref_reads.setdefault(ref_id, {}).setdefault(condition, {}).setdefault(sample_id, []).append(read_id) + except Exception: + logger.error("Error querying reads from DB") + raise Exception # Filtering at transcript level logger.info("Filtering out references with low coverage") self.ref_reads = self.__select_ref( - ref_reads = ref_reads, + ref_reads=ref_reads, min_coverage=min_coverage, min_ref_length=min_ref_length, downsample_high_coverage=downsample_high_coverage) @@ -119,6 +162,7 @@ def __init__(self, self.__downsample_high_coverage = downsample_high_coverage self.__max_invalid_kmers_freq = max_invalid_kmers_freq + def __repr__(self): return "Whitelist: Number of references: {}".format(len(self)) @@ -152,81 +196,24 @@ def ref_id_list(self): return list(self.ref_reads.keys()) #~~~~~~~~~~~~~~PRIVATE METHODS~~~~~~~~~~~~~~# - def __read_eventalign_index(self, - eventalign_fn_dict, - max_invalid_kmers_freq, - max_NNNNN_freq, - max_mismatching_freq, - max_missing_freq, - select_ref_id, - exclude_ref_id): - """Read the 2 index files and sort by sample and ref_id in a multi level dict""" - - ref_reads = OrderedDict() - - for cond_lab, sample_dict in eventalign_fn_dict.items(): - for sample_lab, fn in sample_dict.items(): - idx_fn = fn+".idx" - with open(idx_fn) as fp: - - # Get column names from header - col_names = fp.readline().rstrip().split() - c = Counter() - for line in fp: - try: - # Transform line to dict and cast str numbers to actual numbers - read = numeric_cast_dict(keys=col_names, values=line.rstrip().split("\t")) - - # Filter out ref_id if a select_ref_id list or exclude_ref_id list was provided - if select_ref_id and not read["ref_id"] in select_ref_id: - raise NanocomporeError("Ref_id not in select list") - elif exclude_ref_id and read["ref_id"] in exclude_ref_id: - raise NanocomporeError("Ref_id in exclude list") - - # Filter out reads with high number of invalid kmers if information available - if self.__filter_invalid_kmers: - if max_invalid_kmers_freq: - invalid_kmers_freq = (read["NNNNN_kmers"]+read["mismatch_kmers"]+read["missing_kmers"])/read["kmers"] - if invalid_kmers_freq > max_invalid_kmers_freq: - raise NanocomporeError("High fraction of invalid kmers ({}%) for read {}".format(round(invalid_kmers_freq*100,2), read["read_id"])) - else: - NNNNN_kmers_freq = read["NNNNN_kmers"]/read["kmers"] - max_mismatching_freq = read["mismatch_kmers"]/read["kmers"] - max_missing_freq = read["missing_kmers"]/read["kmers"] - if NNNNN_kmers_freq > max_NNNNN_freq: - raise NanocomporeError("High fraction of NNNNN kmers ({}%) for read {}".format(round(NNNNN_kmers_freq*100,2), read["read_id"])) - elif max_mismatching_freq > max_mismatching_freq: - raise NanocomporeError("High fraction of mismatching kmers ({}%) for read {}".format(round(max_mismatching_freq*100,2), read["read_id"])) - elif max_missing_freq > max_missing_freq: - raise NanocomporeError("High fraction of missing kmers ({}%) for read {}".format(round(max_missing_freq*100,2), read["read_id"])) - - # Create dict arborescence and save valid reads - if not read["ref_id"] in ref_reads: - ref_reads[read["ref_id"]] = OrderedDict() - if not cond_lab in ref_reads[read["ref_id"]]: - ref_reads[read["ref_id"]][cond_lab] = 
OrderedDict() - if not sample_lab in ref_reads[read["ref_id"]][cond_lab]: - ref_reads[read["ref_id"]][cond_lab][sample_lab] = [] - - # Fill in list of reads - ref_reads[read["ref_id"]][cond_lab][sample_lab].append(read) - c ["valid reads"] += 1 - - except NanocomporeError as E: - c [str(E)] += 1 - - logger.debug("\tCondition:{} Sample:{} {}".format(cond_lab, sample_lab, counter_to_str(c))) - # Fill in missing condition/sample slots in case - # a ref_id is missing from one of the eventalign files - for ref_id in ref_reads.keys(): - for cond_lab, sample_dict in eventalign_fn_dict.items(): - for sample_lab in sample_dict.keys(): - if not cond_lab in ref_reads[ref_id]: - ref_reads[ref_id][cond_lab] = OrderedDict() - if not sample_lab in ref_reads[ref_id][cond_lab]: - ref_reads[ref_id][cond_lab][sample_lab] = [] - logger.info("\tReferences found in index: {}".format(len(ref_reads))) - return ref_reads + + def __open_db_connection(self): + try: + logger.debug("Connecting to DB") + self.__connection = sqlite3.connect(self.__db_path) + self.__connection.row_factory = sqlite3.Row + self.__cursor = self.__connection.cursor() + except: + logger.error("Error connecting to database") + raise + + def __close_db_connection(self): + if self.__connection: + logger.debug("Closing connection to DB") + self.__connection.commit() + self.__connection.close() + self.__connection = None + self.__cursor = None def __select_ref(self, ref_reads, From 2fefe7cdef81d64861596f3facff5f9cca9dc6be Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 16 Mar 2021 19:54:26 +0000 Subject: [PATCH 07/49] add function to check validity of sample dictionary to 'common.py' --- nanocompore/common.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/nanocompore/common.py b/nanocompore/common.py index d9637ff..6d3f3ad 100644 --- a/nanocompore/common.py +++ b/nanocompore/common.py @@ -37,6 +37,35 @@ def build_eventalign_fn_dict(file_list1, file_list2, label1, label2): d[label2] = {"{}_{}".format(label2, i): v for i, v in enumerate(file_list2.split(","),1)} return d +def check_sample_dict(sample_dict): + # Check general structure + if type(sample_dict) not in (dict, OrderedDict): + raise NanocomporeError(f"Expected a dictionary. Got a '{type(sample_dict)}'.") + if len(sample_dict) != 2: + raise NanocomporeError(f"Expected two conditions. Found {len(sample_dict)}.") + for condition, samples in sample_dict.items(): + if type(samples) is not list: + raise NanocomporeError(f"Expected a list of sample names for condition '{condition}'. " + "Got a '{type(sample_dict)}'.") + if not samples: + raise NanocomporeError(f"Empty sample list for condition '{condition}'.") + if len(samples) == 1: + logger.warning(f"Only one replicate found for condition '{condition}'. " + "This is not recommended. " + "Statistics will be calculated using the logit method.") + # Check for duplicate sample names + for condition, samples in sample_dict.items(): + if len(set(samples)) < len(samples): + raise NanocomporeError(f"Duplicate sample names for condition '{condition}'.") + all_samples = list(sample_dict.values()) # there must be two lists - already checked + if any([sample in all_samples[1] for sample in all_samples[0]]): + logger.warning("Found sample name shared between conditions. " + "Prefixing all sample names with their condition.") + for condition, samples in sample_dict.items(): + # can't modify 'samples' directly here! 
+ sample_dict[condition] = [f"{condition}_{sample}" for sample in samples] + + def set_logger (log_level, log_fn=None): log_level = log_level.upper() logger.remove() From 8f52c1c1816a43476b765ac29a45f9d8b895dda0 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 16 Mar 2021 19:56:48 +0000 Subject: [PATCH 08/49] new class 'DatabaseWrapper' for reusable DB interaction code; adapt 'Whitelist' accordingly --- nanocompore/DataStore.py | 2 +- nanocompore/DatabaseWrapper.py | 61 ++++++++++++++ nanocompore/Whitelist.py | 143 +++++++++++++-------------------- 3 files changed, 119 insertions(+), 87 deletions(-) create mode 100644 nanocompore/DatabaseWrapper.py diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 9738aff..c27e779 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -77,7 +77,7 @@ def __enter__(self): self.__open_db_connection() return self - def __exit__(self,exc_type, exc_value, traceback): + def __exit__(self, exc_type, exc_value, traceback): self.__connection.commit() self.__close_db_connection() diff --git a/nanocompore/DatabaseWrapper.py b/nanocompore/DatabaseWrapper.py new file mode 100644 index 0000000..0b5f2a1 --- /dev/null +++ b/nanocompore/DatabaseWrapper.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import sqlite3 +from loguru import logger +from nanocompore.common import NanocomporeError + + +class DatabaseWrapper(object): + + def __init__(self, db_path): + self.__db_path = db_path + self.__connection = None + self.__cursor = None + + def __enter__(self): + try: + logger.debug("Connecting to database") + self.__connection = sqlite3.connect(self.__db_path) + self.__connection.row_factory = sqlite3.Row + self.__cursor = self.__connection.cursor() + except: + logger.error("Error connecting to database") + raise + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.__connection: + logger.debug("Closing database connection") + self.__connection.commit() + self.__connection.close() + self.__connection = None + self.__cursor = None + + @property + def cursor(self): + return self.__cursor + + def get_samples(self, sample_dict=None): + if not self.__connection: + raise NanocomporeError("Database connection not yet opened") + expected_samples = [] + if sample_dict: # query only relevant samples + for samples in sample_dict.values(): + expected_samples += samples + if not expected_samples: + raise NanocomporeError("No sample names in 'sample_dict'") + where = " WHERE name IN ('%s')" % "', '".join(expected_samples) + else: + where = "" + db_samples = {} + try: + self.cursor.execute("SELECT * FROM samples" + where) + for row in self.cursor: + db_samples[row["id"]] = row["name"] + except Exception: + logger.error("Error reading sample names from database") + raise Exception + for sample in expected_samples: # check that requested samples are in DB + if sample not in db_samples.values(): + raise NanocomporeError(f"Sample '{sample}' not present in database") + return db_samples diff --git a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py index b3e0f78..5adfbdd 100755 --- a/nanocompore/Whitelist.py +++ b/nanocompore/Whitelist.py @@ -15,6 +15,7 @@ # Local package from nanocompore.common import * +from nanocompore.DatabaseWrapper import DatabaseWrapper # Set global random seed downsample_random_seed = 42 @@ -68,31 +69,14 @@ def __init__(self, if given, refid in the list will be excluded from the analysis """ - # Check that sample names are unique and create look-up dict. 
of conditions + check_sample_dict(sample_dict) + + # Create look-up dict. of conditions cond_dict = {} for cond, samples in sample_dict.items(): for sample in samples: - if sample in cond_dict: - raise NanocomporeError(f"Sample name '{sample}' is not unique") cond_dict[sample] = cond - # Get sample names and IDs from DB - self.__db_path = db_path - self.__open_db_connection() - db_samples = {} - try: - self.__cursor.execute("SELECT * FROM samples") - for row in self.__cursor: - db_samples[row["id"]] = row["name"] - except Exception: - logger.error("Error reading sample names from DB") - raise Exception - # Check that requested samples are in DB - for samples in sample_dict.values(): - for sample in samples: - if sample not in db_samples.values(): - raise NanocomporeError(f"Sample '{sample}' not present in DB") - # Test if Fasta can be opened try: with Fasta(fasta_fn): @@ -100,54 +84,59 @@ def __init__(self, except IOError: raise NanocomporeError("The fasta file cannot be opened") - # Set up filters by adding conditions for DB query - select = ["reads.id AS readid", "sampleid", "transcriptid", "transcripts.name AS transcriptname"] - where = [] - # Get reads only from a subset of samples? - if len(cond_dict) != len(db_samples): - where = ["sampleid IN (%s)" % ", ".join(map(str, db_samples))] - - if select_ref_id: - select.append("reads.name AS readname") - where.append("readname IN ('%s')" % "', '".join(select_ref_id)) - elif exclude_ref_id: - select.append("reads.name AS readname") - where.append("readname NOT IN ('%s')" % "', '".join(exclude_ref_id)) - - if max_invalid_kmers_freq is not None: - if max_invalid_kmers_freq < 1.0: - select.append("1.0 - CAST(valid_kmers AS REAL) / kmers AS invalid_freq") - where.append(f"invalid_freq <= {max_invalid_kmers_freq}") - else: - if max_NNNNN_freq < 1.0: - select.append("CAST(NNNNN_kmers AS REAL) / kmers AS NNNNN_freq") - where.append(f"NNNNN_freq <= {max_NNNNN_freq}") - if max_mismatching_freq < 1.0: - select.append("CAST(mismatch_kmers AS REAL) / kmers AS mismatch_freq") - where.append(f"mismatch_freq <= {max_mismatching_freq}") - if max_missing_freq < 1.0: - select.append("CAST(missing_kmers AS REAL) / kmers AS missing_freq") - where.append(f"missing_freq <= {max_missing_freq}") - - query = "SELECT %s FROM reads LEFT JOIN transcripts ON transcriptid = transcripts.id" % \ - ", ".join(select) - if where: - query += " WHERE %s" % " AND ".join(where) - - # dict. structure: transcript -> condition -> sample -> list of reads - ref_reads = {} - logger.info("Querying reads from DB") - try: - self.__cursor.execute(query) - for row in self.__cursor: - read_id = row["readid"] - sample_id = row["sampleid"] - condition = cond_dict[db_samples[sample_id]] - ref_id = row["transcriptname"] - ref_reads.setdefault(ref_id, {}).setdefault(condition, {}).setdefault(sample_id, []).append(read_id) - except Exception: - logger.error("Error querying reads from DB") - raise Exception + # Database interaction + with DatabaseWrapper(db_path) as db: + db_samples = db.get_samples(sample_dict) + + # Set up filters by adding conditions for DB query + select = ["reads.id AS readid", "sampleid", "transcriptid", "transcripts.name AS transcriptname"] + where = [] + # Get reads only from a subset of samples? 
+ if len(cond_dict) != len(db_samples): + where = ["sampleid IN (%s)" % ", ".join(map(str, db_samples))] + + if select_ref_id: + select.append("reads.name AS readname") + where.append("readname IN ('%s')" % "', '".join(select_ref_id)) + elif exclude_ref_id: + select.append("reads.name AS readname") + where.append("readname NOT IN ('%s')" % "', '".join(exclude_ref_id)) + + if max_invalid_kmers_freq is not None: + if max_invalid_kmers_freq < 1.0: + select.append("1.0 - CAST(valid_kmers AS REAL) / kmers AS invalid_freq") + where.append(f"invalid_freq <= {max_invalid_kmers_freq}") + else: + if max_NNNNN_freq < 1.0: + select.append("CAST(NNNNN_kmers AS REAL) / kmers AS NNNNN_freq") + where.append(f"NNNNN_freq <= {max_NNNNN_freq}") + if max_mismatching_freq < 1.0: + select.append("CAST(mismatch_kmers AS REAL) / kmers AS mismatch_freq") + where.append(f"mismatch_freq <= {max_mismatching_freq}") + if max_missing_freq < 1.0: + select.append("CAST(missing_kmers AS REAL) / kmers AS missing_freq") + where.append(f"missing_freq <= {max_missing_freq}") + + query = "SELECT %s FROM reads LEFT JOIN transcripts ON transcriptid = transcripts.id" % \ + ", ".join(select) + if where: + query += " WHERE %s" % " AND ".join(where) + + # dict. structure: transcript -> condition -> sample -> list of reads + ref_reads = {} + logger.info("Querying reads from DB") + try: + db.cursor.execute(query) + for row in db.cursor: + read_id = row["readid"] + sample_id = row["sampleid"] + condition = cond_dict[db_samples[sample_id]] + ref_id = row["transcriptname"] + ref_reads.setdefault(ref_id, {}).setdefault(condition, {}).\ + setdefault(sample_id, []).append(read_id) + except Exception: + logger.error("Error querying reads from DB") + raise Exception # Filtering at transcript level logger.info("Filtering out references with low coverage") @@ -197,24 +186,6 @@ def ref_id_list(self): #~~~~~~~~~~~~~~PRIVATE METHODS~~~~~~~~~~~~~~# - def __open_db_connection(self): - try: - logger.debug("Connecting to DB") - self.__connection = sqlite3.connect(self.__db_path) - self.__connection.row_factory = sqlite3.Row - self.__cursor = self.__connection.cursor() - except: - logger.error("Error connecting to database") - raise - - def __close_db_connection(self): - if self.__connection: - logger.debug("Closing connection to DB") - self.__connection.commit() - self.__connection.close() - self.__connection = None - self.__cursor = None - def __select_ref(self, ref_reads, min_coverage, From 6121376ad845c6e143e6f379ce081535b229a278 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 17 Mar 2021 13:24:40 +0000 Subject: [PATCH 09/49] Whitelist: fix filtering condition for sample subsets --- nanocompore/Whitelist.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py index 5adfbdd..4502680 100755 --- a/nanocompore/Whitelist.py +++ b/nanocompore/Whitelist.py @@ -88,11 +88,18 @@ def __init__(self, with DatabaseWrapper(db_path) as db: db_samples = db.get_samples(sample_dict) + # How many samples are in the DB? If we want all, we don't need a constraint below. + try: + db_sample_count = db.cursor.execute("SELECT COUNT(*) FROM samples").fetchone()[0] + except Exception: + logger.error("Error counting samples in database") + raise Exception + # Set up filters by adding conditions for DB query select = ["reads.id AS readid", "sampleid", "transcriptid", "transcripts.name AS transcriptname"] where = [] # Get reads only from a subset of samples? 
- if len(cond_dict) != len(db_samples): + if len(db_samples) < db_sample_count: where = ["sampleid IN (%s)" % ", ".join(map(str, db_samples))] if select_ref_id: @@ -124,7 +131,7 @@ def __init__(self, # dict. structure: transcript -> condition -> sample -> list of reads ref_reads = {} - logger.info("Querying reads from DB") + logger.info("Querying reads from database") try: db.cursor.execute(query) for row in db.cursor: @@ -135,7 +142,7 @@ def __init__(self, ref_reads.setdefault(ref_id, {}).setdefault(condition, {}).\ setdefault(sample_id, []).append(read_id) except Exception: - logger.error("Error querying reads from DB") + logger.error("Error querying reads from database") raise Exception # Filtering at transcript level From 989af13922f5ffeda743d7d97de56eb959432293 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 17 Mar 2021 20:19:39 +0000 Subject: [PATCH 10/49] consolidate database code in 'DataStore', remove 'DatabaseWrapper' (not yet tested) --- nanocompore/DataStore.py | 109 +++++++++++++++++++---------- nanocompore/DatabaseWrapper.py | 61 ---------------- nanocompore/Eventalign_collapse.py | 3 +- nanocompore/Whitelist.py | 4 +- 4 files changed, 76 insertions(+), 101 deletions(-) delete mode 100644 nanocompore/DatabaseWrapper.py diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index c27e779..9d55726 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -1,16 +1,17 @@ # -*- coding: utf-8 -*- -from collections import * +from enum import Enum import datetime import os -import sqlite3 as lite +import sqlite3 +import contextlib # Third party from loguru import logger -import nanocompore as pkg +from nanocompre.common import NanoporeError class DataStore(object): - """ Init analysis and check args""" + """Store Nanocompore data in an SQLite database""" create_reads_query = ("CREATE TABLE IF NOT EXISTS reads (" "id INTEGER NOT NULL PRIMARY KEY," @@ -63,55 +64,60 @@ class DataStore(object): ")" ) + class DBCreateMode(Enum): + """Options for handling (non-) existence of the SQLite database file""" + MUST_EXIST = "r" # open for reading, error if file doesn't exist + CREATE_MAYBE = "a" # use an existing database, otherwise create one + OVERWRITE = "w" # always create a new database, overwrite if it exists - def __init__(self, db_path:str): - self.__db_path=db_path - db_is_new = not os.path.exists(self.__db_path) - if db_is_new: - logger.info("Creating new database") - else: - logger.info("Using existing database") - if db_is_new: self.__init_db() - - def __enter__(self): - self.__open_db_connection() - return self - def __exit__(self, exc_type, exc_value, traceback): - self.__connection.commit() - self.__close_db_connection() + def __init__(self, + db_path:str, + create_mode=DBCreateMode.MUST_EXIST): + self.__db_path = db_path + self.__create_mode = create_mode + self.__connection = None + self.__cursor = None - def __open_db_connection(self): + def __enter__(self): + if self.__create_mode == DBCreateMode.MUST_EXIST and not os.path.exists(self.__db_path): + raise NanocomporeError(f"Database file '{self.__db_path}' does not exist") + if self.__create_mode == DBCreateMode.OVERWRITE: + with contextlib.suppress(FileNotFoundError): # file may not exist + os.remove(self.__db_path) + logger.debug(f"Removed existing database file '{self.__db_path}'") try: - logger.debug("Connecting to DB") - self.__connection = lite.connect(self.__db_path) + logger.debug("Connecting to database") + self.__connection = sqlite3.connect(self.__db_path) + 
self.__connection.row_factory = sqlite3.Row self.__cursor = self.__connection.cursor() except: logger.error("Error connecting to database") raise + if self.__create_mode == DBCreateMode.OVERWRITE or \ + (self.__create_mode == DBCreateMode.CREATE_MAYBE and not os.path.exists(self.__db_path)): + self.__init_db() + return self - def __close_db_connection(self): + def __exit__(self, exc_type, exc_value, traceback): if self.__connection: - logger.debug("Closing connection to DB") + logger.debug("Closing database connection") self.__connection.commit() self.__connection.close() self.__connection = None self.__cursor = None - def __init_db(self): - logger.debug("Setting up DB tables") - self.__open_db_connection() + def __init_db(self): + logger.debug("Setting up database tables") try: self.__cursor.execute(self.create_reads_query) self.__cursor.execute(self.create_kmers_query) self.__cursor.execute(self.create_samples_query) self.__cursor.execute(self.create_transcripts_query) + self.__connection.commit() except: - self.__close_db_connection() - logger.error("Error creating tables") + logger.error("Error creating database tables") raise - self.__connection.commit() - self.__close_db_connection() def store_read(self, read): """ @@ -130,7 +136,7 @@ def store_read(self, read): values) read_id = self.__cursor.lastrowid except Exception: - logger.error("Error inserting read into DB") + logger.error("Error inserting read into database") raise Exception for kmer in read.kmer_l: @@ -152,7 +158,7 @@ def __store_kmer(self, kmer, read_id): res["num_signals"], res["status"], res["dwell_time"], res["NNNNN_dwell_time"], res["mismatch_dwell_time"], res["median"], res["mad"])) except Exception: - logger.error("Error inserting kmer into DB") + logger.error("Error inserting kmer into database") raise Exception def get_transcript_id_by_name(self, tx_name, create_if_not_exists=False): @@ -169,7 +175,7 @@ def get_transcript_id_by_name(self, tx_name, create_if_not_exists=False): try: self.__cursor.execute(query) except Exception: - logger.error("There was an error while inserting a new transcript in the DB") + logger.error("Error while inserting transcript into the database") raise Exception query = f"SELECT id from transcripts WHERE name = '{tx_name}'" @@ -178,7 +184,7 @@ def get_transcript_id_by_name(self, tx_name, create_if_not_exists=False): record = self.__cursor.fetchone() self.__connection.commit() except Exception: - logger.error("There was an error while selecting the transcript_id from the DB") + logger.error("Error while selecting transcript ID from the database") raise Exception if record is not None: return record[0] @@ -199,7 +205,7 @@ def get_sample_id_by_name(self, sample_name, create_if_not_exists=False): try: self.__cursor.execute(query) except Exception: - logger.error("There was an error while inserting a new sample in the DB") + logger.error("Error while inserting sample into the database") raise Exception query = f"SELECT id from samples WHERE name = '{sample_name}'" @@ -208,9 +214,38 @@ def get_sample_id_by_name(self, sample_name, create_if_not_exists=False): record = self.__cursor.fetchone() self.__connection.commit() except Exception: - logger.error("There was an error while selecting the sample_id from the DB") + logger.error("Error while selecting sample ID from the database") raise Exception if record is not None: return record[0] else: return None + + @property + def cursor(self): + return self.__cursor + + def get_samples(self, sample_dict=None): + if not self.__connection: + raise 
NanocomporeError("Database connection not yet opened") + expected_samples = [] + if sample_dict: # query only relevant samples + for samples in sample_dict.values(): + expected_samples += samples + if not expected_samples: + raise NanocomporeError("No sample names in 'sample_dict'") + where = " WHERE name IN ('%s')" % "', '".join(expected_samples) + else: + where = "" + db_samples = {} + try: + self.__cursor.execute("SELECT * FROM samples" + where) + for row in self.__cursor: + db_samples[row["id"]] = row["name"] + except Exception: + logger.error("Error reading sample names from database") + raise Exception + for sample in expected_samples: # check that requested samples are in DB + if sample not in db_samples.values(): + raise NanocomporeError(f"Sample '{sample}' not present in database") + return db_samples diff --git a/nanocompore/DatabaseWrapper.py b/nanocompore/DatabaseWrapper.py deleted file mode 100644 index 0b5f2a1..0000000 --- a/nanocompore/DatabaseWrapper.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- - -import sqlite3 -from loguru import logger -from nanocompore.common import NanocomporeError - - -class DatabaseWrapper(object): - - def __init__(self, db_path): - self.__db_path = db_path - self.__connection = None - self.__cursor = None - - def __enter__(self): - try: - logger.debug("Connecting to database") - self.__connection = sqlite3.connect(self.__db_path) - self.__connection.row_factory = sqlite3.Row - self.__cursor = self.__connection.cursor() - except: - logger.error("Error connecting to database") - raise - return self - - def __exit__(self, exc_type, exc_value, traceback): - if self.__connection: - logger.debug("Closing database connection") - self.__connection.commit() - self.__connection.close() - self.__connection = None - self.__cursor = None - - @property - def cursor(self): - return self.__cursor - - def get_samples(self, sample_dict=None): - if not self.__connection: - raise NanocomporeError("Database connection not yet opened") - expected_samples = [] - if sample_dict: # query only relevant samples - for samples in sample_dict.values(): - expected_samples += samples - if not expected_samples: - raise NanocomporeError("No sample names in 'sample_dict'") - where = " WHERE name IN ('%s')" % "', '".join(expected_samples) - else: - where = "" - db_samples = {} - try: - self.cursor.execute("SELECT * FROM samples" + where) - for row in self.cursor: - db_samples[row["id"]] = row["name"] - except Exception: - logger.error("Error reading sample names from database") - raise Exception - for sample in expected_samples: # check that requested samples are in DB - if sample not in db_samples.values(): - raise NanocomporeError(f"Sample '{sample}' not present in database") - return db_samples diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py index 826f941..bd80b6f 100644 --- a/nanocompore/Eventalign_collapse.py +++ b/nanocompore/Eventalign_collapse.py @@ -241,8 +241,9 @@ def __write_output_to_db (self, out_q, error_q): pr = profile.Profile() pr.enable() n_reads = 0 + db_path = os.path.join(self.__outpath, self.__outprefix+"_nanocompore.db") try: - with DataStore(db_path=os.path.join(self.__outpath, self.__outprefix+"_nanocompore.db")) as datastore, tqdm (unit=" reads") as pbar: + with DataStore(db_path, DataStore.DBCreateMode.CREATE_MAYBE) as datastore, tqdm (unit=" reads") as pbar: # Iterate over out queue until nthread poison pills are found for _ in range (self.__nthreads): for read in iter (out_q.get, None): diff --git 
a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py index 4502680..d6428fe 100755 --- a/nanocompore/Whitelist.py +++ b/nanocompore/Whitelist.py @@ -15,7 +15,7 @@ # Local package from nanocompore.common import * -from nanocompore.DatabaseWrapper import DatabaseWrapper +from nanocompore.DataStore import DataStore # Set global random seed downsample_random_seed = 42 @@ -85,7 +85,7 @@ def __init__(self, raise NanocomporeError("The fasta file cannot be opened") # Database interaction - with DatabaseWrapper(db_path) as db: + with DataStore(db_path, DataStore.DBCreateMode.MUST_EXIST) as db: db_samples = db.get_samples(sample_dict) # How many samples are in the DB? If we want all, we don't need a constraint below. From 6dbbdb2e1d515191f29a56c4b0e39a0edaa12b13 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 23 Mar 2021 19:47:07 +0000 Subject: [PATCH 11/49] DataStore: small fixes, move 'DBCreateMode' (enum) to top level --- nanocompore/DataStore.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 9d55726..1749ff7 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -8,7 +8,15 @@ # Third party from loguru import logger -from nanocompre.common import NanoporeError +from nanocompore.common import NanocomporeError + + +class DBCreateMode(Enum): + """Options for handling (non-) existence of the SQLite database file""" + MUST_EXIST = "r" # open for reading, error if file doesn't exist + CREATE_MAYBE = "a" # use an existing database, otherwise create one + OVERWRITE = "w" # always create a new database, overwrite if it exists + class DataStore(object): """Store Nanocompore data in an SQLite database""" @@ -50,13 +58,14 @@ class DataStore(object): ")" ) # TODO: 'sequence' is stored redundantly - move it to a separate table + # TODO: encode 'status' as int to save space (foreign key referencing a table with all possible statuses) create_samples_query = ("CREATE TABLE IF NOT EXISTS samples (" "id INTEGER NOT NULL PRIMARY KEY," "name VARCHAR NOT NULL UNIQUE" ")" ) - + # TODO: add 'condition' column create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts (" "id INTEGER NOT NULL PRIMARY KEY," @@ -64,13 +73,6 @@ class DataStore(object): ")" ) - class DBCreateMode(Enum): - """Options for handling (non-) existence of the SQLite database file""" - MUST_EXIST = "r" # open for reading, error if file doesn't exist - CREATE_MAYBE = "a" # use an existing database, otherwise create one - OVERWRITE = "w" # always create a new database, overwrite if it exists - - def __init__(self, db_path:str, create_mode=DBCreateMode.MUST_EXIST): @@ -107,7 +109,7 @@ def __exit__(self, exc_type, exc_value, traceback): self.__connection = None self.__cursor = None - def __init_db(self): + def __init_db(self): logger.debug("Setting up database tables") try: self.__cursor.execute(self.create_reads_query) From 031a9955f000966b4496a1e5464bbdcea9e66559 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 23 Mar 2021 19:48:10 +0000 Subject: [PATCH 12/49] Eventalign_collapse: small fix in 'DataStore' call --- nanocompore/Eventalign_collapse.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py index bd80b6f..dd03b42 100644 --- a/nanocompore/Eventalign_collapse.py +++ b/nanocompore/Eventalign_collapse.py @@ -19,7 +19,7 @@ # Local imports from nanocompore.common import * from nanocompore.SuperParser import 
SuperParser
-from nanocompore.DataStore import DataStore
+from nanocompore.DataStore import DataStore, DBCreateMode
 
 # Disable multithreading for MKL and openBlas
 os.environ["MKL_NUM_THREADS"] = "1"
@@ -90,6 +90,7 @@ def __init__(self,
         self.__change_colnames = {"contig":"ref_id" ,"position":"ref_pos", "read_name":"read_id", "samples":"sample_list", "event_length":"dwell_time"}
         self.__cast_colnames = {"ref_pos":int, "dwell_time":np.float32, "sample_list":lambda x: [float(i) for i in x.split(",")]}
 
+
     def __call__(self):
         """
         Run the analysis
@@ -140,6 +141,7 @@ def __call__(self):
                 logger.error("An error occured while trying to kill processes\n")
                 raise E
 
+
     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PRIVATE METHODS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
     def __split_reads (self, in_q, error_q):
         """
@@ -195,6 +197,7 @@ def __split_reads (self, in_q, error_q):
             in_q.put(None)
         logger.debug("Parsed Reads:{} Events:{}".format(n_reads, n_events))
 
+
     def __process_read (self, in_q, out_q, error_q):
         """
         Multi-threaded workers collapsing events at kmer level
@@ -232,7 +235,8 @@ def __process_read (self, in_q, out_q, error_q):
         logger.debug("Processed Reads:{} Kmers:{} Events:{} Signals:{}".format(n_reads, n_kmers, n_events, n_signals))
         out_q.put(None)
 
-    def __write_output_to_db (self, out_q, error_q):
+
+    def __write_output_to_db(self, out_q, error_q):
         """
         Mono-threaded Writer
         """
@@ -243,7 +247,7 @@ def __write_output_to_db (self, out_q, error_q):
         n_reads = 0
         db_path = os.path.join(self.__outpath, self.__outprefix+"_nanocompore.db")
         try:
-            with DataStore(db_path, DataStore.DBCreateMode.CREATE_MAYBE) as datastore, tqdm (unit=" reads") as pbar:
+            with DataStore(self.__db_path, DBCreateMode.CREATE_MAYBE) as datastore, tqdm (unit=" reads") as pbar:
                 # Iterate over out queue until nthread poison pills are found
                 for _ in range (self.__nthreads):
                     for read in iter (out_q.get, None):
@@ -261,6 +265,7 @@ def __write_output_to_db(self, out_q, error_q):
         pr.disable()
         pr.dump_stats("prof")
 
+
     def __write_output (self, out_q, error_q):
         """
         Mono-threaded Writer
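The patch above relies on the `DBCreateMode` enum introduced in patches 10 and 11. A short sketch of how the three modes behave when opening a `DataStore` (the database path is a placeholder):

    from nanocompore.DataStore import DataStore, DBCreateMode

    db_path = "results/out_nanocompore.db"  # placeholder path

    # MUST_EXIST (the default): raise an error if the file is missing
    with DataStore(db_path, DBCreateMode.MUST_EXIST) as db:
        samples = db.get_samples()  # {id: name} for every sample in the database

    # CREATE_MAYBE: reuse an existing file, or create it (and its tables) if absent
    with DataStore(db_path, DBCreateMode.CREATE_MAYBE) as db:
        pass

    # OVERWRITE: delete any existing file first, then start a fresh database
    with DataStore(db_path, DBCreateMode.OVERWRITE) as db:
        pass

Commits happen in `__exit__`, so the context-manager form is the intended way to use the class.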
From 9cc08069bcbbd376e57cf56aa8c2e924ab001cc2 Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Tue, 23 Mar 2021 19:49:46 +0000
Subject: [PATCH 13/49] Whitelist: updated 'DataStore' call

---
 nanocompore/Whitelist.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py
index d6428fe..a0b6183 100755
--- a/nanocompore/Whitelist.py
+++ b/nanocompore/Whitelist.py
@@ -85,7 +85,7 @@ def __init__(self,
             raise NanocomporeError("The fasta file cannot be opened")
 
         # Database interaction
-        with DataStore(db_path, DataStore.DBCreateMode.MUST_EXIST) as db:
+        with DataStore(db_path) as db:
             db_samples = db.get_samples(sample_dict)
 
             # How many samples are in the DB? If we want all, we don't need a constraint below.
@@ -129,7 +129,7 @@ def __init__(self,
             if where:
                 query += " WHERE %s" % " AND ".join(where)
 
-            # dict. structure: transcript -> condition -> sample -> list of reads
+            # Dict. structure: transcript -> condition -> sample -> list of reads
             ref_reads = {}
             logger.info("Querying reads from database")
             try:
@@ -181,7 +181,7 @@ def __len__(self):
 
     def __iter__(self):
        for i, j in self.ref_reads.items():
-            yield(i,j)
+            yield (i, j)
 
    def __getitem__(self, items):
        return self.ref_reads.get(items, None)
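After this change, whitelisting opens the database in the default `MUST_EXIST` mode. For reference, a hypothetical end-to-end call of the reworked `Whitelist` (paths and sample names are placeholders):

    from nanocompore.Whitelist import Whitelist

    sample_dict = {"control": ["C1", "C2"], "treatment": ["T1", "T2"]}  # two conditions
    whitelist = Whitelist("results/out_nanocompore.db",  # database written by Eventalign_collapse
                          sample_dict,
                          fasta_fn="reference.fa",       # placeholder reference
                          min_coverage=10,
                          max_invalid_kmers_freq=0.1)
    for ref_id, ref_dict in whitelist:
        # ref_dict maps condition -> sample -> list of read IDs that passed the filters
        pass

The next patch reworks `SampComp` to consume exactly this structure together with the kmer table.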
From a1cb349b6f49ad9a4f6c30e1ac0cd8fcd1adf241 Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Tue, 23 Mar 2021 19:54:39 +0000
Subject: [PATCH 14/49] SampComp: get data from SQLite DB; some refactoring

---
 nanocompore/SampComp.py | 368 ++++++++++++++++------------------------
 1 file changed, 142 insertions(+), 226 deletions(-)

diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py
index 108f779..41771d9 100644
--- a/nanocompore/SampComp.py
+++ b/nanocompore/SampComp.py
@@ -18,6 +18,7 @@
 # Local package
 from nanocompore.common import *
+from nanocompore.DataStore import DataStore
 from nanocompore.Whitelist import Whitelist
 from nanocompore.TxComp import txCompare
 from nanocompore.SampCompDB import SampCompDB
@@ -37,36 +38,38 @@ class SampComp(object):
 
     #~~~~~~~~~~~~~~FUNDAMENTAL METHODS~~~~~~~~~~~~~~#
     def __init__(self,
-        eventalign_fn_dict:dict,
-        fasta_fn:str,
-        bed_fn:str = None,
-        outpath:str = "results",
-        outprefix:str = "out_",
-        overwrite:bool = False,
-        whitelist:Whitelist = None,
-        comparison_methods:list = ["GMM", "KS"],
-        logit:bool = True,
-        anova:bool = False,
-        allow_warnings:bool = False,
-        sequence_context:int = 0,
-        sequence_context_weights:str = "uniform",
-        min_coverage:int = 30,
-        min_ref_length:int = 100,
-        downsample_high_coverage:int = 5000,
-        max_invalid_kmers_freq:float = 0.1,
-        select_ref_id:list = [],
-        exclude_ref_id:list = [],
-        nthreads:int = 3,
-        progress:bool = False):
+                 db_path:str,
+                 sample_dict:dict,
+                 fasta_fn:str,
+                 bed_fn:str = None,
+                 outpath:str = "results",
+                 outprefix:str = "out",
+                 overwrite:bool = False,
+                 whitelist:Whitelist = None,
+                 comparison_methods:list = ["GMM", "KS"],
+                 logit:bool = True,
+                 anova:bool = False,
+                 allow_warnings:bool = False,
+                 sequence_context:int = 0,
+                 sequence_context_weights:str = "uniform",
+                 min_coverage:int = 30,
+                 min_ref_length:int = 100,
+                 downsample_high_coverage:int = 5000,
+                 max_invalid_kmers_freq:float = 0.1,
+                 select_ref_id:list = [],
+                 exclude_ref_id:list = [],
+                 nthreads:int = 3,
+                 progress:bool = False):
         """
         Initialise a `SampComp` object and generate a white list of references with sufficient coverage for subsequent analysis.
         The returned object can then be called to start the analysis.
-        * eventalign_fn_dict
-            Multilevel dictionnary indicating the condition_label, sample_label and file name of the eventalign_collapse output.
-            2 conditions are expected and at least 2 sample replicates per condition are highly recommended.
-            One can also pass YAML file describing the samples instead.
-            Example `d = {"S1": {"R1":"path1.tsv", "R2":"path2.tsv"}, "S2": {"R1":"path3.tsv", "R2":"path4.tsv"}}`
+        Args:
+        * db_path
+            Path to the SQLite database file with event-aligned read/kmer data
+        * sample_dict
+            Dictionary containing lists of (unique) sample names, grouped by condition
+            example d = {"control": ["C1", "C2"], "treatment": ["T1", "T2"]}
         * outpath
             Path to the output folder.
         * outprefix
@@ -113,17 +116,18 @@ def __init__(self,
         # Save init options in dict for later
         log_init_state(loc=locals())
 
-        # If eventalign_fn_dict is not a dict try to load a YAML file instead
+        # TODO: remove this? (may be better handled in '__main__.py', if needed)
+        # If 'sample_dict' is not a dict try to load a YAML file instead
         if type(sample_dict) == str:
             logger.debug("Parsing YAML file")
-            if not access_file(eventalign_fn_dict):
-                raise NanocomporeError("{} is not a valid file".format(eventalign_fn_dict))
-            with open(eventalign_fn_dict, "r") as fp:
-                eventalign_fn_dict = yaml.load(fp, Loader=yaml.SafeLoader)
+            if not access_file(sample_dict):
+                raise NanocomporeError("{} is not a valid file".format(sample_dict))
+            with open(sample_dict, "r") as fp:
+                sample_dict = yaml.load(fp, Loader=yaml.SafeLoader)
 
         # Check eventalign_dict file paths and labels
-        eventalign_fn_dict = self.__check_eventalign_fn_dict(eventalign_fn_dict)
-        logger.debug(eventalign_fn_dict)
+        check_sample_dict(sample_dict)
+        logger.debug(sample_dict)
 
         # Check if fasta and bed files exist
         if not access_file(fasta_fn):
@@ -153,15 +157,15 @@ def __init__(self,
                     raise NanocomporeError("Invalid comparison method {}".format(method))
 
         if not whitelist:
-            whitelist = Whitelist(
-                eventalign_fn_dict = eventalign_fn_dict,
-                fasta_fn = fasta_fn,
-                min_coverage = min_coverage,
-                min_ref_length = min_ref_length,
-                downsample_high_coverage = downsample_high_coverage,
-                max_invalid_kmers_freq = max_invalid_kmers_freq,
-                select_ref_id = select_ref_id,
-                exclude_ref_id = exclude_ref_id)
+            whitelist = Whitelist(db_path,
+                                  sample_dict,
+                                  fasta_fn,
+                                  min_coverage = min_coverage,
+                                  min_ref_length = min_ref_length,
+                                  downsample_high_coverage = downsample_high_coverage,
+                                  max_invalid_kmers_freq = max_invalid_kmers_freq,
+                                  select_ref_id = select_ref_id,
+                                  exclude_ref_id = exclude_ref_id)
         elif not isinstance(whitelist, Whitelist):
             raise NanocomporeError("Whitelist is not valid")
@@ -171,10 +175,11 @@ def __init__(self,
         self.__max_invalid_kmers_freq = whitelist._Whitelist__max_invalid_kmers_freq
 
         # Save private args
-        self.__eventalign_fn_dict = eventalign_fn_dict
-        self.__db_fn = os.path.join(outpath, outprefix+"SampComp.db")
+        self.__db_path = db_path
+        self.__sample_dict = sample_dict
         self.__fasta_fn = fasta_fn
         self.__bed_fn = bed_fn
+        self.__db_fn = os.path.join(outpath, outprefix + "_SampComp.db")
         self.__whitelist = whitelist
         self.__comparison_methods = comparison_methods
         self.__logit = logit
@@ -186,11 +191,10 @@ def __init__(self,
         self.__progress = progress
 
         # Get number of samples
-        n = 0
-        for sample_dict in self.__eventalign_fn_dict.values():
-            for sample_lab in sample_dict.keys():
-                n+=1
-        self.__n_samples = n
+        self.__n_samples = 0
+        for samples in sample_dict.values():
+            self.__n_samples += len(samples)
+
 
     def __call__(self):
         """
@@ -244,6 +248,68 @@ def __call__(self):
                 logger.error("An error occured while trying to kill processes\n")
                 raise E
 
+
+    def process_transcript(self, tx_id, whitelist_reads):
+        """Process a transcript given filtered reads from Whitelist"""
+        logger.debug(f"Processing transcript: {tx_id}")
+
+        # Kmer data from whitelisted reads from all samples for this transcript
+        # Structure: kmer position -> condition -> sample -> data
+        kmer_data = defaultdict(lambda: {condition:
+                                         defaultdict(lambda: {"intensity": [],
+                                                              "dwell": [],
+                                                              "coverage": 0,
+                                                              "kmers_stats": {"valid": 0,
+                                                                              # "missing": 0, # TODO: needed?
+                                                                              "NNNNN": 0,
+                                                                              "mismatching": 0}})
+                                         for condition in self.__sample_dict})
+        n_reads = n_kmers = 0
+
+        # Read kmer data from database
+        with DataStore(self.__db_path) as db:
+            for cond_lab, sample_dict in whitelist_reads.items():
+                for sample_id, read_ids in sample_dict.items():
+                    if not read_ids: continue # TODO: error?
+ n_reads += len(read_ids) + values = ", ".join([str(read_id) for read_id in read_ids]) + query = f"SELECT * FROM kmers WHERE readid IN ({values})" + for row in db.cursor.execute(query): + n_kmers += 1 + pos = row["position"] + # TODO: check that kmer seq. agrees with FASTA? + data = kmer_data[pos][cond_lab][sample_id] + data["intensity"].append(row["median"]) + data["dwell"].append(row["dwell_time"]) + data["coverage"] += 1 + status = row["status"] + data["kmers_stats"][status] += 1 + + logger.debug(f"Data loaded for transcript: {tx_id}") + test_results = {} + if self.__comparison_methods: + random_state = np.random.RandomState(seed=42) + test_results = txCompare(tx_id, + kmer_data, + random_state=random_state, + methods=self.__comparison_methods, + sequence_context=self.__sequence_context, + sequence_context_weights=self.__sequence_context_weights, + min_coverage= self.__min_coverage, + allow_warnings=self.__allow_warnings, + logit=self.__logit, + anova=self.__anova) + + # Remove 'default_factory' functions from 'kmer_data' to enable pickle/multiprocessing + kmer_data.default_factory = None + for pos_dict in kmer_data.values(): + for cond_dict in pos_dict.values(): + cond_dict.default_factory = None + + return {"kmer_data": kmer_data, "test_results": test_results, + "n_reads": n_reads, "n_kmers": n_kmers} + + #~~~~~~~~~~~~~~PRIVATE MULTIPROCESSING METHOD~~~~~~~~~~~~~~# def __list_refid(self, in_q, error_q): """Add valid refid from whitelist to input queue to dispatch the data among the workers""" @@ -252,7 +318,7 @@ def __list_refid(self, in_q, error_q): for ref_id, ref_dict in self.__whitelist: logger.debug("Adding {} to in_q".format(ref_id)) in_q.put((ref_id, ref_dict)) - n_tx+=1 + n_tx += 1 # Manage exceptions and add error trackback to error queue except Exception: @@ -265,101 +331,26 @@ def __list_refid(self, in_q, error_q): in_q.put(None) logger.debug("Parsed transcripts:{}".format(n_tx)) + def __process_references(self, in_q, out_q, error_q): """ Consume ref_id, agregate intensity and dwell time at position level and perform statistical analyses to find significantly different regions """ - n_tx = n_reads = n_lines = 0 + n_tx = n_reads = n_kmers = 0 try: logger.debug("Worker thread started") - # Open all files for reading. 
File pointer are stored in a dict matching the ref_dict entries - fp_dict = self.__eventalign_fn_open() - - # Process refid in input queue + # Process references in input queue for ref_id, ref_dict in iter(in_q.get, None): - logger.debug("Worker thread processing new item from in_q: {}".format(ref_id)) - # Create an empty dict for all positions first - ref_pos_list = self.__make_ref_pos_list(ref_id) - - for cond_lab, sample_dict in ref_dict.items(): - for sample_lab, read_list in sample_dict.items(): - fp = fp_dict[cond_lab][sample_lab] - - for read in read_list: - - # Move to read, save read data chunk and reset file pointer - fp.seek(read["byte_offset"]) - line_list = fp.read(read["byte_len"]).split("\n") - fp.seek(0) - - # Check read_id ref_id concordance between index and data file - header = numeric_cast_list(line_list[0][1:].split("\t")) - if not header[0] == read["read_id"] or not header[1] == read["ref_id"]: - raise NanocomporeError("Index and data files are not matching:\n{}\n{}".format(header, read)) - - # Extract col names from second line - col_names = line_list[1].split("\t") - # Check that all required fields are present - if not all_values_in (["ref_pos", "ref_kmer", "median", "dwell_time"], col_names): - raise NanocomporeError("Required fields not found in the data file: {}".format(col_names)) - # Verify if kmers events stats values are present or not - kmers_stats = all_values_in (["NNNNN_dwell_time", "mismatch_dwell_time"], col_names) - - # Parse data files kmers per kmers - prev_pos = None - for line in line_list[2:]: - # Transform line to dict and cast str numbers to actual numbers - kmer = numeric_cast_dict(keys=col_names, values=line.split("\t")) - pos = kmer["ref_pos"] - - # Check consistance between eventalign data and reference sequence - if kmer["ref_kmer"] != ref_pos_list[pos]["ref_kmer"]: - ref_pos_list[pos]["ref_kmer"] = ref_pos_list[pos]["ref_kmer"]+"!!!!" 
- #raise NanocomporeError ("Data reference kmer({}) doesn't correspond to the reference sequence ({})".format(ref_pos_list[pos]["ref_kmer"], kmer["ref_kmer"])) - - # Fill dict with the current pos values - ref_pos_list[pos]["data"][cond_lab][sample_lab]["intensity"].append(kmer["median"]) - ref_pos_list[pos]["data"][cond_lab][sample_lab]["dwell"].append(kmer["dwell_time"]) - ref_pos_list[pos]["data"][cond_lab][sample_lab]["coverage"] += 1 - - if kmers_stats: - # Fill in the missing positions - if prev_pos and pos-prev_pos > 1: - for missing_pos in range(prev_pos+1, pos): - ref_pos_list[missing_pos]["data"][cond_lab][sample_lab]["kmers_stats"]["missing"] += 1 - # Also fill in with normalised position event stats - n_valid = (kmer["dwell_time"]-(kmer["NNNNN_dwell_time"]+kmer["mismatch_dwell_time"])) / kmer["dwell_time"] - n_NNNNN = kmer["NNNNN_dwell_time"] / kmer["dwell_time"] - n_mismatching = kmer["mismatch_dwell_time"] / kmer["dwell_time"] - ref_pos_list[pos]["data"][cond_lab][sample_lab]["kmers_stats"]["valid"] += n_valid - ref_pos_list[pos]["data"][cond_lab][sample_lab]["kmers_stats"]["NNNNN"] += n_NNNNN - ref_pos_list[pos]["data"][cond_lab][sample_lab]["kmers_stats"]["mismatching"] += n_mismatching - # Save previous position - prev_pos = pos - - n_lines+=1 - n_reads+=1 - - logger.debug("Data for {} loaded.".format(ref_id)) - if self.__comparison_methods: - random_state=np.random.RandomState(seed=42) - ref_pos_list = txCompare( - ref_id=ref_id, - ref_pos_list=ref_pos_list, - methods=self.__comparison_methods, - sequence_context=self.__sequence_context, - sequence_context_weights=self.__sequence_context_weights, - min_coverage= self.__min_coverage, - allow_warnings=self.__allow_warnings, - logit=self.__logit, - anova=self.__anova, - random_state=random_state) + logger.debug(f"Worker thread processing new item from in_q: {ref_id}") + results = self.process_transcript(ref_id, ref_dict) + n_tx += 1 + n_reads += results["n_reads"] + n_kmers += results["n_kmers"] # Add the current read details to queue - logger.debug("Adding %s to out_q"%(ref_id)) - out_q.put((ref_id, ref_pos_list)) - n_tx+=1 + logger.debug(f"Adding '{ref_id}' to out_q") + out_q.put((ref_id, results["kmer_data"], results["test_results"])) # Manage exceptions and add error trackback to error queue except Exception as e: @@ -368,34 +359,36 @@ def __process_references(self, in_q, out_q, error_q): # Deal poison pill and close file pointer finally: - logger.debug("Processed Transcrits:{} Reads:{} Lines:{}".format(n_tx, n_reads, n_lines)) + logger.debug(f"Processed {n_tx} transcripts, {n_reads} reads, {n_kmers} kmers") logger.debug("Adding poison pill to out_q") - self.__eventalign_fn_close(fp_dict) out_q.put(None) + def __write_output(self, out_q, error_q): # Get results out of the out queue and write in shelve pvalue_tests = set() ref_id_list = [] n_tx = n_pos = 0 try: - with shelve.open(self.__db_fn, flag='n') as db, tqdm(total=len(self.__whitelist), unit=" Processed References", disable= not self.__progress) as pbar: + with shelve.open(self.__db_fn, flag='n') as db, \ + tqdm(total=len(self.__whitelist), unit=" Processed References", + disable=not self.__progress) as pbar: # Iterate over the counter queue and process items until all poison pills are found for _ in range(self.__nthreads): - for ref_id, ref_pos_list in iter(out_q.get, None): + for ref_id, kmer_data, test_results in iter(out_q.get, None): ref_id_list.append(ref_id) logger.debug("Writer thread writing %s"%ref_id) - # Get pvalue fields available in analysed data 
before - for pos_dict in ref_pos_list: - if 'txComp' in pos_dict: - for res in pos_dict['txComp'].keys(): + # Get pvalue fields available in analysed data + for res_dict in test_results.values(): + if "txComp" in res_dict: + for res in res_dict["txComp"].keys(): if "pvalue" in res: - n_pos+=1 + n_pos += 1 pvalue_tests.add(res) # Write results in a shelve db - db [ref_id] = ref_pos_list + db[ref_id] = (kmer_data, test_results) pbar.update(1) - n_tx+=1 + n_tx += 1 # Write list of refid db["__ref_id_list"] = ref_id_list @@ -417,84 +410,7 @@ def __write_output(self, out_q, error_q): error_q.put(traceback.format_exc()) finally: - logger.debug("Written Transcripts:{} Valid positions:{}".format(n_tx, n_pos)) - logger.info ("All Done. Transcripts processed: {}".format(n_tx)) + logger.debug(f"Wrote {n_tx} transcripts, {n_pos} valid positions") + logger.info(f"All done. Transcripts processed: {n_tx}") # Kill error queue with poison pill error_q.put(None) - - #~~~~~~~~~~~~~~PRIVATE HELPER METHODS~~~~~~~~~~~~~~# - def __check_eventalign_fn_dict(self, d): - """""" - # Check that the number of condition is 2 and raise a warning if there are less than 2 replicates per conditions - if len(d) != 2: - raise NanocomporeError("2 conditions are expected. Found {}".format(len(d))) - for cond_lab, sample_dict in d.items(): - if len(sample_dict) == 1: - logger.info("Only 1 replicate found for condition {}".format(cond_lab)) - logger.info("This is not recommended. The statistics will be calculated with the logit method") - - # Test if files are accessible and verify that there are no duplicated replicate labels - duplicated_lab = False - rep_lab_list = [] - rep_fn_list = [] - for cond_lab, sd in d.items(): - for rep_lab, fn in sd.items(): - if not access_file(fn): - raise NanocomporeError("Cannot access eventalign file: {}".format(fn)) - if fn in rep_fn_list: - raise NanocomporeError("Duplicated eventalign file detected: {}".format(fn)) - if rep_lab in rep_lab_list: - duplicated_lab = True - rep_lab_list.append(rep_lab) - rep_fn_list.append(fn) - if not duplicated_lab: - return d - - # If duplicated replicate labels found, prefix labels with condition name - else: - logger.debug("Found duplicated labels in the replicate names. 
Prefixing with condition name") - d_clean = OrderedDict() - for cond_lab, sd in d.items(): - d_clean[cond_lab] = OrderedDict() - for rep_lab, fn in sd.items(): - d_clean[cond_lab]["{}_{}".format(cond_lab, rep_lab)] = fn - return d_clean - - def __eventalign_fn_open(self): - """""" - fp_dict = OrderedDict() - for cond_lab, sample_dict in self.__eventalign_fn_dict.items(): - fp_dict[cond_lab] = OrderedDict() - for sample_lab, fn in sample_dict.items(): - fp_dict[cond_lab][sample_lab] = open(fn, "r") - return fp_dict - - def __eventalign_fn_close(self, fp_dict): - """""" - for sample_dict in fp_dict.values(): - for fp in sample_dict.values(): - fp.close() - - def __make_ref_pos_list(self, ref_id): - """""" - ref_pos_list = [] - with Fasta(self.__fasta_fn) as fasta: - ref_fasta = fasta [ref_id] - ref_len = len(ref_fasta) - ref_seq = str(ref_fasta) - - for pos in range(ref_len-4): - pos_dict = OrderedDict() - pos_dict["ref_kmer"] = ref_seq[pos:pos+5] - pos_dict["data"] = OrderedDict() - for cond_lab, s_dict in self.__eventalign_fn_dict.items(): - pos_dict["data"][cond_lab] = OrderedDict() - for sample_lab in s_dict.keys(): - - pos_dict["data"][cond_lab][sample_lab] = { - "intensity":[], - "dwell":[], - "coverage":0, - "kmers_stats":{"missing":0,"valid":0,"NNNNN":0,"mismatching":0}} - ref_pos_list.append(pos_dict) - return ref_pos_list From fd0d4467ba5f7a8b20dad825a570aabb02f11a9c Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 23 Mar 2021 19:56:00 +0000 Subject: [PATCH 15/49] TxComp: update to changes in 'SampComp'; some refactoring --- nanocompore/TxComp.py | 216 +++++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 99 deletions(-) diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index 5898d5c..c58c56f 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -21,18 +21,18 @@ from nanocompore.common import * -def txCompare( - ref_id, - ref_pos_list, - random_state, - methods=None, - sequence_context=0, - min_coverage=20, - ref=None, - sequence_context_weights="uniform", - anova=True, - logit=False, - allow_warnings=False): +# TODO: wrap this in a class +def txCompare(ref_id, + kmer_data, + random_state, + methods=None, + sequence_context=0, + min_coverage=20, + ref=None, + sequence_context_weights="uniform", + anova=True, + logit=False, + allow_warnings=False): logger.debug("TxCompare") if sequence_context_weights != "uniform" and sequence_context_weights != "harmonic": @@ -41,66 +41,63 @@ def txCompare( n_lowcov = 0 tests = set() # If we have less than 2 replicates in any condition skip anova and force logit method - if not all([ len(i)>1 for i in ref_pos_list[0]['data'].values() ]): - anova=False - logit=True - for pos, pos_dict in enumerate(ref_pos_list): + # TODO: looking at the first kmer only may not be reliable - find a better way + if not all([len(samples) > 1 for samples in next(iter(kmer_data.values())).values()]): + anova = False + logit = True + + results = defaultdict(dict) + for pos, pos_dict in kmer_data.items(): logger.trace(f"Processing position {pos}") # Filter out low coverage positions - lowcov = False - for cond_dict in pos_dict["data"].values(): - for sample_val in cond_dict.values(): - if sample_val["coverage"] < min_coverage: - lowcov=True - ref_pos_list[pos]["lowCov"]=lowcov - - # Perform stat tests if not low cov + results[pos]["lowCov"] = lowcov = has_low_coverage(pos_dict, min_coverage) if lowcov: - logger.trace(f"Position {pos} is low coverage, skipping") - n_lowcov+=1 - else: - res = dict() - data = 
pos_dict['data'] - condition_labels = tuple(data.keys()) - if len(condition_labels) != 2: - raise NanocomporeError("The %s method only supports two conditions" % method) - condition1_intensity = np.concatenate([ rep['intensity'] for rep in data[condition_labels[0]].values() ]) - condition2_intensity = np.concatenate([ rep['intensity'] for rep in data[condition_labels[1]].values() ]) - condition1_dwell = np.concatenate([ rep['dwell'] for rep in data[condition_labels[0]].values() ]) - condition2_dwell = np.concatenate([ rep['dwell'] for rep in data[condition_labels[1]].values() ]) - - for met in methods: - logger.trace(f"Running {met} test on position {pos}") - if met in ["MW", "KS", "TT"] : - try: - pvalues = nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=met) - except: - raise NanocomporeError("Error doing {} test on reference {}".format(met, ref_id)) - res["{}_intensity_pvalue".format(met)]=pvalues[0] - res["{}_dwell_pvalue".format(met)]=pvalues[1] - tests.add("{}_intensity_pvalue".format(met)) - tests.add("{}_dwell_pvalue".format(met)) - elif met == "GMM": - try: - gmm_results = gmm_test(data, anova=anova, logit=logit, allow_warnings=allow_warnings, random_state=random_state) - except: - raise NanocomporeError("Error doing GMM test on reference {}".format(ref_id)) - res["GMM_model"] = gmm_results['gmm'] - if anova: - res["GMM_anova_pvalue"] = gmm_results['anova']['pvalue'] - res["GMM_anova_model"] = gmm_results['anova'] - tests.add("GMM_anova_pvalue") - if logit: - res["GMM_logit_pvalue"] = gmm_results['logit']['pvalue'] - res["GMM_logit_model"] = gmm_results['logit'] - tests.add("GMM_logit_pvalue") - - # Calculate shift statistics - logger.trace(f"Calculatign shift stats for {pos}") - res['shift_stats'] = shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell) - # Save results in main - logger.trace(f"Saving test results for {pos}") - ref_pos_list[pos]['txComp'] = res + logger.trace(f"Position {pos} has low coverage, skipping") + n_lowcov += 1 + continue + + # Perform stat tests + res = dict() + condition_labels = tuple(pos_dict.keys()) + if len(condition_labels) != 2: + raise NanocomporeError("The %s method only supports two conditions" % method) + condition1_intensity = np.concatenate([ rep['intensity'] for rep in pos_dict[condition_labels[0]].values() ]) + condition2_intensity = np.concatenate([ rep['intensity'] for rep in pos_dict[condition_labels[1]].values() ]) + condition1_dwell = np.concatenate([ rep['dwell'] for rep in pos_dict[condition_labels[0]].values() ]) + condition2_dwell = np.concatenate([ rep['dwell'] for rep in pos_dict[condition_labels[1]].values() ]) + + for met in methods: + logger.trace(f"Running {met} test on position {pos}") + if met in ["MW", "KS", "TT"] : + try: + pvalues = nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=met) + except: + raise NanocomporeError("Error doing {} test on reference {}".format(met, ref_id)) + res["{}_intensity_pvalue".format(met)]=pvalues[0] + res["{}_dwell_pvalue".format(met)]=pvalues[1] + tests.add("{}_intensity_pvalue".format(met)) + tests.add("{}_dwell_pvalue".format(met)) + elif met == "GMM": + try: + gmm_results = gmm_test(pos_dict, anova=anova, logit=logit, allow_warnings=allow_warnings, random_state=random_state) + except: + raise NanocomporeError("Error doing GMM test on reference {}".format(ref_id)) + res["GMM_model"] = gmm_results['gmm'] + if anova: + 
res["GMM_anova_pvalue"] = gmm_results['anova']['pvalue'] + res["GMM_anova_model"] = gmm_results['anova'] + tests.add("GMM_anova_pvalue") + if logit: + res["GMM_logit_pvalue"] = gmm_results['logit']['pvalue'] + res["GMM_logit_model"] = gmm_results['logit'] + tests.add("GMM_logit_pvalue") + + # Calculate shift statistics + logger.trace(f"Calculatign shift stats for {pos}") + res['shift_stats'] = shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell) + # Save results in main + logger.trace(f"Saving test results for {pos}") + results[pos]["txComp"] = res logger.debug("Skipped {} positions because not present in all samples with sufficient coverage".format(n_lowcov)) # Combine pvalue within a given sequence context @@ -108,17 +105,17 @@ def txCompare( logger.debug ("Calculate weighs and cross correlation matrices by tests") if sequence_context_weights == "harmonic": # Generate weights as a symmetrical harmonic series - weights = harmomic_series(sequence_context) + weights = harmonic_series(sequence_context) else: - weights = [1]*(2*sequence_context+1) + weights = [1] * (2 * sequence_context + 1) # Collect pvalue lists per tests pval_list_dict = defaultdict(list) - for pos_dict in ref_pos_list: - if 'txComp' in pos_dict: + for res_dict in results.values(): + if "txComp" in res_dict: for test in tests: - pval_list_dict[test].append(pos_dict['txComp'][test]) - elif pos_dict["lowCov"]: + pval_list_dict[test].append(res_dict["txComp"][test]) + elif res_dict["lowCov"]: for test in tests: pval_list_dict[test].append(np.nan) # Compute cross correlation matrix per test @@ -126,33 +123,38 @@ def txCompare( for test in tests: corr_matrix_dict[test] = cross_corr_matrix(pval_list_dict[test], sequence_context) - logger.debug("Combine adjacent position pvalues with Hou's method position per position") - # Iterate over each positions in previously generated result dictionary - for mid_pos in range(len(ref_pos_list)): + logger.debug("Combine adjacent position pvalues with Hou's method position by position") + # Iterate over each position in previously generated result dictionary + for mid_pos, res_dict in results.items(): # Perform test only if middle pos is valid - if not ref_pos_list[mid_pos]["lowCov"]: - pval_list_dict = defaultdict(list) - for pos in range(mid_pos-sequence_context, mid_pos+sequence_context+1): + if res_dict["lowCov"]: + continue + + pval_list_dict = defaultdict(list) + for pos in range(mid_pos - sequence_context, mid_pos + sequence_context + 1): + # If any of the positions is missing or lowCov, or any of the p-values in the context is NaN, consider it 1 + if (pos not in results) or results[pos]["lowCov"]: + for test in tests: + pval_list_dict[test].append(1) + else: for test in tests: - # If any of the positions is missing or any of the pvalues in the context is lowCov or NaN, consider it 1 - if pos < 0 or pos >= len(ref_pos_list) or ref_pos_list[pos]["lowCov"] or np.isnan(ref_pos_list[pos]["txComp"][test]): + if np.isnan(results[pos]["txComp"][test]): pval_list_dict[test].append(1) - # else just extract the corresponding pvalue - else: - pval_list_dict[test].append(ref_pos_list[pos]["txComp"][test]) - # Combine collected pvalues and add to dict - for test in tests: - test_label = "{}_context_{}".format(test, sequence_context) - # If the mid p-value is.nan, force to nan also the context p-value - if np.isnan(ref_pos_list[mid_pos]["txComp"][test]): - ref_pos_list[mid_pos]['txComp'][test_label] = np.nan - else: - ref_pos_list[mid_pos]['txComp'][test_label] 
= combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) + else: # just extract the corresponding pvalue + pval_list_dict[test].append(results[pos]["txComp"][test]) + # Combine collected pvalues and add to dict + for test in tests: + test_label = "{}_context_{}".format(test, sequence_context) + # If the mid p-value is NaN, also set the context p-value to NaN + if np.isnan(res_dict["txComp"][test]): + res_dict["txComp"][test_label] = np.nan + else: + res_dict["txComp"][test_label] = combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) - return ref_pos_list + return results -def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=None): +def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=None): if method in ["mann_whitney", "MW"]: stat_test = lambda x,y: mannwhitneyu(x, y, alternative='two-sided') elif method in ["kolmogorov_smirnov", "KS"]: @@ -163,11 +165,11 @@ def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dw raise NanocomporeError("Invalid statistical method name (MW, KS, ttest)") pval_intensity = stat_test(condition1_intensity, condition2_intensity)[1] - if pval_intensity == 0: + if pval_intensity == 0: pval_intensity = np.finfo(np.float).tiny pval_dwell = stat_test(condition1_dwell, condition2_dwell)[1] - if pval_dwell == 0: + if pval_dwell == 0: pval_dwell = np.finfo(np.float).tiny return(pval_intensity, pval_dwell) @@ -228,6 +230,7 @@ def gmm_test(data, random_state, anova=True, logit=False, verbose=True, allow_wa return({'anova':aov_results, 'logit': logit_results, 'gmm':{'model': gmm_mod, 'cluster_counts': cluster_counts}}) + def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied', 'diag', 'full']): # Loop over multiple cv_types and n_components and for each fit a GMM # calculate the BIC and retain the lowest @@ -247,6 +250,7 @@ def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied best_gmm_ncomponents = n_components return((best_gmm, best_gmm_type, best_gmm_ncomponents)) + def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings=False): labels= [] logr = [] @@ -291,6 +295,7 @@ def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncom aov_results = {'pvalue': aov_pvalue, 'delta_logit': aov_delta_logit, 'table': aov_table, 'log_ratios':logr} return(aov_results) + def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): Y = [ sample_condition_labels[i] for i in Y] y_pred=np.append(y_pred, [0,0,1,1]) @@ -310,6 +315,7 @@ def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): logit_results = {'pvalue': logit_pvalue, 'coef': logit_coef, 'model': logit_mod} return(logit_results) + def count_reads_in_cluster(counters): cluster_counts = list() for k,v in counters.items(): @@ -317,6 +323,7 @@ def count_reads_in_cluster(counters): cluster_counts="__".join(cluster_counts) return(cluster_counts) + def shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell): """Calculate shift statistics""" shift_stats = OrderedDict([ @@ -359,6 +366,7 @@ def cross_corr_matrix(pvalues_vector, context=2): matrix.append(row) return(np.array(matrix)) + def combine_pvalues_hou(pvalues, weights, cor_mat): """ Hou's method for the approximation for the distribution of the weighted combination of non-independent or independent probabilities. 
@@ -406,15 +414,25 @@ def combine_pvalues_hou(pvalues, weights, cor_mat): combined_p_value = np.finfo(np.float).tiny return combined_p_value -def harmomic_series(sequence_context): + +def harmonic_series(sequence_context): weights = [] for i in range(-sequence_context, sequence_context+1): weights.append(1/(abs(i)+1)) return weights + def sum_of_squares(x): """ Square each element of the input array and return the sum """ x = np.atleast_1d(x) return np.sum(x*x) + + +def has_low_coverage(pos_dict, min_coverage): + for cond_dict in pos_dict.values(): + for sample_val in cond_dict.values(): + if sample_val["coverage"] < min_coverage: + return True + return False From 69563e002753ae759ffe4f2f86bd1cced8beeeae Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Mon, 29 Mar 2021 20:11:25 +0100 Subject: [PATCH 16/49] refactor DataStore, create child classes 'DS_EventAlign' and 'DS_SampComp' --- nanocompore/DataStore.py | 213 +++++++++++++++++++++++++++------------ 1 file changed, 146 insertions(+), 67 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 1749ff7..fade41f 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -19,7 +19,64 @@ class DBCreateMode(Enum): class DataStore(object): - """Store Nanocompore data in an SQLite database""" + """Store Nanocompore data in an SQLite database - base class""" + + create_tables_queries = [] # to be filled by derived classes + + def __init__(self, + db_path:str, + create_mode=DBCreateMode.MUST_EXIST): + self._db_path = db_path + self._create_mode = create_mode + self._connection = None + self._cursor = None + + def _init_db(self): + if self.create_tables_queries: + logger.debug("Setting up database tables") + try: + for query in create_tables_queries: + self._cursor.execute(query) + self._connection.commit() + except: + logger.error("Error creating database tables") + raise + + def __enter__(self): + if self._create_mode == DBCreateMode.MUST_EXIST and not os.path.exists(self._db_path): + raise NanocomporeError(f"Database file '{self._db_path}' does not exist") + if self._create_mode == DBCreateMode.OVERWRITE: + with contextlib.suppress(FileNotFoundError): # file may not exist + os.remove(self._db_path) + logger.debug(f"Removed existing database file '{self._db_path}'") + try: + logger.debug("Connecting to database") + self._connection = sqlite3.connect(self._db_path) + self._connection.row_factory = sqlite3.Row + self._cursor = self._connection.cursor() + except: + logger.error("Error connecting to database") + raise + if self._create_mode == DBCreateMode.OVERWRITE or \ + (self._create_mode == DBCreateMode.CREATE_MAYBE and not os.path.exists(self._db_path)): + self._init_db() + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self._connection: + logger.debug("Closing database connection") + self._connection.commit() + self._connection.close() + self._connection = None + self._cursor = None + + @property + def cursor(self): + return self._cursor + + +class DataStore_EventAlign(DataStore): + """Store Nanocompore data in an SQLite database - subclass for Eventalign_collapse results""" create_reads_query = ("CREATE TABLE IF NOT EXISTS reads (" "id INTEGER NOT NULL PRIMARY KEY," @@ -73,53 +130,8 @@ class DataStore(object): ")" ) - def __init__(self, - db_path:str, - create_mode=DBCreateMode.MUST_EXIST): - self.__db_path = db_path - self.__create_mode = create_mode - self.__connection = None - self.__cursor = None - - def __enter__(self): - if self.__create_mode == 
DBCreateMode.MUST_EXIST and not os.path.exists(self.__db_path): - raise NanocomporeError(f"Database file '{self.__db_path}' does not exist") - if self.__create_mode == DBCreateMode.OVERWRITE: - with contextlib.suppress(FileNotFoundError): # file may not exist - os.remove(self.__db_path) - logger.debug(f"Removed existing database file '{self.__db_path}'") - try: - logger.debug("Connecting to database") - self.__connection = sqlite3.connect(self.__db_path) - self.__connection.row_factory = sqlite3.Row - self.__cursor = self.__connection.cursor() - except: - logger.error("Error connecting to database") - raise - if self.__create_mode == DBCreateMode.OVERWRITE or \ - (self.__create_mode == DBCreateMode.CREATE_MAYBE and not os.path.exists(self.__db_path)): - self.__init_db() - return self - - def __exit__(self, exc_type, exc_value, traceback): - if self.__connection: - logger.debug("Closing database connection") - self.__connection.commit() - self.__connection.close() - self.__connection = None - self.__cursor = None - - def __init_db(self): - logger.debug("Setting up database tables") - try: - self.__cursor.execute(self.create_reads_query) - self.__cursor.execute(self.create_kmers_query) - self.__cursor.execute(self.create_samples_query) - self.__cursor.execute(self.create_transcripts_query) - self.__connection.commit() - except: - logger.error("Error creating database tables") - raise + create_tables_queries = [create_reads_query, create_kmers_query, + create_samples_query, create_transcripts_query] def store_read(self, read): """ @@ -134,16 +146,16 @@ def store_read(self, read): values = (read.read_id, sample_id, tx_id, read.ref_start, read.ref_end, read.n_events, read.n_signals, read.dwell_time) + tuple(read.kmers_status.values()) try: - self.__cursor.execute("INSERT INTO reads VALUES(NULL" + ", ?" * len(values) + ")", + self._cursor.execute("INSERT INTO reads VALUES(NULL" + ", ?" 
* len(values) + ")", values) - read_id = self.__cursor.lastrowid + read_id = self._cursor.lastrowid except Exception: logger.error("Error inserting read into database") raise Exception for kmer in read.kmer_l: self.__store_kmer(kmer=kmer, read_id=read_id) - self.__connection.commit() + self._connection.commit() # TODO check for success and return true/false def __store_kmer(self, kmer, read_id): @@ -155,7 +167,7 @@ def __store_kmer(self, kmer, read_id): """ res = kmer.get_results() # needed for 'median' and 'mad' values try: - self.__cursor.execute("INSERT INTO kmers VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + self._cursor.execute("INSERT INTO kmers VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (read_id, res["ref_pos"], res["ref_kmer"], res["num_events"], res["num_signals"], res["status"], res["dwell_time"], res["NNNNN_dwell_time"], res["mismatch_dwell_time"], res["median"], res["mad"])) @@ -175,16 +187,16 @@ def get_transcript_id_by_name(self, tx_name, create_if_not_exists=False): ");" ) try: - self.__cursor.execute(query) + self._cursor.execute(query) except Exception: logger.error("Error while inserting transcript into the database") raise Exception query = f"SELECT id from transcripts WHERE name = '{tx_name}'" try: - self.__cursor.execute(query) - record = self.__cursor.fetchone() - self.__connection.commit() + self._cursor.execute(query) + record = self._cursor.fetchone() + self._connection.commit() except Exception: logger.error("Error while selecting transcript ID from the database") raise Exception @@ -205,16 +217,16 @@ def get_sample_id_by_name(self, sample_name, create_if_not_exists=False): ");" ) try: - self.__cursor.execute(query) + self._cursor.execute(query) except Exception: logger.error("Error while inserting sample into the database") raise Exception query = f"SELECT id from samples WHERE name = '{sample_name}'" try: - self.__cursor.execute(query) - record = self.__cursor.fetchone() - self.__connection.commit() + self._cursor.execute(query) + record = self._cursor.fetchone() + self._connection.commit() except Exception: logger.error("Error while selecting sample ID from the database") raise Exception @@ -223,12 +235,8 @@ def get_sample_id_by_name(self, sample_name, create_if_not_exists=False): else: return None - @property - def cursor(self): - return self.__cursor - def get_samples(self, sample_dict=None): - if not self.__connection: + if not self._connection: raise NanocomporeError("Database connection not yet opened") expected_samples = [] if sample_dict: # query only relevant samples @@ -241,8 +249,8 @@ def get_samples(self, sample_dict=None): where = "" db_samples = {} try: - self.__cursor.execute("SELECT * FROM samples" + where) - for row in self.__cursor: + self._cursor.execute("SELECT * FROM samples" + where) + for row in self._cursor: db_samples[row["id"]] = row["name"] except Exception: logger.error("Error reading sample names from database") @@ -251,3 +259,74 @@ def get_samples(self, sample_dict=None): if sample not in db_samples.values(): raise NanocomporeError(f"Sample '{sample}' not present in database") return db_samples + + +class DataStore_SampComp(DataStore): + """Store Nanocompore data in an SQLite database - subclass for SampComp results""" + + create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts (" + "id INTEGER NOT NULL PRIMARY KEY," + "name VARCHAR NOT NULL UNIQUE" + ")" + ) + + create_test_results_query = ("CREATE TABLE IF NOT EXISTS test_results (" + "id INTEGER NOT NULL PRIMARY KEY," + "transcriptid VARCHAR NOT NULL," + 
"kmer INTEGER NOT NULL," + "KS_intensity_pvalue REAL," + "KS_dwell_pvalue REAL," + "GMM_n_components INTEGER," + "GMM_cluster_counts VARCHAR," + "GMM_logit_pvalue REAL," + "GMM_logit_coef REAL," + "c1_mean_intensity REAL," + "c2_mean_intensity REAL," + "c1_median_intensity REAL," + "c2_median_intensity REAL," + "c1_sd_intensity REAL," + "c2_sd_intensity REAL," + "c1_mean_dwell REAL," + "c2_mean_dwell REAL," + "c1_median_dwell REAL," + "c2_median_dwell REAL," + "c1_sd_dwell REAL," + "c2_sd_dwell REAL," + "UNIQUE (transcriptname, kmer)" + ")" + ) + + create_tables_queries = [create_transcripts_query, create_test_results_query] + + def __insert_transcript_get_id(self, tx_name): + try: + self._cursor.execute("SELECT id FROM transcripts WHERE name = ?", tx_name) + if (row := self._cursor.fetchone()) is not None: + return row["id"] + self._cursor.execute("INSERT INTO transcripts VALUES (NULL, ?)", tx_name) + # TODO: if there could be multiple writing threads, "INSERT OR IGNORE" + # query should go before "SELECT" + return self._cursor.lastrowid + except: + logger.error(f"Failed to insert/look up transcript '{tx_name}'") + raise + + def store_test_results(self, tx_name, test_results): + if not self._connection: + raise NanocomporeError("Database connection not yet opened") + tx_id = self._insert_transcript_get_id(tx_name) + for kmer, res in test_results.items(): + values = [tx_id, kmer, res["KS_intensity_pvalue"], res["KS_dwell_pvalue"]] + if "GMM_model" in res: + values += [res["GMM_model"]["model"].n_components, + res["GMM_model"]["cluster_counts"], + res["GMM_logit_model"]["pvalue"], + res["GMM_logit_model"]["coef"]] + else: + values += [None, None, None, None] + values.append(res["shift_stats"].values()) + try: + self._cursor.execute("INSERT INTO test_results VALUES (NULL" + ", ?" 
* len(values) + ")", values) + except: + logger.error(f"Error storing test results for transcript '{tx_name}'") + raise From b1ad89966d73f03e2dff16ca1a7cebeda9ee3250 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Mon, 29 Mar 2021 20:13:46 +0100 Subject: [PATCH 17/49] use 'DataStore_EventAlign' in 'Eventalign_collapse' and 'Whitelist' --- nanocompore/Eventalign_collapse.py | 5 +++-- nanocompore/Whitelist.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py index dd03b42..ebed305 100644 --- a/nanocompore/Eventalign_collapse.py +++ b/nanocompore/Eventalign_collapse.py @@ -19,7 +19,7 @@ # Local imports from nanocompore.common import * from nanocompore.SuperParser import SuperParser -from nanocompore.DataStore import DataStore, DBCreateMode +from nanocompore.DataStore import DataStore_EventAlign, DBCreateMode # Disable multithreading for MKL and openBlas os.environ["MKL_NUM_THREADS"] = "1" @@ -247,7 +247,8 @@ def __write_output_to_db(self, out_q, error_q): n_reads = 0 db_path = os.path.join(self.__outpath, self.__outprefix+"_nanocompore.db") try: - with DataStore(self.__db_path, DBCreateMode.CREATE_MAYBE) as datastore, tqdm (unit=" reads") as pbar: + with DataStore_EventAlign(self.__db_path, DBCreateMode.CREATE_MAYBE) as datastore, \ + tqdm (unit=" reads") as pbar: # Iterate over out queue until nthread poison pills are found for _ in range (self.__nthreads): for read in iter (out_q.get, None): diff --git a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py index a0b6183..70deb43 100755 --- a/nanocompore/Whitelist.py +++ b/nanocompore/Whitelist.py @@ -15,7 +15,7 @@ # Local package from nanocompore.common import * -from nanocompore.DataStore import DataStore +from nanocompore.DataStore import DataStore_EventAlign # Set global random seed downsample_random_seed = 42 @@ -85,7 +85,7 @@ def __init__(self, raise NanocomporeError("The fasta file cannot be opened") # Database interaction - with DataStore(db_path) as db: + with DataStore_EventAlign(db_path) as db: db_samples = db.get_samples(sample_dict) # How many samples are in the DB? If we want all, we don't need a constraint below. 
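[Note between patches 17 and 18: the refactoring above splits 'DataStore' into the subclasses 'DataStore_EventAlign' and 'DataStore_SampComp', both used as context managers. A minimal usage sketch of the resulting API, based only on the diffs above — the database file name and the sample labels are illustrative assumptions, not taken from the code:

    from nanocompore.DataStore import DataStore_EventAlign, DBCreateMode

    # DBCreateMode.MUST_EXIST is the default 'create_mode':
    # opening fails if the database file is missing.
    with DataStore_EventAlign("out_nanocompore.db", DBCreateMode.MUST_EXIST) as db:
        # Check that the expected samples are recorded in the 'samples' table;
        # returns a dict mapping sample ID to sample name.
        samples = db.get_samples({"control": ["C1", "C2"], "treatment": ["T1", "T2"]})
        # 'row_factory' is set to sqlite3.Row, so columns are accessible by name:
        for row in db.cursor.execute("SELECT name FROM transcripts"):
            print(row["name"])

Because '__enter__' both opens the connection and, for the OVERWRITE/CREATE_MAYBE modes, creates the tables, the object is only usable inside the 'with' block; methods like 'get_samples' raise NanocomporeError when called on a store whose connection is not open.]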
From b8de4e9214b9092880799201d82408877f042086 Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Mon, 29 Mar 2021 20:16:17 +0100
Subject: [PATCH 18/49] TxComp: simplify 'txCompare' results data structure (remove 'lowCov' elements)

---
 nanocompore/TxComp.py | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py
index c58c56f..de2e38f 100644
--- a/nanocompore/TxComp.py
+++ b/nanocompore/TxComp.py
@@ -46,18 +46,17 @@ def txCompare(ref_id,
         anova = False
         logit = True

-    results = defaultdict(dict)
+    results = {}
     for pos, pos_dict in kmer_data.items():
         logger.trace(f"Processing position {pos}")
         # Filter out low coverage positions
-        results[pos]["lowCov"] = lowcov = has_low_coverage(pos_dict, min_coverage)
-        if lowcov:
+        if has_low_coverage(pos_dict, min_coverage):
             logger.trace(f"Position {pos} has low coverage, skipping")
             n_lowcov += 1
             continue

         # Perform stat tests
-        res = dict()
+        res = {}
         condition_labels = tuple(pos_dict.keys())
         if len(condition_labels) != 2:
             raise NanocomporeError("The %s method only supports two conditions" % method)
@@ -94,15 +93,15 @@

         # Calculate shift statistics
         logger.trace(f"Calculating shift stats for {pos}")
-        res['shift_stats'] = shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell)
+        res["shift_stats"] = shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell)
         # Save results in main
         logger.trace(f"Saving test results for {pos}")
-        results[pos]["txComp"] = res
+        results[pos] = res
     logger.debug("Skipped {} positions because not present in all samples with sufficient coverage".format(n_lowcov))

     # Combine pvalue within a given sequence context
     if sequence_context > 0:
-        logger.debug ("Calculate weighs and cross correlation matrices by tests")
+        logger.debug ("Calculate weights and cross correlation matrices by tests")
         if sequence_context_weights == "harmonic":
             # Generate weights as a symmetrical harmonic series
             weights = harmonic_series(sequence_context)
         else:
             weights = [1] * (2 * sequence_context + 1)

         # Collect pvalue lists per tests
         pval_list_dict = defaultdict(list)
         for res_dict in results.values():
-            if "txComp" in res_dict:
-                for test in tests:
-                    pval_list_dict[test].append(res_dict["txComp"][test])
-            elif res_dict["lowCov"]:
-                for test in tests:
-                    pval_list_dict[test].append(np.nan)
+            for test in tests:
+                pval_list_dict[test].append(res_dict[test])
         # Compute cross correlation matrix per test
         corr_matrix_dict = OrderedDict()
         for test in tests:
             corr_matrix_dict[test] = cross_corr_matrix(pval_list_dict[test], sequence_context)

         logger.debug("Combine adjacent position pvalues with Hou's method position by position")
         # Iterate over each position in previously generated result dictionary
         for mid_pos, res_dict in results.items():
-            # Perform test only if middle pos is valid
-            if res_dict["lowCov"]:
-                continue
-
             pval_list_dict = defaultdict(list)
             for pos in range(mid_pos - sequence_context, mid_pos + sequence_context + 1):
-                # If any of the positions is missing or lowCov, or any of the p-values in the context is NaN, consider it 1
-                if (pos not in results) or results[pos]["lowCov"]:
+                # If any of the positions is missing or any of the p-values in the context is NaN, consider it 1
+                if pos not in results:
                     for test in tests:
                         pval_list_dict[test].append(1)
                 else:
                     for test in tests:
-                        if np.isnan(results[pos]["txComp"][test]):
+                        if np.isnan(results[pos][test]):
                             pval_list_dict[test].append(1)
                         else: # just extract the corresponding
pvalue - pval_list_dict[test].append(results[pos]["txComp"][test]) + pval_list_dict[test].append(results[pos][test]) # Combine collected pvalues and add to dict for test in tests: test_label = "{}_context_{}".format(test, sequence_context) # If the mid p-value is NaN, also set the context p-value to NaN - if np.isnan(res_dict["txComp"][test]): - res_dict["txComp"][test_label] = np.nan + if np.isnan(res_dict[test]): + res_dict[test_label] = np.nan else: - res_dict["txComp"][test_label] = combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) + res_dict[test_label] = combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) return results From 7fae6d859bbf85401f52e18fad6f8db3d74c4d59 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Mon, 29 Mar 2021 20:16:27 +0100 Subject: [PATCH 19/49] SampComp: small logging fix and update of 'txCompare' results --- nanocompore/SampComp.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py index 41771d9..c889bea 100644 --- a/nanocompore/SampComp.py +++ b/nanocompore/SampComp.py @@ -11,7 +11,6 @@ # Third party from loguru import logger -import yaml from tqdm import tqdm import numpy as np from pyfaidx import Fasta @@ -116,15 +115,6 @@ def __init__(self, # Save init options in dict for later log_init_state(loc=locals()) - # TODO: remove this? (may be better handled in '__main__.py', if needed) - # If 'sample_dict' is not a dict try to load a YAML file instead - if type(sample_dict) == str: - logger.debug("Parsing YAML file") - if not access_file(sample_dict): - raise NanocomporeError("{} is not a valid file".format(sample_dict)) - with open(sample_dict, "r") as fp: - sample_dict = yaml.load(fp, Loader=yaml.SafeLoader) - # Check eventalign_dict file paths and labels check_sample_dict(sample_dict) logger.debug(sample_dict) @@ -251,7 +241,7 @@ def __call__(self): def process_transcript(self, tx_id, whitelist_reads): """Process a transcript given filtered reads from Whitelist""" - logger.debug("Processing transcript: {tx_id}") + logger.debug(f"Processing transcript: {tx_id}") # Kmer data from whitelisted reads from all samples for this transcript # Structure: kmer position -> condition -> sample -> data @@ -380,11 +370,10 @@ def __write_output(self, out_q, error_q): logger.debug("Writer thread writing %s"%ref_id) # Get pvalue fields available in analysed data for res_dict in test_results.values(): - if "txComp" in res_dict: - for res in res_dict["txComp"].keys(): - if "pvalue" in res: - n_pos += 1 - pvalue_tests.add(res) + for res in res_dict.keys(): + if "pvalue" in res: + n_pos += 1 + pvalue_tests.add(res) # Write results in a shelve db db[ref_id] = (kmer_data, test_results) pbar.update(1) From 9e96e8a9b44b0511cc47bcd21e787ac3ab39905e Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 31 Mar 2021 14:55:35 +0100 Subject: [PATCH 20/49] DataStore: add functions to store sample information and whitelisted reads --- nanocompore/DataStore.py | 65 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index fade41f..d426e06 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -5,6 +5,7 @@ import os import sqlite3 import contextlib +from itertools import zip_longest # Third party from loguru import logger @@ -119,10 +120,10 @@ class DataStore_EventAlign(DataStore): create_samples_query = ("CREATE TABLE IF NOT EXISTS 
samples (" "id INTEGER NOT NULL PRIMARY KEY," - "name VARCHAR NOT NULL UNIQUE" + "name VARCHAR NOT NULL UNIQUE," + "condition VARCHAR" ")" ) - # TODO: add 'condition' column create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts (" "id INTEGER NOT NULL PRIMARY KEY," @@ -260,6 +261,21 @@ def get_samples(self, sample_dict=None): raise NanocomporeError(f"Sample '{sample}' not present in database") return db_samples + def store_sample_info(self, sample_dict): + if not self._connection: + raise NanocomporeError("Database connection not yet opened") + # query: insert sample; if it exists, update condition if that's missing + query = "INSERT INTO samples(id, name, condition) VALUES (NULL, ?, ?) " \ + "ON CONFLICT(name) DO UPDATE SET condition = excluded.condition " \ + "WHERE condition IS NULL" + for condition, samples in sample_dict.items(): + try: + self._cursor.executemany(query, [(condition, sample) for sample in samples]) + except: + logger.error(f"Error storing sample information for condition '{condition}'") + raise + self._connection.commit() + class DataStore_SampComp(DataStore): """Store Nanocompore data in an SQLite database - subclass for SampComp results""" @@ -270,12 +286,23 @@ class DataStore_SampComp(DataStore): ")" ) + create_whitelist_query = ("CREATE TABLE IF NOT EXISTS whitelist (" + "transcriptid INTEGER NOT NULL," + "readid INTEGER NOT NULL UNIQUE," + "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)" + # "readid" is foreign key for "reads" table in EventAlign DB + ")") + create_test_results_query = ("CREATE TABLE IF NOT EXISTS test_results (" - "id INTEGER NOT NULL PRIMARY KEY," + "id INTEGER NOT NULL PRIMARY KEY," # needed? "transcriptid VARCHAR NOT NULL," "kmer INTEGER NOT NULL," + "MW_intensity_pvalue REAL," + "MW_dwell_pvalue REAL," "KS_intensity_pvalue REAL," "KS_dwell_pvalue REAL," + "TT_intensity_pvalue REAL," + "TT_dwell_pvalue REAL," "GMM_n_components INTEGER," "GMM_cluster_counts VARCHAR," "GMM_logit_pvalue REAL," @@ -292,11 +319,14 @@ class DataStore_SampComp(DataStore): "c2_median_dwell REAL," "c1_sd_dwell REAL," "c2_sd_dwell REAL," - "UNIQUE (transcriptname, kmer)" + "UNIQUE (transcriptid, kmer)," + "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)" ")" ) + # TODO: store GMM cluster counts in a separate table (one row per sample) - create_tables_queries = [create_transcripts_query, create_test_results_query] + create_tables_queries = [create_transcripts_query, create_whitelist_query, + create_test_results_query] def __insert_transcript_get_id(self, tx_name): try: @@ -304,6 +334,7 @@ def __insert_transcript_get_id(self, tx_name): if (row := self._cursor.fetchone()) is not None: return row["id"] self._cursor.execute("INSERT INTO transcripts VALUES (NULL, ?)", tx_name) + self._connection.commit() # TODO: if there could be multiple writing threads, "INSERT OR IGNORE" # query should go before "SELECT" return self._cursor.lastrowid @@ -315,8 +346,12 @@ def store_test_results(self, tx_name, test_results): if not self._connection: raise NanocomporeError("Database connection not yet opened") tx_id = self._insert_transcript_get_id(tx_name) + univar_pvalues = [f"{t}_{m}_pvalue" for t in ["MW", "KS", "TT"] + for m in ["intensity", "dwell"]] for kmer, res in test_results.items(): - values = [tx_id, kmer, res["KS_intensity_pvalue"], res["KS_dwell_pvalue"]] + values = [tx_id, kmer] + for key in univar_pvalues: + values.append(res.get(key)) # appends 'None' if key doesn't exist if "GMM_model" in res: values += [res["GMM_model"]["model"].n_components, 
res["GMM_model"]["cluster_counts"], @@ -330,3 +365,21 @@ def store_test_results(self, tx_name, test_results): except: logger.error(f"Error storing test results for transcript '{tx_name}'") raise + self._connection.commit() + + def store_whitelist(self, whitelist): + if not self._connection: + raise NanocomporeError("Database connection not yet opened") + for tx_name, read_dict in whitelist: + try: + tx_id = self.__insert_transcript_get_id(tx_name) + for cond_reads in read_dict.values(): + for sample_reads in cond_reads.values(): + values = zip_longest([], sample_reads, fillvalue=tx_id) + self._cursor.executemany("INSERT INTO whitelist VALUES (?, ?)", values) + # TODO: store sample/condition information (again)? + # it can be retrieved from "reads"/"samples" tables given "readid" + self._connection.commit() + except: + logger.error(f"Error storing whitelisted reads for transcript '{tx_name}'") + raise From 63429d554bec5d335155631ffe0f6011ca8d4d3e Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 31 Mar 2021 14:56:47 +0100 Subject: [PATCH 21/49] SampComp: store whitelisted reads in SQLite DB --- nanocompore/SampComp.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py index c889bea..10eae88 100644 --- a/nanocompore/SampComp.py +++ b/nanocompore/SampComp.py @@ -17,7 +17,7 @@ # Local package from nanocompore.common import * -from nanocompore.DataStore import DataStore +from nanocompore.DataStore import * from nanocompore.Whitelist import Whitelist from nanocompore.TxComp import txCompare from nanocompore.SampCompDB import SampCompDB @@ -37,10 +37,11 @@ class SampComp(object): #~~~~~~~~~~~~~~FUNDAMENTAL METHODS~~~~~~~~~~~~~~# def __init__(self, - db_path:str, + input_db_path:str, sample_dict:dict, fasta_fn:str, bed_fn:str = None, + output_db_path:str, outpath:str = "results", outprefix:str = "out", overwrite:bool = False, @@ -64,11 +65,17 @@ def __init__(self, Initialise a `SampComp` object and generates a white list of references with sufficient coverage for subsequent analysis. The retuned object can then be called to start the analysis. Args: - * db_path + * input_db_path Path to the SQLite database file with event-aligned read/kmer data * sample_dict Dictionary containing lists of (unique) sample names, grouped by condition example d = {"control": ["C1", "C2"], "treatment": ["T1", "T2"]} + * fasta_fn + Path to a fasta file corresponding to the reference used for read alignment. + * bed_fn + Path to a BED file containing the annotation of the transcriptome used as reference when mapping. + * output_db_path + Path to the SQLite database file for storing results * outpath Path to the output folder. * outprefix @@ -76,14 +83,10 @@ def __init__(self, * overwrite If the output directory already exists, the standard behaviour is to raise an error to prevent overwriting existing data This option ignore the error and overwrite data if they have the same outpath and outprefix. - * fasta_fn - Path to a fasta file corresponding to the reference used for read alignment. - * bed_fn - Path to a BED file containing the annotation of the transcriptome used as reference when mapping. * whitelist Whitelist object previously generated with nanocompore Whitelist. If not given, will be automatically generated. * comparison_methods - Statistical method to compare the 2 samples (mann_whitney or MW, kolmogorov_smirnov or KS, t_test or TT, gaussian_mixture_model or GMM). 
+ Statistical method to compare the two samples (mann_whitney or MW, kolmogorov_smirnov or KS, t_test or TT, gaussian_mixture_model or GMM). This can be a list or a comma separated string. {MW,KS,TT,GMM} * logit Force logistic regression even if we have less than 2 replicates in any condition. @@ -147,7 +150,7 @@ def __init__(self, raise NanocomporeError("Invalid comparison method {}".format(method)) if not whitelist: - whitelist = Whitelist(db_path, + whitelist = Whitelist(input_db_path, sample_dict, fasta_fn, min_coverage = min_coverage, @@ -158,6 +161,8 @@ def __init__(self, exclude_ref_id = exclude_ref_id) elif not isinstance(whitelist, Whitelist): raise NanocomporeError("Whitelist is not valid") + with DataStore_SampComp(output_db_path, DBCreateMode.CREATE_MAYBE) as db: + db.store_whitelist(whitelist) # Set private args from whitelist args self.__min_coverage = whitelist._Whitelist__min_coverage @@ -165,7 +170,7 @@ def __init__(self, self.__max_invalid_kmers_freq = whitelist._Whitelist__max_invalid_kmers_freq # Save private args - self.__db_path = db_path + self.__input_db_path = input_db_path self.__sample_dict = sample_dict self.__fasta_fn = fasta_fn self.__bed_fn = bed_fn @@ -257,7 +262,7 @@ def process_transcript(self, tx_id, whitelist_reads): n_reads = n_kmers = 0 # Read kmer data from database - with DataStore(self.__db_path) as db: + with DataStore_EventAlign(self.__input_db_path) as db: for cond_lab, sample_dict in whitelist_reads.items(): for sample_id, read_ids in sample_dict.items(): if not read_ids: continue # TODO: error? From edf0081d8d00de3dc9004c6c3ddfc633d37eaf52 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 6 Apr 2021 21:16:11 +0100 Subject: [PATCH 22/49] TxComp: use 'ST' as abbrev. for (Student) t-test --- nanocompore/TxComp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index de2e38f..2cd9929 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -67,7 +67,7 @@ def txCompare(ref_id, for met in methods: logger.trace(f"Running {met} test on position {pos}") - if met in ["MW", "KS", "TT"] : + if met in ["MW", "KS", "ST"] : try: pvalues = nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=met) except: @@ -150,10 +150,10 @@ def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dw stat_test = lambda x,y: mannwhitneyu(x, y, alternative='two-sided') elif method in ["kolmogorov_smirnov", "KS"]: stat_test = ks_twosamp - elif method in ["t_test", "TT"]: + elif method in ["student_t", "ST"]: stat_test = lambda x,y: ttest_ind(x, y, equal_var=False) else: - raise NanocomporeError("Invalid statistical method name (MW, KS, ttest)") + raise NanocomporeError("Invalid statistical method name (MW, KS, ST)") pval_intensity = stat_test(condition1_intensity, condition2_intensity)[1] if pval_intensity == 0: From 536ab64b21f72385853efa1ca93d5ddbe46695ec Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 6 Apr 2021 21:18:04 +0100 Subject: [PATCH 23/49] DataStore: fix DB init (tables creation), rework DB schema for SampComp results --- nanocompore/DataStore.py | 130 +++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 53 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index d426e06..4cb0f44 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -36,7 +36,7 @@ def _init_db(self): if self.create_tables_queries: logger.debug("Setting up database 
tables") try: - for query in create_tables_queries: + for query in self.create_tables_queries: self._cursor.execute(query) self._connection.commit() except: @@ -44,12 +44,16 @@ def _init_db(self): raise def __enter__(self): + init_db = False if self._create_mode == DBCreateMode.MUST_EXIST and not os.path.exists(self._db_path): raise NanocomporeError(f"Database file '{self._db_path}' does not exist") if self._create_mode == DBCreateMode.OVERWRITE: with contextlib.suppress(FileNotFoundError): # file may not exist os.remove(self._db_path) logger.debug(f"Removed existing database file '{self._db_path}'") + init_db = True + if self._create_mode == DBCreateMode.CREATE_MAYBE and not os.path.exists(self._db_path): + init_db = True try: logger.debug("Connecting to database") self._connection = sqlite3.connect(self._db_path) @@ -58,8 +62,7 @@ def __enter__(self): except: logger.error("Error connecting to database") raise - if self._create_mode == DBCreateMode.OVERWRITE or \ - (self._create_mode == DBCreateMode.CREATE_MAYBE and not os.path.exists(self._db_path)): + if init_db: self._init_db() return self @@ -283,8 +286,7 @@ class DataStore_SampComp(DataStore): create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts (" "id INTEGER NOT NULL PRIMARY KEY," "name VARCHAR NOT NULL UNIQUE" - ")" - ) + ")") create_whitelist_query = ("CREATE TABLE IF NOT EXISTS whitelist (" "transcriptid INTEGER NOT NULL," @@ -293,47 +295,57 @@ class DataStore_SampComp(DataStore): # "readid" is foreign key for "reads" table in EventAlign DB ")") - create_test_results_query = ("CREATE TABLE IF NOT EXISTS test_results (" - "id INTEGER NOT NULL PRIMARY KEY," # needed? - "transcriptid VARCHAR NOT NULL," - "kmer INTEGER NOT NULL," - "MW_intensity_pvalue REAL," - "MW_dwell_pvalue REAL," - "KS_intensity_pvalue REAL," - "KS_dwell_pvalue REAL," - "TT_intensity_pvalue REAL," - "TT_dwell_pvalue REAL," - "GMM_n_components INTEGER," - "GMM_cluster_counts VARCHAR," - "GMM_logit_pvalue REAL," - "GMM_logit_coef REAL," - "c1_mean_intensity REAL," - "c2_mean_intensity REAL," - "c1_median_intensity REAL," - "c2_median_intensity REAL," - "c1_sd_intensity REAL," - "c2_sd_intensity REAL," - "c1_mean_dwell REAL," - "c2_mean_dwell REAL," - "c1_median_dwell REAL," - "c2_median_dwell REAL," - "c1_sd_dwell REAL," - "c2_sd_dwell REAL," - "UNIQUE (transcriptid, kmer)," - "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)" - ")" - ) + create_kmer_stats_query = ("CREATE TABLE IF NOT EXISTS kmer_stats (" + "id INTEGER NOT NULL PRIMARY KEY," + "transcriptid INTEGER NOT NULL," + "kmer INTEGER NOT NULL," + "c1_mean_intensity REAL," + "c2_mean_intensity REAL," + "c1_median_intensity REAL," + "c2_median_intensity REAL," + "c1_sd_intensity REAL," + "c2_sd_intensity REAL," + "c1_mean_dwell REAL," + "c2_mean_dwell REAL," + "c1_median_dwell REAL," + "c2_median_dwell REAL," + "c1_sd_dwell REAL," + "c2_sd_dwell REAL," + "UNIQUE (transcriptid, kmer)," + "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)" + ")") + # TODO: are "c1" and "c2" (conditions) properly defined? + + create_gmm_results_query = ("CREATE TABLE IF NOT EXISTS gmm_results (" + "statsid INTEGER NOT NULL UNIQUE," + "n_components INTEGER," + "cluster_counts VARCHAR," + "logit_pvalue REAL," + "anova_pvalue REAL," + "FOREIGN KEY (statsid) REFERENCES kmer_stats(id)" + ")") # TODO: store GMM cluster counts in a separate table (one row per sample) + # TODO: store additional data from logit/ANOVA tests? 
+ + create_univariate_results_query = ("CREATE TABLE IF NOT EXISTS univariate_results (" + "statsid INTEGER NOT NULL," + "test VARCHAR NOT NULL CHECK (test in ('ST', 'MW', 'KS'))," + "intensity_pvalue REAL," + "dwell_pvalue REAL," + "UNIQUE (statsid, test)," + "FOREIGN KEY (statsid) REFERENCES kmer_stats(id)" + ")") create_tables_queries = [create_transcripts_query, create_whitelist_query, - create_test_results_query] + create_kmer_stats_query, create_gmm_results_query, + create_univariate_results_query] def __insert_transcript_get_id(self, tx_name): try: - self._cursor.execute("SELECT id FROM transcripts WHERE name = ?", tx_name) + self._cursor.execute("SELECT id FROM transcripts WHERE name = ?", [tx_name]) if (row := self._cursor.fetchone()) is not None: return row["id"] - self._cursor.execute("INSERT INTO transcripts VALUES (NULL, ?)", tx_name) + self._cursor.execute("INSERT INTO transcripts VALUES (NULL, ?)", [tx_name]) self._connection.commit() # TODO: if there could be multiple writing threads, "INSERT OR IGNORE" # query should go before "SELECT" @@ -345,27 +357,39 @@ def __insert_transcript_get_id(self, tx_name): def store_test_results(self, tx_name, test_results): if not self._connection: raise NanocomporeError("Database connection not yet opened") - tx_id = self._insert_transcript_get_id(tx_name) - univar_pvalues = [f"{t}_{m}_pvalue" for t in ["MW", "KS", "TT"] + tx_id = self.__insert_transcript_get_id(tx_name) + univar_pvalues = [f"{t}_{m}_pvalue" for t in ["MW", "KS", "ST"] for m in ["intensity", "dwell"]] for kmer, res in test_results.items(): values = [tx_id, kmer] - for key in univar_pvalues: - values.append(res.get(key)) # appends 'None' if key doesn't exist - if "GMM_model" in res: - values += [res["GMM_model"]["model"].n_components, - res["GMM_model"]["cluster_counts"], - res["GMM_logit_model"]["pvalue"], - res["GMM_logit_model"]["coef"]] - else: - values += [None, None, None, None] - values.append(res["shift_stats"].values()) + values += res["shift_stats"].values() try: - self._cursor.execute("INSERT INTO test_results VALUES (NULL" + ", ?" * len(values) + ")", values) + self._cursor.execute("INSERT INTO kmer_stats VALUES (NULL" + ", ?" 
* len(values) + ")", values)
             except:
-                logger.error(f"Error storing test results for transcript '{tx_name}'")
+                logger.error(f"Error storing statistics for transcript '{tx_name}', kmer {kmer}")
                 raise
-        self._connection.commit()
+            statsid = self._cursor.lastrowid
+            for test in ["MW", "KS", "ST"]:
+                ipv = res.get(test + "_intensity_pvalue")
+                dpv = res.get(test + "_dwell_pvalue")
+                if (ipv is not None) or (dpv is not None): # can't use ':=' here because we need both values
+                    try:
+                        self._cursor.execute("INSERT INTO univariate_results VALUES (?, ?, ?, ?)",
+                                             (statsid, test, ipv, dpv))
+                    except:
+                        logger.error(f"Error storing {test} test results for transcript '{tx_name}', kmer {kmer}")
+                        raise
+            if "GMM_model" in res:
+                lpv = res.get("GMM_logit_pvalue")
+                apv = res.get("GMM_anova_pvalue")
+                try:
+                    self._cursor.execute("INSERT INTO gmm_results VALUES (?, ?, ?, ?, ?)",
+                                         (statsid, res["GMM_model"]["model"].n_components,
+                                          res["GMM_model"]["cluster_counts"], lpv, apv))
+                except:
+                    logger.error(f"Error storing GMM results for transcript '{tx_name}', kmer {kmer}")
+                    raise
+        self._connection.commit()

     def store_whitelist(self, whitelist):
         if not self._connection:

From 24881a4ff14e76c55a1a24b01028df8cdd5be439 Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Tue, 6 Apr 2021 21:19:45 +0100
Subject: [PATCH 24/49] SampComp: remove unused parameters, write output to SQLite
---
 nanocompore/Eventalign_collapse.py |  2 +-
 nanocompore/SampComp.py            | 85 ++++++++++++++++--------------
 2 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py
index ebed305..7c6f6d6 100644
--- a/nanocompore/Eventalign_collapse.py
+++ b/nanocompore/Eventalign_collapse.py
@@ -267,7 +267,7 @@ def __write_output_to_db(self, out_q, error_q):
                 pr.dump_stats("prof")


-    def __write_output (self, out_q, error_q):
+    def __write_output(self, out_q, error_q):
         """
         Mono-threaded Writer
         """
diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py
index 10eae88..7765765 100644
--- a/nanocompore/SampComp.py
+++ b/nanocompore/SampComp.py
@@ -38,12 +38,9 @@ class SampComp(object):
     def __init__(self,
                  input_db_path:str,
-                 sample_dict:dict,
-                 fasta_fn:str,
-                 bed_fn:str = None,
                  output_db_path:str,
-                 outpath:str = "results",
-                 outprefix:str = "out",
+                 sample_dict:dict,
+                 fasta_fn:str = "",
                  overwrite:bool = False,
                  whitelist:Whitelist = None,
                  comparison_methods:list = ["GMM", "KS"],
@@ -62,32 +59,28 @@ def __init__(self,
                  progress:bool = False):

         """
-        Initialise a `SampComp` object and generates a white list of references with sufficient coverage for subsequent analysis.
+        Initialise a `SampComp` object and generate a whitelist of references with sufficient coverage for subsequent analysis.
         The returned object can then be called to start the analysis.
         Args:
         * input_db_path
             Path to the SQLite database file with event-aligned read/kmer data
+        * output_db_path
+            Path to the SQLite database file for storing results
         * sample_dict
             Dictionary containing lists of (unique) sample names, grouped by condition
-            example d = {"control": ["C1", "C2"], "treatment": ["T1", "T2"]}
+            Example: d = {"control": ["C1", "C2"], "treatment": ["T1", "T2"]}
         * fasta_fn
             Path to a fasta file corresponding to the reference used for read alignment.
-        * bed_fn
-            Path to a BED file containing the annotation of the transcriptome used as reference when mapping.
-        * output_db_path
-            Path to the SQLite database file for storing results
-        * outpath
-            Path to the output folder.
- * outprefix - text outprefix for all the files generated by the function. + Not needed if 'whitelist' argument is provided. * overwrite - If the output directory already exists, the standard behaviour is to raise an error to prevent overwriting existing data - This option ignore the error and overwrite data if they have the same outpath and outprefix. + If the output database already exists, overwrite it with a new database? + By default, new data will be added to previous data. * whitelist - Whitelist object previously generated with nanocompore Whitelist. If not given, will be automatically generated. + Whitelist object previously generated with nanocompore Whitelist. + If not given, will be automatically generated. * comparison_methods - Statistical method to compare the two samples (mann_whitney or MW, kolmogorov_smirnov or KS, t_test or TT, gaussian_mixture_model or GMM). - This can be a list or a comma separated string. {MW,KS,TT,GMM} + Statistical method to compare the two samples (mann_whitney or MW, kolmogorov_smirnov or KS, student_t or ST, gaussian_mixture_model or GMM). + This can be a list or a comma separated string. {MW,KS,ST,GMM} * logit Force logistic regression even if we have less than 2 replicates in any condition. * allow_warnings @@ -122,12 +115,6 @@ def __init__(self, check_sample_dict(sample_dict) logger.debug(sample_dict) - # Check if fasta and bed files exist - if not access_file(fasta_fn): - raise NanocomporeError("{} is not a valid FASTA file".format(fasta_fn)) - if bed_fn and not access_file(bed_fn): - raise NanocomporeError("{} is not a valid BED file".format(bed_fn)) - # Check threads number if nthreads < 3: raise NanocomporeError("The minimum number of threads is 3") @@ -139,13 +126,13 @@ def __init__(self, for i, method in enumerate(comparison_methods): method = method.upper() if method in ["MANN_WHITNEY", "MW"]: - comparison_methods[i]="MW" + comparison_methods[i] = "MW" elif method in ["KOLMOGOROV_SMIRNOV", "KS"]: - comparison_methods[i]="KS" - elif method in ["T_TEST", "TT"]: - comparison_methods[i]="TT" + comparison_methods[i] = "KS" + elif method in ["STUDENT_T", "ST"]: + comparison_methods[i] = "ST" elif method in ["GAUSSIAN_MIXTURE_MODEL", "GMM"]: - comparison_methods[i]="GMM" + comparison_methods[i] = "GMM" else: raise NanocomporeError("Invalid comparison method {}".format(method)) @@ -161,8 +148,12 @@ def __init__(self, exclude_ref_id = exclude_ref_id) elif not isinstance(whitelist, Whitelist): raise NanocomporeError("Whitelist is not valid") - with DataStore_SampComp(output_db_path, DBCreateMode.CREATE_MAYBE) as db: + + self.__output_db_path = output_db_path + db_create_mode = DBCreateMode.OVERWRITE if overwrite else DBCreateMode.CREATE_MAYBE + with DataStore_SampComp(self.__output_db_path, db_create_mode) as db: db.store_whitelist(whitelist) + # TODO: move this to '__call__'? 
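With the reworked signature, a minimal invocation now looks as follows (a usage sketch; the file names and sample labels are placeholders, not values from the patch):

    # Hypothetical usage of the new interface; all paths are placeholders.
    sc = SampComp(input_db_path="out_nanocompore.db",
                  output_db_path="out_SampComp.db",
                  sample_dict={"control": ["C1", "C2"], "treatment": ["T1", "T2"]},
                  fasta_fn="reference_transcriptome.fa")
    sc()  # runs the comparison and writes results to the output database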
# Set private args from whitelist args
         self.__min_coverage = whitelist._Whitelist__min_coverage
@@ -173,8 +164,6 @@ def __init__(self,
         self.__input_db_path = input_db_path
         self.__sample_dict = sample_dict
         self.__fasta_fn = fasta_fn
-        self.__bed_fn = bed_fn
-        self.__db_fn = os.path.join(outpath, outprefix + "_SampComp.db")
         self.__whitelist = whitelist
         self.__comparison_methods = comparison_methods
         self.__logit = logit
@@ -206,7 +195,7 @@ def __call__(self):
             ps_list.append(mp.Process(target=self.__list_refid, args=(in_q, error_q)))
             for i in range(self.__nthreads):
                 ps_list.append(mp.Process(target=self.__process_references, args=(in_q, out_q, error_q)))
-            ps_list.append(mp.Process(target=self.__write_output, args=(out_q, error_q)))
+            ps_list.append(mp.Process(target=self.__write_output_to_db, args=(out_q, error_q)))

         # Start processes and monitor error queue
         try:
@@ -225,12 +214,6 @@ def __call__(self):
             for q in (in_q, out_q, error_q):
                 q.close()

-            # Return database wrapper object
-            return SampCompDB(
-                db_fn=self.__db_fn,
-                fasta_fn=self.__fasta_fn,
-                bed_fn=self.__bed_fn)
-
         # Catch error, kill all processes and reraise error
         except Exception as E:
             logger.error("An error occurred. Killing all processes and closing queues\n")
@@ -359,6 +342,26 @@ def __process_references(self, in_q, out_q, error_q):
             out_q.put(None)


+    def __write_output_to_db(self, out_q, error_q):
+        n_tx = 0
+        try:
+            # Database was already created earlier to store the whitelist!
+            with DataStore_SampComp(self.__output_db_path, DBCreateMode.MUST_EXIST) as db:
+                # Iterate over the counter queue and process items until all poison pills are found
+                for _ in range(self.__nthreads):
+                    for ref_id, kmer_data, test_results in iter(out_q.get, None):
+                        logger.debug("Writer thread storing transcript %s" % ref_id)
+                        db.store_test_results(ref_id, test_results)
+                        n_tx += 1
+        except Exception:
+            logger.error("Error in writer thread")
+            error_q.put(traceback.format_exc())
+        finally:
+            logger.info(f"All done.
Transcripts processed: {n_tx}") + # Kill error queue with poison pill + error_q.put(None) + + def __write_output(self, out_q, error_q): # Get results out of the out queue and write in shelve pvalue_tests = set() From fa0ec959a294e48f9c969619dd44ab86110d1380 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 16 Jul 2021 11:08:39 +0100 Subject: [PATCH 25/49] TxComp: coding style - added whitespace --- nanocompore/TxComp.py | 87 ++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index 2cd9929..4ac78af 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -205,21 +205,21 @@ def gmm_test(data, random_state, anova=True, logit=False, verbose=True, allow_wa if anova: aov_results = gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings) else: - aov_results=None + aov_results = None if logit: logit_results = gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels) else: - logit_results=None + logit_results = None elif gmm_ncomponents == 1: - aov_results = {'pvalue': np.nan, 'delta_logit': np.nan, 'table': "NC", 'cluster_counts': "NC"} - logit_results = {'pvalue': np.nan, 'coef': "NC", 'model': "NC"} - cluster_counts = "NC" + aov_results = {'pvalue': np.nan, 'delta_logit': np.nan, 'table': None, 'cluster_counts': None} + logit_results = {'pvalue': np.nan, 'coef': None, 'model': None} + cluster_counts = None else: - raise NanocomporeError("GMM models with n_component>2 are not supported") + raise NanocomporeError("GMM models with n_component > 2 are not supported") - return({'anova':aov_results, 'logit': logit_results, 'gmm':{'model': gmm_mod, 'cluster_counts': cluster_counts}}) + return({'anova': aov_results, 'logit': logit_results, 'gmm': {'model': gmm_mod, 'cluster_counts': cluster_counts}}) def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied', 'diag', 'full']): @@ -243,23 +243,23 @@ def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings=False): - labels= [] + labels = [] logr = [] - for sample,counter in counters.items(): + for sample, counter in counters.items(): # Save the condition label the corresponds to the current sample labels.append(sample_condition_labels[sample]) # The Counter dictionaries in counters are not ordered # The following line enforces the order and adds 1 to avoid empty clusters - ordered_counter = [ counter[i]+1 for i in range(gmm_ncomponents)] + ordered_counter = [counter[i] + 1 for i in range(gmm_ncomponents)] total = sum(ordered_counter) - normalised_ordered_counter = [ i/total for i in ordered_counter ] + normalised_ordered_counter = [i / total for i in ordered_counter] # Loop through ordered_counter and divide each value by the first - logr.append(np.log(normalised_ordered_counter[0]/(1-normalised_ordered_counter[0]))) + logr.append(np.log(normalised_ordered_counter[0] / (1 - normalised_ordered_counter[0]))) logr = np.around(np.array(logr), decimals=9) - logr_s1 = [logr[i] for i,l in enumerate(labels) if l==condition_labels[0]] - logr_s2 = [logr[i] for i,l in enumerate(labels) if l==condition_labels[1]] + logr_s1 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[0]] + logr_s2 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[1]] # If the SS for either array is 0, skip the anova test - if 
sum_of_squares(logr_s1-np.mean(logr_s1)) == 0 and sum_of_squares(logr_s2-np.mean(logr_s2)) == 0: + if sum_of_squares(logr_s1-np.mean(logr_s1)) == 0 and sum_of_squares(logr_s2 - np.mean(logr_s2)) == 0: if not allow_warnings: raise NanocomporeError("While doing the Anova test we found a sample with within variance = 0. Use --allow_warnings to ignore.") else: @@ -282,7 +282,7 @@ def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncom if aov_pvalue == 0: raise NanocomporeError("The Anova test returned a p-value of 0. This is most likely an error somewhere") # Calculate the delta log odds ratio, i.e. the difference of the means of the log odds ratios between the two conditions - aov_delta_logit=float(np.mean(logr_s1)-np.mean(logr_s2)) + aov_delta_logit = float(np.mean(logr_s1) - np.mean(logr_s2)) aov_results = {'pvalue': aov_pvalue, 'delta_logit': aov_delta_logit, 'table': aov_table, 'log_ratios':logr} return(aov_results) @@ -300,7 +300,7 @@ def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): logit_mod=logit.fit(disp=0) logit_pvalue, logit_coef = logit_mod.pvalues[1], logit_mod.params[1] except ConvergenceWarning: - logit_mod, logit_pvalue, logit_coef = "NC", 1, "NC" + logit_mod, logit_pvalue, logit_coef = None, 1, None if logit_pvalue == 0: logit_pvalue = np.finfo(np.float).tiny logit_results = {'pvalue': logit_pvalue, 'coef': logit_coef, 'model': logit_mod} @@ -338,22 +338,23 @@ def cross_corr_matrix(pvalues_vector, context=2): """ Calculate the cross correlation matrix of the pvalues for a given context. """ - if len(pvalues_vector)<(context*3)+3: + if len(pvalues_vector) < (context * 3) + 3: raise NanocomporeError("Not enough p-values for a context of order %s"%context) pvalues_vector = np.array([ i if not np.isnan(i) else 1 for i in pvalues_vector ]) - if any(pvalues_vector==0) or any(np.isinf(pvalues_vector)) or any(pvalues_vector>1): + if any(pvalues_vector == 0) or any(np.isinf(pvalues_vector)) or any(pvalues_vector > 1): raise NanocomporeError("At least one p-value is invalid") - matrix=[] - s=pvalues_vector.size - if all(p==1 for p in pvalues_vector): - return(np.ones((context*2+1, context*2+1))) + matrix = [] + s = pvalues_vector.size + if all(p == 1 for p in pvalues_vector): + return(np.ones((context * 2 + 1, context * 2 + 1))) - for i in range(-context,context+1): - row=[] - for j in range(-context,context+1): - row.append(np.corrcoef((np.roll(pvalues_vector,i)[context:s-context]), (np.roll(pvalues_vector,j)[context:s-context]))[0][1]) + for i in range(-context, context + 1): + row = [] + for j in range(-context , context + 1): + row.append(np.corrcoef((np.roll(pvalues_vector, i)[context:s - context]), + (np.roll(pvalues_vector, j)[context:s - context]))[0][1]) matrix.append(row) return(np.array(matrix)) @@ -374,32 +375,32 @@ def combine_pvalues_hou(pvalues, weights, cor_mat): raise NanocomporeError("Can't combine pvalues is pvalues and weights are not the same length.") if( cor_mat.shape[0] != cor_mat.shape[1] or cor_mat.shape[0] != len(pvalues)): raise NanocomporeError("The correlation matrix needs to be squared, with each dimension equal to the length of the pvalued vector.") - if all(p==1 for p in pvalues): + if all(p == 1 for p in pvalues): return 1 - if any((p==0 or np.isinf(p) or p>1) for p in pvalues): + if any((p == 0 or np.isinf(p) or p > 1) for p in pvalues): raise NanocomporeError("At least one p-value is invalid") # Covariance estimation as in Kost and McDermott (eq:8) # https://doi.org/10.1016/S0167-7152(02)00310-3 cov = 
lambda r: (3.263*r)+(0.710*r**2)+(0.027*r**3) - k=len(pvalues) - cov_sum=np.float64(0) - sw_sum=np.float64(0) - w_sum=np.float64(0) - tau=np.float64(0) + k = len(pvalues) + cov_sum = np.float64(0) + sw_sum = np.float64(0) + w_sum = np.float64(0) + tau = np.float64(0) for i in range(k): - for j in range(i+1,k): - cov_sum += weights[i]*weights[j]*cov(cor_mat[i][j]) + for j in range(i + 1, k): + cov_sum += weights[i] * weights[j] * cov(cor_mat[i][j]) sw_sum += weights[i]**2 w_sum += weights[i] # Calculate the weighted Fisher's combination statistic - tau += weights[i] * (-2*np.log(pvalues[i])) + tau += weights[i] * (-2 * np.log(pvalues[i])) # Correction factor - c = (2*sw_sum+cov_sum) / (2*w_sum) + c = (2 * sw_sum + cov_sum) / (2 * w_sum) # Degrees of freedom - f = (4*w_sum**2) / (2*sw_sum+cov_sum) + f = (4 * w_sum**2) / (2 * sw_sum + cov_sum) # chi2.sf is the same as 1-chi2.cdf but is more accurate - combined_p_value = chi2.sf(tau/c,f) + combined_p_value = chi2.sf(tau/c, f) # Return a very small number if pvalue = 0 if combined_p_value == 0: combined_p_value = np.finfo(np.float).tiny @@ -408,8 +409,8 @@ def combine_pvalues_hou(pvalues, weights, cor_mat): def harmonic_series(sequence_context): weights = [] - for i in range(-sequence_context, sequence_context+1): - weights.append(1/(abs(i)+1)) + for i in range(-sequence_context, sequence_context + 1): + weights.append(1 / (abs(i) + 1)) return weights @@ -418,7 +419,7 @@ def sum_of_squares(x): Square each element of the input array and return the sum """ x = np.atleast_1d(x) - return np.sum(x*x) + return np.sum(x * x) def has_low_coverage(pos_dict, min_coverage): From 112e50db8c8549e6503dae936624aec53fd614ff Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 16 Jul 2021 11:12:13 +0100 Subject: [PATCH 26/49] DataStore: split 'gmm_results' SQL table into two; improve error logging during table creation --- nanocompore/DataStore.py | 83 +++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 4cb0f44..bfa6b69 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -22,7 +22,7 @@ class DBCreateMode(Enum): class DataStore(object): """Store Nanocompore data in an SQLite database - base class""" - create_tables_queries = [] # to be filled by derived classes + create_tables_queries = {} # table name -> SQL query (to be filled by derived classes) def __init__(self, db_path:str, @@ -36,11 +36,11 @@ def _init_db(self): if self.create_tables_queries: logger.debug("Setting up database tables") try: - for query in self.create_tables_queries: + for table, query in self.create_tables_queries.items(): self._cursor.execute(query) self._connection.commit() except: - logger.error("Error creating database tables") + logger.error(f"Error creating database table '{table}'") raise def __enter__(self): @@ -134,8 +134,10 @@ class DataStore_EventAlign(DataStore): ")" ) - create_tables_queries = [create_reads_query, create_kmers_query, - create_samples_query, create_transcripts_query] + create_tables_queries = {"reads": create_reads_query, + "kmers": create_kmers_query, + "samples": create_samples_query, + "transcripts": create_transcripts_query} def store_read(self, read): """ @@ -316,29 +318,41 @@ class DataStore_SampComp(DataStore): ")") # TODO: are "c1" and "c2" (conditions) properly defined? 
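Because the c1_*/c2_* summaries now live in 'kmer_stats', simple shift queries no longer need the raw signal data; a sketch assuming only the table defined above (the function name is illustrative):

    import sqlite3

    # Sketch: rank k-mers by the absolute difference in mean intensity
    # between the two conditions, using only the 'kmer_stats' summaries.
    def rank_by_intensity_shift(db_path, limit=10):
        connection = sqlite3.connect(db_path)
        query = ("SELECT transcriptid, kmer, "
                 "ABS(c1_mean_intensity - c2_mean_intensity) AS shift "
                 "FROM kmer_stats ORDER BY shift DESC LIMIT ?")
        rows = connection.execute(query, (limit,)).fetchall()
        connection.close()
        return rows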
+ create_gmm_stats_query = ("CREATE TABLE IF NOT EXISTS gmm_stats (" + "kmer_statsid INTEGER NOT NULL UNIQUE," + "covariance_type VARCHAR," + "n_components INTEGER," + "cluster_counts VARCHAR," + "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)" + ")") + # TODO: store GMM cluster counts in a separate table (one row per sample) + # TODO: if "covariance_type" is the same for all rows, store it in a "parameters" table? + + # TODO: add column for adjusted p-values in tables below? create_gmm_results_query = ("CREATE TABLE IF NOT EXISTS gmm_results (" - "statsid INTEGER NOT NULL UNIQUE," - "n_components INTEGER," - "cluster_counts VARCHAR," - "logit_pvalue REAL," - "anova_pvalue REAL," - "FOREIGN KEY (statsid) REFERENCES kmer_stats(id)" + "gmm_statsid INTEGER NOT NULL UNIQUE," + "test VARCHAR NOT NULL CHECK (test in ('anova', 'logit'))," + "test_pvalue REAL," + "test_stat REAL," # anova: delta logit, logit: LOR + "UNIQUE (gmm_statsid, test)," + "FOREIGN KEY (gmm_statsid) REFERENCES gmm_stats(id)" ")") - # TODO: store GMM cluster counts in a separate table (one row per sample) - # TODO: store additional data from logit/ANOVA tests? create_univariate_results_query = ("CREATE TABLE IF NOT EXISTS univariate_results (" - "statsid INTEGER NOT NULL," + "kmer_statsid INTEGER NOT NULL," "test VARCHAR NOT NULL CHECK (test in ('ST', 'MW', 'KS'))," "intensity_pvalue REAL," "dwell_pvalue REAL," - "UNIQUE (statsid, test)," - "FOREIGN KEY (statsid) REFERENCES kmer_stats(id)" + "UNIQUE (kmer_statsid, test)," + "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)" ")") - create_tables_queries = [create_transcripts_query, create_whitelist_query, - create_kmer_stats_query, create_gmm_results_query, - create_univariate_results_query] + create_tables_queries = {"transcripts": create_transcripts_query, + "whitelist": create_whitelist_query, + "kmer_stats": create_kmer_stats_query, + "gmm_stats": create_gmm_stats_query, + "gmm_results": create_gmm_results_query, + "univariate_results": create_univariate_results_query} def __insert_transcript_get_id(self, tx_name): try: @@ -354,6 +368,7 @@ def __insert_transcript_get_id(self, tx_name): logger.error(f"Failed to insert/look up transcript '{tx_name}'") raise + def store_test_results(self, tx_name, test_results): if not self._connection: raise NanocomporeError("Database connection not yet opened") @@ -368,29 +383,43 @@ def store_test_results(self, tx_name, test_results): except: logger.error(f"Error storing statistics for transcript '{tx_name}', kmer {kmer}") raise - statsid = self._cursor.lastrowid + kmer_statsid = self._cursor.lastrowid for test in ["MW", "KS", "ST"]: ipv = res.get(test + "_intensity_pvalue") dpv = res.get(test + "_dwell_pvalue") if (ipv is not None) or (dpv is not None): # can't use ':=' here because we need both values try: self._cursor.execute("INSERT INTO univariate_results VALUES (?, ?, ?, ?)", - (statsid, test, ipv, dpv)) + (kmer_statsid, test, ipv, dpv)) except: logger.error(f"Error storing {test} test results for transcript '{tx_name}', kmer {kmer}") raise if "GMM_model" in res: - lpv = res.get("GMM_logit_pvalue") - apv = res.get("GMM_anova_pvalue") try: - self._cursor.execute("INSERT INTO gmm_results VALUES (?, ?, ?, ?, ?)", - (statsid, res["GMM_model"]["model"].n_components, - res["GMM_model"]["cluster_counts"], lpv, apv)) + self._cursor.execute("INSERT INTO gmm_stats VALUES (?, ?, ?, ?)", + (kmer_statsid, + res["GMM_model"]["model"].covariance_type, + res["GMM_model"]["model"].n_components, + res["GMM_model"]["cluster_counts"])) except: 
- logger.error(f"Error storing GMM results for transcript '{tx_name}', kmer {kmer}") + logger.error(f"Error storing GMM stats for transcript '{tx_name}', kmer {kmer}") raise + gmm_statsid = self._cursor.lastrowid + # store results of logit and/or ANOVA test on GMM components: + test_stats = {"logit": "coef", "anova": "delta_logit"} + for test, stat in test_stats.items(): + if f"GMM_{test}_pvalue" in res: + try: + self._cursor.execute("INSERT INTO gmm_results VALUES (?, ?, ?, ?)", + (gmm_statsid, test, + res[f"GMM_{test}_pvalue"], + res[f"GMM_{test}_model"][stat])) + except: + logger.error(f"Error storing GMM {test} results for transcript '{tx_name}', kmer {kmer}") + raise self._connection.commit() + def store_whitelist(self, whitelist): if not self._connection: raise NanocomporeError("Database connection not yet opened") From 6d00f2e48a703c10262c9d2d46983bcaeb6539c6 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 16 Jul 2021 11:12:59 +0100 Subject: [PATCH 27/49] add new class 'PostProcess' for data export etc. (work in progress) --- nanocompore/PostProcess.py | 280 +++++++++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 nanocompore/PostProcess.py diff --git a/nanocompore/PostProcess.py b/nanocompore/PostProcess.py new file mode 100644 index 0000000..92a73df --- /dev/null +++ b/nanocompore/PostProcess.py @@ -0,0 +1,280 @@ +# -*- coding: utf-8 -*- + +#~~~~~~~~~~~~~~IMPORTS~~~~~~~~~~~~~~# +# Std lib +from loguru import logger + +# Third party +from statsmodels.stats.multitest import multipletests + + +#~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~# +class PostProcess(object): + """Helper class for post-processing `SampComp` results""" + + def __init___(self, sampcomp_db_path:str, eventalign_db_path:str, bed_path:str=None): + self._sampcomp_db_path = sampcomp_db_path + self._eventalign_db_path = eventalign_db_path + self._bed_path = bed_path + + + def save_all(self, outpath_prefix=None, pvalue_thr=0.01): + """ + Save all text reports including genomic coordinate if a bed file was provided + * outpath_prefix + outpath + prefix to use as a basename for output files. + If not given, it will use the same prefix as the database. + * pvalue_thr + pvalue threshold to report significant sites in bed files + """ + if not outpath_prefix: + outpath_prefix = self._db_path.replace("SampComp.db", "") + logger.debug("Save reports to {}".format(outpath_prefix)) + + # Save reports + logger.debug("\tSaving extended tabular report") + self.save_report(output_fn = outpath_prefix + "nanocompore_results.tsv") + logger.debug("\tSaving shift results") + self.save_shift_stats(output_fn = outpath_prefix + "nanocompore_shift_stats.tsv") + + # Save bed and bedgraph files for each method used + if self._bed_path: + logger.debug("\tSaving significant genomic coordinates in Bed and Bedgraph format") + for m in self._metadata["pvalue_tests"]: + self.save_to_bed( + output_fn = outpath_prefix+"sig_sites_{}_thr_{}.bed".format(m, pvalue_thr), + bedgraph=False, pvalue_field=m, pvalue_thr=pvalue_thr, span=5, title="Nanocompore Significant Sites") + self.save_to_bed( + output_fn = outpath_prefix+"sig_sites_{}_thr_{}.bedgraph".format(m, pvalue_thr), + bedgraph=True, pvalue_field=m, pvalue_thr=pvalue_thr, title="Nanocompore Significant Sites") + + + def save_to_bed(self, output_fn=None, bedgraph=False, pvalue_field=None, pvalue_thr=0.01, span=5, convert=None, assembly=None, title=None): + """ + Save the position of significant positions in the genome space in BED6 or BEDGRAPH format. 
+        The resulting file can be used in a genome browser to visualise significant genomic locations.
+        This option is only available if `PostProcess` is initialised with a BED file containing genome annotations.
+        * output_fn
+            Path to file where to write the data
+        * bedgraph
+            save file in bedgraph format instead of bed
+        * pvalue_field
+            specifies what column to use as BED score (field 5, as -log10)
+        * pvalue_thr
+            only report positions with pvalue<=thr
+        * span
+            The size of each BED feature.
+            If size=5 (default) features correspond to kmers.
+            If size=1 features correspond to the first base of each kmer.
+        * convert
+            one of 'ensembl_to_ucsc' or 'ucsc_to_ensembl'. Converts chromosome names between Ensembl and UCSC conventions
+        * assembly
+            required if convert is used. One of "hg38" or "mm10"
+        """
+        if self._bed_path is None:
+            raise NanocomporeError("In order to generate a BED file PostProcess needs to be initialised with a transcriptome BED")
+        if span < 1:
+            raise NanocomporeError("span has to be >=1")
+        if span != 5 and bedgraph:
+            raise NanocomporeError("Span is ignored when generating bedGraph files")
+        if "results" not in self.__dict__:
+            raise NanocomporeError("It looks like there's no results slot in SampCompDB")
+        if pvalue_field not in self.results:
+            raise NanocomporeError(("The field '%s' is not in the results" % pvalue_field))
+        if convert not in [None, "ensembl_to_ucsc", "ucsc_to_ensembl"]:
+            raise NanocomporeError("Convert value not valid")
+        if convert is not None and assembly is None:
+            raise NanocomporeError("The assembly argument is required in order to do the conversion. Choose one of 'hg38' or 'mm10'")
+
+        with open(output_fn, "w") as bed_file:
+            if title is not None:
+                if not bedgraph:
+                    bed_file.write('track type=bed name="%s" description="%s"\n' % (title, pvalue_field))
+                else:
+                    bed_file.write('track type=bedGraph name="%s" description="%s"\n' % (title, pvalue_field))
+
+            Record = namedtuple('Record', ['chr', 'genomicPos', 'ref_id', 'strand', 'ref_kmer', pvalue_field])
+            threshold = -log(pvalue_thr, 10)
+            for record in self.results[list(Record._fields)].itertuples(index=False, name="Record"):
+                pvalue = getattr(record, pvalue_field)
+                if np.isnan(pvalue):
+                    pvalue = 0
+                elif pvalue < sys.float_info.min:
+                    pvalue = -log(sys.float_info.min, 10)
+                else:
+                    pvalue = -log(pvalue, 10)
+                if not bedgraph and pvalue < threshold:
+                    continue
+                if bedgraph:
+                    if record.strand == "+":
+                        start_pos = record.genomicPos + 2
+                    else:
+                        start_pos = record.genomicPos - 2
+                    end_pos = start_pos + 1
+                else:
+                    if record.strand == "+":
+                        start_pos = record.genomicPos
+                    else:
+                        start_pos = record.genomicPos - span + 1
+                    end_pos = start_pos + span
+                line = bedline([record.chr, start_pos, end_pos, "%s_%s" % (record.ref_id, record.ref_kmer),
+                                pvalue, record.strand])
+                if convert == "ensembl_to_ucsc":
+                    line = line.translateChr(assembly=assembly, target="ucsc", patches=True)
+                elif convert == "ucsc_to_ensembl":
+                    line = line.translateChr(assembly=assembly, target="ens", patches=True)
+                if bedgraph:
+                    bed_file.write("%s\t%s\t%s\t%s\n" % (line.chr, line.start, line.end, line.score))
+                else:
+                    bed_file.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (line.chr, line.start, line.end,
+                                                                 line.name, line.score, line.strand))
+
+
+    def save_report(self, output_fn:str=None, include_shift_stats:bool=True):
+        """
+        Saves a tabulated text dump of the database containing all the statistical results for all the positions
+        * output_fn
+            Path to file where to write the data.
If None, data is returned to the standard output.
+        """
+        ## TODO: can this be done in a "with ..." clause?
+        if output_fn is None:
+            fp = sys.stdout
+        elif isinstance(output_fn, str):
+            try:
+                fp = open(output_fn, "w")
+            except:
+                raise NanocomporeError("Error opening output file %s" % output_fn)
+        else:
+            raise NanocomporeError("output_fn needs to be a string or None")
+
+        shift_stat_columns = []
+        if include_shift_stats:
+            shift_stat_columns = ["c1_mean_intensity", "c2_mean_intensity",
+                                  "c1_median_intensity", "c2_median_intensity",
+                                  "c1_sd_intensity", "c2_sd_intensity",
+                                  "c1_mean_dwell", "c2_mean_dwell",
+                                  "c1_median_dwell", "c2_median_dwell",
+                                  "c1_sd_dwell", "c2_sd_dwell"]
+
+        with DataStore_SampComp(self._sampcomp_db_path) as sc_db, \
+             DataStore_EventAlign(self._eventalign_db_path) as ea_db:
+            # Which statistical tests were performed?
+            query = "SELECT DISTINCT test FROM univariate_results"
+            univar_tests = [row["test"] for row in sc_db.cursor.execute(query)]
+            query = "SELECT DISTINCT test FROM gmm_results"
+            gmm_tests = [row["test"] for row in sc_db.cursor.execute(query)]
+            # Generate headers
+            headers = ['pos', 'chr', 'genomicPos', 'ref_id', 'strand', 'ref_kmer']
+            for test in sorted(univar_tests):
+                headers += [f"{test}_dwell_pvalue", f"{test}_intensity_pvalue"]
+            if gmm_tests:
+                # TODO: what if GMM was fitted, but no tests were performed?
+                headers += ["GMM_cov_type", "GMM_n_clust", "cluster_counts"]
+                if "logit" in gmm_tests:
+                    headers += ["GMM_logit_pvalue", "Logit_LOR"]
+                if "anova" in gmm_tests:
+                    headers += ["GMM_anova_pvalue", "Anova_delta_logit"]
+            # Write headers to file
+            fp.write('\t'.join([str(i) for i in headers]) + '\n')
+
+            # Merge kmer information with transcript name:
+            columns = ["kmer_stats.id", "transcriptid", "kmer AS pos", "name AS ref_id"] + shift_stat_columns
+            columns = ", ".join(columns)
+            query = f"SELECT {columns} FROM kmer_stats LEFT JOIN transcripts ON transcriptid = transcripts.id ORDER BY transcriptid, kmer"
+            for row in sc_db.cursor.execute(query):
+                db_data = dict(row)
+                # Get p-values etc.:
+                id = db_data["id"]
+                if univar_tests:
+                    query = f"SELECT test, intensity_pvalue, dwell_pvalue FROM univariate_results WHERE kmer_statsid = {id}"
+                    for row2 in sc_db.cursor.execute(query):
+                        test = row2["test"]
+                        db_data[test + "_intensity_pvalue"] = row2["intensity_pvalue"]
+                        db_data[test + "_dwell_pvalue"] = row2["dwell_pvalue"]
+                if gmm_tests:
+                    query = f"SELECT test, test_pvalue, test_stat FROM gmm_results WHERE gmm_statsid = {id}"
+                    for row2 in sc_db.cursor.execute(query):
+                        test = row2["test"]
+                        db_data[f"GMM_{test}_pvalue"] = row2["test_pvalue"]
+                        # 'test_stat' holds the LOR (logit) or the delta logit (anova)
+                        stat_label = "Logit_LOR" if test == "logit" else "Anova_delta_logit"
+                        db_data[stat_label] = row2["test_stat"]
+
+
+
+            # TODO: where does chromosome and genomic pos. information come from?
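One plausible answer to the TODO above: the BED file passed to the constructor could be parsed once into a per-transcript lookup; a simplified sketch that ignores multi-exon offsets (the helper name is an assumption, not existing code):

    # Hypothetical helper: map transcript names to genomic anchors from a
    # BED file; exon-aware coordinate liftover is omitted for brevity.
    def load_bed_annotation(bed_path):
        annotation = {}
        with open(bed_path) as bed_file:
            for line in bed_file:
                if line.startswith(("track", "browser", "#")):
                    continue
                fields = line.rstrip("\n").split("\t")
                # BED columns: chrom, start, end, name, score, strand
                annotation[fields[3]] = (fields[0], int(fields[1]), fields[5])
        return annotation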
+ + + # We loop over the IDs so that ref_pos_list can be prefetched for each transcript + for cur_id in self.ref_id_list: + cur_ref_pos_list = self[cur_id] + for record in self.results[self.results.ref_id == cur_id ].itertuples(): + if "GMM" in self._metadata["comparison_methods"]: + record_txComp = cur_ref_pos_list[record.pos]['txComp'] + line = [] + for f in headers: + if f in record._fields: + line.append(getattr(record, f)) + elif f == "GMM_cov_type": + line.append(record_txComp['GMM_model']['model'].covariance_type) + elif f == "GMM_n_clust": + line.append(record_txComp['GMM_model']['model'].n_components) + elif f == "cluster_counts": + line.append(record_txComp['GMM_model']['cluster_counts']) + elif f == "Anova_delta_logit": + line.append(record_txComp['GMM_anova_model']['delta_logit']) + elif f == "Logit_LOR": + line.append(record_txComp['GMM_logit_model']['coef']) + else: line.append("NA") + fp.write('\t'.join([ str(i) for i in line ])+'\n') + fp.close() + + + def save_shift_stats(self, output_fn=None): + """ + Save the mean, median and sd intensity and dwell time for each condition and for each position. + This can be used to evaluate the intensity of the shift for significant positions. + * output_fn + Path to file where to write the data. If None, data is returned to the standard output. + """ + if output_fn is None: + fp = sys.stdout + elif isinstance(output_fn, str): + try: + fp = open(output_fn, "w") + except: + raise NanocomporeError("Error opening output file %s" % output_fn) + else: + raise NanocomporeError("output_fn needs to be a string or None") + + headers = ['c1_mean_intensity', 'c2_mean_intensity', 'c1_median_intensity', 'c2_median_intensity', 'c1_sd_intensity', 'c2_sd_intensity', 'c1_mean_dwell', 'c2_mean_dwell', 'c1_median_dwell', 'c2_median_dwell', 'c1_sd_dwell', 'c2_sd_dwell'] + fp.write('\t'.join([ str(i) for i in ["ref_id", "pos"]+headers ])+'\n') + for tx, refpos in self: + for pos, refpos_list in enumerate(refpos): + if "txComp" in refpos_list: + ss = refpos_list['txComp']['shift_stats'] + if list(ss.keys()) != headers: + raise NanocomporeError("Mismatch in shift_stats headers") + line = [tx, pos, *ss.values()] + fp.write('\t'.join([ str(i) for i in line ])+'\n') + fp.close() + + + @staticmethod + def __multipletests_filter_nan(pvalues, method="fdr_bh"): + """ + Performs p-value correction for multiple hypothesis testing + using the method specified. The pvalues list can contain + np.nan values, which are ignored during p-value correction. 
+        test: input=[0.1, 0.01, np.nan, 0.01, 0.5, 0.4, 0.01, 0.001, np.nan, np.nan, 0.01, np.nan]
+        out: array([0.13333333, 0.016     ,        nan, 0.016     , 0.5       ,
+               0.45714286, 0.016     , 0.008     ,        nan,        nan,
+               0.016     ,        nan])
+        """
+        if all([np.isnan(p) for p in pvalues]):
+            return pvalues
+
+        pvalues_no_nan = [p for p in pvalues if not np.isnan(p)]
+        corrected_p_values = multipletests(pvalues_no_nan, method=method)[1]
+        for i, p in enumerate(pvalues):
+            if np.isnan(p):
+                corrected_p_values = np.insert(corrected_p_values, i, np.nan, axis=0)
+        return(corrected_p_values)

From d8324c77b76dda42c9f12e0e42241a7d55c73f6a Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Wed, 21 Jul 2021 11:35:40 +0100
Subject: [PATCH 28/49] SampComp/TxComp/DataStore: limit to one univariate and
 one GMM-based statistical test per k-mer
---
 nanocompore/DataStore.py |  79 ++++----------
 nanocompore/SampComp.py  |  57 +++++-----
 nanocompore/TxComp.py    | 225 ++++++++++++++++---------------------
 3 files changed, 152 insertions(+), 209 deletions(-)

diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py
index bfa6b69..5dae718 100644
--- a/nanocompore/DataStore.py
+++ b/nanocompore/DataStore.py
@@ -285,6 +285,13 @@ def store_sample_info(self, sample_dict):
 class DataStore_SampComp(DataStore):
     """Store Nanocompore data in an SQLite database - subclass for SampComp results"""

+    # TODO: add more parameters
+    create_parameters_query = ("CREATE TABLE IF NOT EXISTS parameters ("
+                               "univariate_test VARCHAR CHECK (univariate_test in ('ST', 'MW', 'KS')),"
+                               "gmm_covariance_type VARCHAR,"
+                               "gmm_test VARCHAR CHECK (gmm_test in ('anova', 'logit'))"
+                               ")")
+
     create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts ("
                                 "id INTEGER NOT NULL PRIMARY KEY,"
                                 "name VARCHAR NOT NULL UNIQUE"
                                 ")")
@@ -297,6 +304,7 @@ class DataStore_SampComp(DataStore):
         # "readid" is foreign key for "reads" table in EventAlign DB
         ")")

+    # TODO: add columns for adjusted p-values in tables below?
     create_kmer_stats_query = ("CREATE TABLE IF NOT EXISTS kmer_stats ("
                                "id INTEGER NOT NULL PRIMARY KEY,"
                                "transcriptid INTEGER NOT NULL,"
                                "kmer INTEGER NOT NULL,"
@@ -313,6 +321,8 @@ class DataStore_SampComp(DataStore):
                                "c1_sd_dwell REAL,"
                                "c2_sd_dwell REAL,"
+                               "intensity_pvalue REAL,"
+                               "dwell_pvalue REAL,"
                                "UNIQUE (transcriptid, kmer),"
                                "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)"
                                ")")
@@ -320,39 +330,18 @@ class DataStore_SampComp(DataStore):

     create_gmm_stats_query = ("CREATE TABLE IF NOT EXISTS gmm_stats ("
                               "kmer_statsid INTEGER NOT NULL UNIQUE,"
-                              "covariance_type VARCHAR,"
-                              "n_components INTEGER,"
+                              "n_components INTEGER NOT NULL,"
                               "cluster_counts VARCHAR,"
+                              "test_stat REAL,"
+                              "test_pvalue REAL,"
                               "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)"
                               ")")
-    # TODO: store GMM cluster counts in a separate table (one row per sample)
-    # TODO: if "covariance_type" is the same for all rows, store it in a "parameters" table?
-
-    # TODO: add column for adjusted p-values in tables below?
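The new 'parameters' table has no writer yet in this patch; a possible companion method might look like this (the method name and its call site are assumptions):

    # Hypothetical addition to DataStore_SampComp (not part of this patch):
    # record the run settings in the single-row 'parameters' table.
    def store_parameters(self, univariate_test, gmm_covariance_type, gmm_test):
        if not self._connection:
            raise NanocomporeError("Database connection not yet opened")
        self._cursor.execute("INSERT INTO parameters VALUES (?, ?, ?)",
                             (univariate_test, gmm_covariance_type, gmm_test))
        self._connection.commit()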
-    create_gmm_results_query = ("CREATE TABLE IF NOT EXISTS gmm_results ("
-                                "gmm_statsid INTEGER NOT NULL UNIQUE,"
-                                "test VARCHAR NOT NULL CHECK (test in ('anova', 'logit')),"
-                                "test_pvalue REAL,"
-                                "test_stat REAL," # anova: delta logit, logit: LOR
-                                "UNIQUE (gmm_statsid, test),"
-                                "FOREIGN KEY (gmm_statsid) REFERENCES gmm_stats(id)"
-                                ")")
-
-    create_univariate_results_query = ("CREATE TABLE IF NOT EXISTS univariate_results ("
-                                       "kmer_statsid INTEGER NOT NULL,"
-                                       "test VARCHAR NOT NULL CHECK (test in ('ST', 'MW', 'KS')),"
-                                       "intensity_pvalue REAL,"
-                                       "dwell_pvalue REAL,"
-                                       "UNIQUE (kmer_statsid, test),"
-                                       "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)"
-                                       ")")

-    create_tables_queries = {"transcripts": create_transcripts_query,
+    create_tables_queries = {"parameters": create_parameters_query,
+                             "transcripts": create_transcripts_query,
                              "whitelist": create_whitelist_query,
                              "kmer_stats": create_kmer_stats_query,
-                             "gmm_stats": create_gmm_stats_query,
-                             "gmm_results": create_gmm_results_query,
-                             "univariate_results": create_univariate_results_query}
+                             "gmm_stats": create_gmm_stats_query}

     def __insert_transcript_get_id(self, tx_name):
         try:
@@ -373,50 +362,28 @@ def store_test_results(self, tx_name, test_results):
         if not self._connection:
             raise NanocomporeError("Database connection not yet opened")
         tx_id = self.__insert_transcript_get_id(tx_name)
-        univar_pvalues = [f"{t}_{m}_pvalue" for t in ["MW", "KS", "ST"]
-                          for m in ["intensity", "dwell"]]
         for kmer, res in test_results.items():
             values = [tx_id, kmer]
             values += res["shift_stats"].values()
+            values.append(res.get("intensity_pvalue"))
+            values.append(res.get("dwell_pvalue"))
             try:
                 self._cursor.execute("INSERT INTO kmer_stats VALUES (NULL" + ", ?" * len(values) + ")", values)
             except:
                 logger.error(f"Error storing statistics for transcript '{tx_name}', kmer {kmer}")
                 raise
             kmer_statsid = self._cursor.lastrowid
-            for test in ["MW", "KS", "ST"]:
-                ipv = res.get(test + "_intensity_pvalue")
-                dpv = res.get(test + "_dwell_pvalue")
-                if (ipv is not None) or (dpv is not None): # can't use ':=' here because we need both values
-                    try:
-                        self._cursor.execute("INSERT INTO univariate_results VALUES (?, ?, ?, ?)",
-                                             (kmer_statsid, test, ipv, dpv))
-                    except:
-                        logger.error(f"Error storing {test} test results for transcript '{tx_name}', kmer {kmer}")
-                        raise
-            if "GMM_model" in res:
+            if "gmm_model" in res:
                 try:
-                    self._cursor.execute("INSERT INTO gmm_stats VALUES (?, ?, ?, ?)",
+                    self._cursor.execute("INSERT INTO gmm_stats VALUES (?, ?, ?, ?, ?)",
                                          (kmer_statsid,
-                                          res["GMM_model"]["model"].covariance_type,
-                                          res["GMM_model"]["model"].n_components,
-                                          res["GMM_model"]["cluster_counts"]))
+                                          res["gmm_model"].n_components,
+                                          res["gmm_cluster_counts"],
+                                          res["gmm_test_stat"],
+                                          res["gmm_pvalue"]))
                 except:
                     logger.error(f"Error storing GMM stats for transcript '{tx_name}', kmer {kmer}")
                     raise
-            gmm_statsid = self._cursor.lastrowid
-            # store results of logit and/or ANOVA test on GMM components:
-            test_stats = {"logit": "coef", "anova": "delta_logit"}
-            for test, stat in test_stats.items():
-                if f"GMM_{test}_pvalue" in res:
-                    try:
-                        self._cursor.execute("INSERT INTO gmm_results VALUES (?, ?, ?, ?)",
-                                             (gmm_statsid, test,
-                                              res[f"GMM_{test}_pvalue"],
-                                              res[f"GMM_{test}_model"][stat]))
-                    except:
-                        logger.error(f"Error storing GMM {test} results for transcript '{tx_name}', kmer {kmer}")
-                        raise
         self._connection.commit()


     def store_whitelist(self, whitelist):
         if not self._connection:
             raise NanocomporeError("Database connection not yet opened")

diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py
index 7765765..e67d6f4 100644
--- a/nanocompore/SampComp.py
+++ b/nanocompore/SampComp.py
@@ -32,10 +32,11 @@

 #~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~#
 class
SampComp(object):
-    """ Init analysis and check args"""
+    """Init analysis and check args"""

     #~~~~~~~~~~~~~~FUNDAMENTAL METHODS~~~~~~~~~~~~~~#

+    # TODO: use enums for univariate and gmm test parameters?
     def __init__(self,
                  input_db_path:str,
                  output_db_path:str,
@@ -43,9 +44,9 @@ def __init__(self,
                  fasta_fn:str = "",
                  overwrite:bool = False,
                  whitelist:Whitelist = None,
-                 comparison_methods:list = ["GMM", "KS"],
-                 logit:bool = True,
-                 anova:bool = False,
+                 univariate_test:str = "KS", # or: "MW", "ST"
+                 fit_gmm:bool = True,
+                 gmm_test:str = "logit", # or: "anova"
                  allow_warnings:bool = False,
                  sequence_context:int = 0,
                  sequence_context_weights:str = "uniform",
@@ -78,15 +79,16 @@ def __init__(self,
         * whitelist
             Whitelist object previously generated with nanocompore Whitelist.
             If not given, will be automatically generated.
-        * comparison_methods
-            Statistical method to compare the two samples (mann_whitney or MW, kolmogorov_smirnov or KS, student_t or ST, gaussian_mixture_model or GMM).
-            This can be a list or a comma separated string. {MW,KS,ST,GMM}
-        * logit
-            Force logistic regression even if we have less than 2 replicates in any condition.
+        * univariate_test
+            Statistical test to compare the two samples ('MW' for Mann-Whitney, 'KS' for Kolmogorov-Smirnov or 'ST' for Student's t), or empty for no test.
+        * fit_gmm
+            Fit a Gaussian mixture model (GMM) to the intensity/dwell-time distribution?
+        * gmm_test
+            Method to compare samples based on the GMM ('logit' or 'anova'), or empty for no comparison.
         * allow_warnings
             If True runtime warnings during the ANOVA tests don't raise an error.
         * sequence_context
-            Extend statistical analysis to contigous adjacent base if available.
+            Extend statistical analysis to contiguous adjacent bases if available.
         * sequence_context_weights
             type of weights to use for combining p-values.
{uniform,harmonic}
         * min_coverage
@@ -120,21 +122,10 @@ def __init__(self,
         check_sample_dict(sample_dict)
         logger.debug(sample_dict)

         # Check threads number
         if nthreads < 3:
             raise NanocomporeError("The minimum number of threads is 3")

         # Parse comparison methods
-        if comparison_methods:
-            if type(comparison_methods) == str:
-                comparison_methods = comparison_methods.split(",")
-            for i, method in enumerate(comparison_methods):
-                method = method.upper()
-                if method in ["MANN_WHITNEY", "MW"]:
-                    comparison_methods[i] = "MW"
-                elif method in ["KOLMOGOROV_SMIRNOV", "KS"]:
-                    comparison_methods[i] = "KS"
-                elif method in ["STUDENT_T", "ST"]:
-                    comparison_methods[i] = "ST"
-                elif method in ["GAUSSIAN_MIXTURE_MODEL", "GMM"]:
-                    comparison_methods[i] = "GMM"
-                else:
-                    raise NanocomporeError("Invalid comparison method {}".format(method))
+        if univariate_test and (univariate_test not in ["MW", "KS", "ST"]):
+            raise NanocomporeError(f"Invalid univariate test {univariate_test}")
+        if fit_gmm and gmm_test and (gmm_test not in ["logit", "anova"]):
+            raise NanocomporeError(f"Invalid GMM-based test {gmm_test}")

         if not whitelist:
             whitelist = Whitelist(input_db_path,
@@ -165,9 +156,9 @@ def __init__(self,
         self.__sample_dict = sample_dict
         self.__fasta_fn = fasta_fn
         self.__whitelist = whitelist
-        self.__comparison_methods = comparison_methods
-        self.__logit = logit
-        self.__anova = anova
+        self.__univariate_test = univariate_test
+        self.__fit_gmm = fit_gmm
+        self.__gmm_test = gmm_test
         self.__allow_warnings = allow_warnings
         self.__sequence_context = sequence_context
         self.__sequence_context_weights = sequence_context_weights
@@ -265,18 +256,18 @@ def process_transcript(self, tx_id, whitelist_reads):
         logger.debug(f"Data loaded for transcript: {tx_id}")

         test_results = {}
-        if self.__comparison_methods:
+        if self.__univariate_test or self.__fit_gmm:
             random_state = np.random.RandomState(seed=42)
             test_results = txCompare(tx_id,
                                      kmer_data,
                                      random_state=random_state,
-                                     methods=self.__comparison_methods,
+                                     univariate_test=self.__univariate_test,
+                                     fit_gmm=self.__fit_gmm,
+                                     gmm_test=self.__gmm_test,
                                      sequence_context=self.__sequence_context,
                                      sequence_context_weights=self.__sequence_context_weights,
                                      min_coverage=self.__min_coverage,
-                                     allow_warnings=self.__allow_warnings,
-                                     logit=self.__logit,
-                                     anova=self.__anova)
+                                     allow_warnings=self.__allow_warnings)

         # Remove 'default_factory' functions from 'kmer_data' to enable pickle/multiprocessing
         kmer_data.default_factory = None

diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py
index 4ac78af..ae2b8f0 100644
--- a/nanocompore/TxComp.py
+++ b/nanocompore/TxComp.py
@@ -5,7 +5,6 @@
 from collections import OrderedDict, Counter, defaultdict
 import warnings

-
 # Third party
 from loguru import logger
 from scipy.stats import mannwhitneyu, ttest_ind, chi2, f_oneway
@@ -25,13 +24,13 @@
 def txCompare(ref_id,
               kmer_data,
               random_state,
-              methods=None,
+              univariate_test:str,
+              fit_gmm:bool,
+              gmm_test:str,
               sequence_context=0,
               min_coverage=20,
               ref=None,
               sequence_context_weights="uniform",
-              anova=True,
-              logit=False,
               allow_warnings=False):
     logger.debug("TxCompare")

@@ -39,12 +38,12 @@
         raise NanocomporeError("Invalid sequence_context_weights (uniform or harmonic)")

     n_lowcov = 0
-    tests = set()
     # If we have less than 2 replicates in any condition skip anova and force logit method
     # TODO: looking at the first kmer only may not be reliable - find a better way
-    if not all([len(samples) > 1 for samples in next(iter(kmer_data.values())).values()]):
-        anova = False
-        logit = True
+    if fit_gmm and (gmm_test == "anova") and not all([len(samples) > 1 for samples in
next(iter(kmer_data.values())).values()]):
+        logger.warning("Not enough replicates for 'anova' GMM test. Switching to 'logit' test.")
+        gmm_test = "logit"

     results = {}
     for pos, pos_dict in kmer_data.items():
@@ -59,37 +58,29 @@
         res = {}
         condition_labels = tuple(pos_dict.keys())
         if len(condition_labels) != 2:
-            raise NanocomporeError("The %s method only supports two conditions" % method)
+            raise NanocomporeError("Need exactly two conditions for comparison")

         condition1_intensity = np.concatenate([ rep['intensity'] for rep in pos_dict[condition_labels[0]].values() ])
         condition2_intensity = np.concatenate([ rep['intensity'] for rep in pos_dict[condition_labels[1]].values() ])
         condition1_dwell = np.concatenate([ rep['dwell'] for rep in pos_dict[condition_labels[0]].values() ])
         condition2_dwell = np.concatenate([ rep['dwell'] for rep in pos_dict[condition_labels[1]].values() ])

-        for met in methods:
-            logger.trace(f"Running {met} test on position {pos}")
-            if met in ["MW", "KS", "ST"] :
-                try:
-                    pvalues = nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=met)
-                except:
-                    raise NanocomporeError("Error doing {} test on reference {}".format(met, ref_id))
-                res["{}_intensity_pvalue".format(met)]=pvalues[0]
-                res["{}_dwell_pvalue".format(met)]=pvalues[1]
-                tests.add("{}_intensity_pvalue".format(met))
-                tests.add("{}_dwell_pvalue".format(met))
-            elif met == "GMM":
-                try:
-                    gmm_results = gmm_test(pos_dict, anova=anova, logit=logit, allow_warnings=allow_warnings, random_state=random_state)
-                except:
-                    raise NanocomporeError("Error doing GMM test on reference {}".format(ref_id))
-                res["GMM_model"] = gmm_results['gmm']
-                if anova:
-                    res["GMM_anova_pvalue"] = gmm_results['anova']['pvalue']
-                    res["GMM_anova_model"] = gmm_results['anova']
-                    tests.add("GMM_anova_pvalue")
-                if logit:
-                    res["GMM_logit_pvalue"] = gmm_results['logit']['pvalue']
-                    res["GMM_logit_model"] = gmm_results['logit']
-                    tests.add("GMM_logit_pvalue")
+        if univariate_test:
+            logger.trace(f"Running {univariate_test} test on position {pos}")
+            try:
+                pvalues = nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=univariate_test)
+            except:
+                raise NanocomporeError(f"Error running {univariate_test} test on transcript {ref_id}")
+            res["intensity_pvalue"] = pvalues[0]
+            res["dwell_pvalue"] = pvalues[1]
+
+        if fit_gmm:
+            logger.trace(f"Fitting GMM on position {pos}")
+            try:
+                gmm_results = gmm_fit(pos_dict, test=gmm_test, allow_warnings=allow_warnings, random_state=random_state)
+            except:
+                raise NanocomporeError(f"Error running GMM test on transcript {ref_id}")
+            for key, value in gmm_results.items():
+                res["gmm_" + key] = value

         # Calculate shift statistics
         logger.trace(f"Calculating shift stats for {pos}")
@@ -97,61 +88,63 @@
         # Save results in main
         logger.trace(f"Saving test results for {pos}")
         results[pos] = res
-    logger.debug("Skipped {} positions because not present in all samples with sufficient coverage".format(n_lowcov))
-    # Combine pvalue within a given sequence context
+    logger.debug(f"Skipped {n_lowcov} positions because not present in all samples with sufficient coverage")
+
     if sequence_context > 0:
-        logger.debug ("Calculate weights and cross correlation matrices by tests")
         if sequence_context_weights == "harmonic":
             # Generate weights as a symmetrical harmonic series
             weights = harmonic_series(sequence_context)
         else:
             weights = [1] * (2 * sequence_context + 1)
-        # Collect pvalue lists per tests
- pval_list_dict = defaultdict(list) - for res_dict in results.values(): - for test in tests: - pval_list_dict[test].append(res_dict[test]) - # Compute cross correlation matrix per test - corr_matrix_dict = OrderedDict() - for test in tests: - corr_matrix_dict[test] = cross_corr_matrix(pval_list_dict[test], sequence_context) - - logger.debug("Combine adjacent position pvalues with Hou's method position by position") - # Iterate over each position in previously generated result dictionary - for mid_pos, res_dict in results.items(): - pval_list_dict = defaultdict(list) - for pos in range(mid_pos - sequence_context, mid_pos + sequence_context + 1): - # If any of the positions is missing or any of the p-values in the context is NaN, consider it 1 - if pos not in results: - for test in tests: - pval_list_dict[test].append(1) - else: - for test in tests: - if np.isnan(results[pos][test]): - pval_list_dict[test].append(1) - else: # just extract the corresponding pvalue - pval_list_dict[test].append(results[pos][test]) - # Combine collected pvalues and add to dict - for test in tests: - test_label = "{}_context_{}".format(test, sequence_context) - # If the mid p-value is NaN, also set the context p-value to NaN - if np.isnan(res_dict[test]): - res_dict[test_label] = np.nan - else: - res_dict[test_label] = combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) - - return results + if univariate_test: + combine_adjacent_pvalues(results, "intensity_pvalue", sequence_context, weights) + combine_adjacent_pvalues(results, "dwell_pvalue", sequence_context, weights) + if fit_gmm and gmm_test: + combine_adjacent_pvalues(results, "gmm_pvalue", sequence_context, weights) + + params = {} + + return (results, params) + + +def combine_adjacent_pvalues(results, pvalue_key, sequence_context, weights): + logger.debug(f"Calculating cross correlation matrix for '{pvalue_key}'") + # Collect pvalue list for test + pval_list = [] + for res_dict in results.values(): + pval_list.append(res_dict.get(pvalue_key)) + # Compute cross correlation matrix + corr_matrix = cross_corr_matrix(pval_list, sequence_context) + + logger.debug("Combine adjacent position pvalues with Hou's method position by position") + combined_label = f"{pvalue_key}_context_{sequence_context}" + # Iterate over each position in previously generated result dictionary + for mid_pos, res_dict in results.items(): + # If the mid p-value is NaN, also set the context p-value to NaN + if np.isnan(res_dict[pvalue_key]): + results[mid_pos][combined_label] = np.nan + continue + ## Otherwise collect adjacent p-values and combine them: + pval_list = [] + for pos in range(mid_pos - sequence_context, mid_pos + sequence_context + 1): + # If any of the positions is missing or any of the p-values in the context is NaN, consider it 1 + if (pos not in results) or np.isnan(results[pos][pvalue_key]): + pval_list.append(1) + else: # just extract the corresponding pvalue + pval_list.append(results[pos][pvalue_key]) + # Combine collected pvalues and add to dict + results[mid_pos][combined_label] = combine_pvalues_hou(pval_list, weights, corr_matrix) def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=None): - if method in ["mann_whitney", "MW"]: - stat_test = lambda x,y: mannwhitneyu(x, y, alternative='two-sided') - elif method in ["kolmogorov_smirnov", "KS"]: + if method == "MW": + stat_test = lambda x, y: mannwhitneyu(x, y, alternative='two-sided') + elif method == "KS": stat_test = ks_twosamp - elif 
method in ["student_t", "ST"]: - stat_test = lambda x,y: ttest_ind(x, y, equal_var=False) + elif method == "ST": + stat_test = lambda x, y: ttest_ind(x, y, equal_var=False) else: raise NanocomporeError("Invalid statistical method name (MW, KS, ST)") @@ -162,10 +155,10 @@ def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dw pval_dwell = stat_test(condition1_dwell, condition2_dwell)[1] if pval_dwell == 0: pval_dwell = np.finfo(np.float).tiny - return(pval_intensity, pval_dwell) + return (pval_intensity, pval_dwell) -def gmm_test(data, random_state, anova=True, logit=False, verbose=True, allow_warnings=False): +def gmm_fit(data, test, random_state, verbose=True, allow_warnings=False): # Condition labels condition_labels = tuple(data.keys()) # List of sample labels @@ -177,49 +170,41 @@ def gmm_test(data, random_state, anova=True, logit=False, verbose=True, allow_wa # Dictionary Sample_label:Condition_label sample_condition_labels = { sk:k for k,v in data.items() for sk in v.keys() } if len(condition_labels) != 2: - raise NanocomporeError("gmm_test only supports two conditions") + raise NanocomporeError("GMM fitting only supports two conditions") # Merge the intensities and dwell times of all samples in a single array - global_intensity = np.concatenate(([v['intensity'] for v in data[condition_labels[0]].values()]+[v['intensity'] for v in data[condition_labels[1]].values()]), axis=None) - global_dwell = np.concatenate(([v['dwell'] for v in data[condition_labels[0]].values()]+[v['dwell'] for v in data[condition_labels[1]].values()]), axis=None) + global_intensity = np.concatenate(([v['intensity'] for v in data[condition_labels[0]].values()] + + [v['intensity'] for v in data[condition_labels[1]].values()]), axis=None) + global_dwell = np.concatenate(([v['dwell'] for v in data[condition_labels[0]].values()] + + [v['dwell'] for v in data[condition_labels[1]].values()]), axis=None) global_dwell = np.log10(global_dwell) # Scale the intensity and dwell time arrays - X = StandardScaler().fit_transform([(i, d) for i,d in zip(global_intensity, global_dwell)]) + X = StandardScaler().fit_transform([(i, d) for i, d in zip(global_intensity, global_dwell)]) # Generate an array of sample labels - Y = [ k for k,v in data[condition_labels[0]].items() for _ in v['intensity'] ] + [ k for k,v in data[condition_labels[1]].items() for _ in v['intensity'] ] + Y = [k for k, v in data[condition_labels[0]].items() for _ in v['intensity']] + \ + [k for k, v in data[condition_labels[1]].items() for _ in v['intensity']] - gmm_fit = fit_best_gmm(X, max_components=2, cv_types=['full'], random_state=random_state) - gmm_mod, gmm_type, gmm_ncomponents = gmm_fit + gmm_mod, gmm_type, gmm_ncomponents = fit_best_gmm(X, max_components=2, cv_types=['full'], random_state=random_state) - # If the best GMM has 2 clusters do an anova test on the log odd ratios if gmm_ncomponents == 2: # Assign data points to the clusters y_pred = gmm_mod.predict(X) counters = dict() # Count how many reads in each cluster for each sample for lab in sample_labels: - counters[lab] = Counter(y_pred[[i==lab for i in Y]]) + counters[lab] = Counter(y_pred[[i == lab for i in Y]]) cluster_counts = count_reads_in_cluster(counters) - if anova: - aov_results = gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings) - else: - aov_results = None - - if logit: - logit_results = gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels) - else: - logit_results = None - - elif 
gmm_ncomponents == 1: - aov_results = {'pvalue': np.nan, 'delta_logit': np.nan, 'table': None, 'cluster_counts': None} - logit_results = {'pvalue': np.nan, 'coef': None, 'model': None} - cluster_counts = None + if test == "anova": + pvalue, stat, details = gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings) + elif test == "logit": + pvalue, stat, details = gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels) else: - raise NanocomporeError("GMM models with n_component > 2 are not supported") + pvalue = stat = details = cluster_counts = None - return({'anova': aov_results, 'logit': logit_results, 'gmm': {'model': gmm_mod, 'cluster_counts': cluster_counts}}) + return {"model": gmm_mod, "cluster_counts": cluster_counts, "pvalue": pvalue, "test_stat": stat, + "test_details": details} def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied', 'diag', 'full']): @@ -239,7 +224,7 @@ def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied best_gmm = gmm best_gmm_type = cv_type best_gmm_ncomponents = n_components - return((best_gmm, best_gmm_type, best_gmm_ncomponents)) + return (best_gmm, best_gmm_type, best_gmm_ncomponents) def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings=False): @@ -259,7 +244,7 @@ def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncom logr_s1 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[0]] logr_s2 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[1]] # If the SS for either array is 0, skip the anova test - if sum_of_squares(logr_s1-np.mean(logr_s1)) == 0 and sum_of_squares(logr_s2 - np.mean(logr_s2)) == 0: + if sum_of_squares(logr_s1 - np.mean(logr_s1)) == 0 and sum_of_squares(logr_s2 - np.mean(logr_s2)) == 0: if not allow_warnings: raise NanocomporeError("While doing the Anova test we found a sample with within variance = 0. Use --allow_warnings to ignore.") else: @@ -283,8 +268,8 @@ def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncom raise NanocomporeError("The Anova test returned a p-value of 0. This is most likely an error somewhere") # Calculate the delta log odds ratio, i.e. 
the difference of the means of the log odds ratios between the two conditions aov_delta_logit = float(np.mean(logr_s1) - np.mean(logr_s2)) - aov_results = {'pvalue': aov_pvalue, 'delta_logit': aov_delta_logit, 'table': aov_table, 'log_ratios':logr} - return(aov_results) + aov_details = {'table': aov_table, 'log_ratios': logr} + return (aov_pvalue, aov_delta_logit, aov_details) def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): @@ -293,26 +278,26 @@ def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): Y.extend([condition_labels[0], condition_labels[1], condition_labels[0], condition_labels[1]]) Y = pd.get_dummies(Y) Y['intercept']=1 - logit = dm.Logit(y_pred,Y[['intercept',condition_labels[1]]] ) + logit = dm.Logit(y_pred,Y[['intercept', condition_labels[1]]] ) with warnings.catch_warnings(): warnings.filterwarnings('error') try: - logit_mod=logit.fit(disp=0) + logit_mod = logit.fit(disp=0) logit_pvalue, logit_coef = logit_mod.pvalues[1], logit_mod.params[1] except ConvergenceWarning: logit_mod, logit_pvalue, logit_coef = None, 1, None if logit_pvalue == 0: logit_pvalue = np.finfo(np.float).tiny - logit_results = {'pvalue': logit_pvalue, 'coef': logit_coef, 'model': logit_mod} - return(logit_results) + logit_details = {'model': logit_mod} + return (logit_pvalue, logit_coef, logit_details) def count_reads_in_cluster(counters): cluster_counts = list() - for k,v in counters.items(): + for k, v in counters.items(): cluster_counts.append("%s:%s/%s" % (k, v[0], v[1])) cluster_counts="__".join(cluster_counts) - return(cluster_counts) + return cluster_counts def shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell): @@ -331,7 +316,7 @@ def shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, co ('c1_sd_dwell', np.std(condition1_dwell)), ('c2_sd_dwell', np.std(condition2_dwell)) ]) - return(shift_stats) + return shift_stats def cross_corr_matrix(pvalues_vector, context=2): @@ -339,24 +324,24 @@ def cross_corr_matrix(pvalues_vector, context=2): pvalues for a given context. 
""" if len(pvalues_vector) < (context * 3) + 3: - raise NanocomporeError("Not enough p-values for a context of order %s"%context) + raise NanocomporeError("Not enough p-values for a context of order %s" % context) - pvalues_vector = np.array([ i if not np.isnan(i) else 1 for i in pvalues_vector ]) + pvalues_vector = np.array([i if not np.isnan(i) else 1 for i in pvalues_vector]) if any(pvalues_vector == 0) or any(np.isinf(pvalues_vector)) or any(pvalues_vector > 1): raise NanocomporeError("At least one p-value is invalid") matrix = [] s = pvalues_vector.size if all(p == 1 for p in pvalues_vector): - return(np.ones((context * 2 + 1, context * 2 + 1))) + return np.ones((context * 2 + 1, context * 2 + 1)) for i in range(-context, context + 1): row = [] - for j in range(-context , context + 1): + for j in range(-context, context + 1): row.append(np.corrcoef((np.roll(pvalues_vector, i)[context:s - context]), (np.roll(pvalues_vector, j)[context:s - context]))[0][1]) matrix.append(row) - return(np.array(matrix)) + return np.array(matrix) def combine_pvalues_hou(pvalues, weights, cor_mat): From a7e92695cddc1d96ed9f8d50ddca40e02d27beab Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 12:07:12 +0100 Subject: [PATCH 29/49] DataStore: improve definition of SQL tables --- nanocompore/DataStore.py | 201 ++++++++++++++++++--------------------- 1 file changed, 95 insertions(+), 106 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 5dae718..1e8cd32 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -22,7 +22,7 @@ class DBCreateMode(Enum): class DataStore(object): """Store Nanocompore data in an SQLite database - base class""" - create_tables_queries = {} # table name -> SQL query (to be filled by derived classes) + table_defs = {} # table name -> column definitions (to be filled by derived classes) def __init__(self, db_path:str, @@ -33,10 +33,13 @@ def __init__(self, self._cursor = None def _init_db(self): - if self.create_tables_queries: + if self.table_defs: logger.debug("Setting up database tables") try: - for table, query in self.create_tables_queries.items(): + for table, column_defs in self.table_defs.items(): + if type(column_defs) is not str: # list/tuple expected + column_defs = ", ".join(column_defs) + query = f"CREATE TABLE IF NOT EXISTS {table} ({column_defs})" self._cursor.execute(query) self._connection.commit() except: @@ -82,62 +85,54 @@ def cursor(self): class DataStore_EventAlign(DataStore): """Store Nanocompore data in an SQLite database - subclass for Eventalign_collapse results""" - create_reads_query = ("CREATE TABLE IF NOT EXISTS reads (" - "id INTEGER NOT NULL PRIMARY KEY," - "name VARCHAR NOT NULL UNIQUE," - "sampleid INTEGER NOT NULL," - "transcriptid VARCHAR NOT NULL," - "refstart INT NOT NULL," - "refend INT NOT NULL," - "numevents INT NOT NULL," - "numsignals INT NOT NULL," - "dwelltime REAL NOT NULL," - "kmers INT NOT NULL," - "missing_kmers INT NOT NULL," - "NNNNN_kmers INT NOT NULL," - "mismatch_kmers INT NOT NULL," - "valid_kmers INT NOT NULL," - "FOREIGN KEY(sampleid) REFERENCES samples(id)" - "FOREIGN KEY(transcriptid) REFERENCES transcripts(id)" - ")" - ) - - create_kmers_query = ("CREATE TABLE IF NOT EXISTS kmers (" - "id INTEGER NOT NULL PRIMARY KEY," - "readid INTEGER NOT NULL," - "position INTEGER NOT NULL," - "sequence INTEGER NOT NULL," - "num_events INTEGER NOT NULL," - "num_signals INTEGER NOT NULL," - "status VARCHAR NOT NULL," - "dwell_time REAL NOT NULL," - "NNNNN_dwell_time REAL NOT 
NULL," - "mismatch_dwell_time REAL NOT NULL," - "median REAL NOT NULL," - "mad REAL NOT NULL," - "FOREIGN KEY(readid) REFERENCES reads(id)" - ")" - ) + # "reads" table: + table_def_reads = ["id INTEGER NOT NULL PRIMARY KEY", + "name VARCHAR NOT NULL UNIQUE", + "sampleid INTEGER NOT NULL", + "transcriptid VARCHAR NOT NULL", + "refstart INT NOT NULL", + "refend INT NOT NULL", + "numevents INT NOT NULL", + "numsignals INT NOT NULL", + "dwelltime REAL NOT NULL", + "kmers INT NOT NULL", + "missing_kmers INT NOT NULL", + "NNNNN_kmers INT NOT NULL", + "mismatch_kmers INT NOT NULL", + "valid_kmers INT NOT NULL", + "FOREIGN KEY(sampleid) REFERENCES samples(id)", + "FOREIGN KEY(transcriptid) REFERENCES transcripts(id)"] + + # "kmers" table: + table_def_kmers = ["id INTEGER NOT NULL PRIMARY KEY", + "readid INTEGER NOT NULL", + "position INTEGER NOT NULL", + "sequence INTEGER NOT NULL", + "num_events INTEGER NOT NULL", + "num_signals INTEGER NOT NULL", + "status VARCHAR NOT NULL", + "dwell_time REAL NOT NULL", + "NNNNN_dwell_time REAL NOT NULL", + "mismatch_dwell_time REAL NOT NULL", + "median REAL NOT NULL", + "mad REAL NOT NULL", + "FOREIGN KEY(readid) REFERENCES reads(id)"] # TODO: 'sequence' is stored redundantly - move it to a separate table # TODO: encode 'status' as int to save space (foreign key referencing a table with all possible statuses) - create_samples_query = ("CREATE TABLE IF NOT EXISTS samples (" - "id INTEGER NOT NULL PRIMARY KEY," - "name VARCHAR NOT NULL UNIQUE," - "condition VARCHAR" - ")" - ) + # "samples" table: + table_def_samples = ["id INTEGER NOT NULL PRIMARY KEY", + "name VARCHAR NOT NULL UNIQUE", + "condition VARCHAR"] - create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts (" - "id INTEGER NOT NULL PRIMARY KEY," - "name VARCHAR NOT NULL UNIQUE" - ")" - ) + # "transcripts" table: + table_def_transcripts = ["id INTEGER NOT NULL PRIMARY KEY", + "name VARCHAR NOT NULL UNIQUE"] - create_tables_queries = {"reads": create_reads_query, - "kmers": create_kmers_query, - "samples": create_samples_query, - "transcripts": create_transcripts_query} + table_defs = {"reads": table_def_reads, + "kmers": table_def_kmers, + "samples": table_def_samples, + "transcripts": table_def_transcripts} def store_read(self, read): """ @@ -285,63 +280,57 @@ def store_sample_info(self, sample_dict): class DataStore_SampComp(DataStore): """Store Nanocompore data in an SQLite database - subclass for SampComp results""" + # "parameters" table: + table_def_parameters = ["univariate_test VARCHAR CHECK (univariate_test in ('ST', 'MW', 'KS'))", + "gmm_covariance_type VARCHAR", + "gmm_test VARCHAR CHECK (gmm_test in ('anova', 'logit'))"] # TODO: add more parameters - create_parameters_query = ("CREATA TABLE IF NOT EXISTS parameters (" - "univariate_test VARCHAR CHECK (univariate_test in ('ST', 'MW', 'KS'))," - "gmm_covariance_type VARCHAR," - "gmm_test VARCHAR CHECK (gmm_test in ('anova', 'logit'))" - ")") - - create_transcripts_query = ("CREATE TABLE IF NOT EXISTS transcripts (" - "id INTEGER NOT NULL PRIMARY KEY," - "name VARCHAR NOT NULL UNIQUE" - ")") - - create_whitelist_query = ("CREATE TABLE IF NOT EXISTS whitelist (" - "transcriptid INTEGER NOT NULL," - "readid INTEGER NOT NULL UNIQUE," - "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)" - # "readid" is foreign key for "reads" table in EventAlign DB - ")") + + # "transcripts" table: + table_def_transcripts = ["id INTEGER NOT NULL PRIMARY KEY", + "name VARCHAR NOT NULL UNIQUE"] + + # "whitelist" table: + table_def_whitelist = 
["transcriptid INTEGER NOT NULL", + "readid INTEGER NOT NULL UNIQUE", # foreign key for "reads" table in EventAlign DB + "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)"] # TODO: add columns for adjusted p-values in tables below? - create_kmer_stats_query = ("CREATE TABLE IF NOT EXISTS kmer_stats (" - "id INTEGER NOT NULL PRIMARY KEY," - "transcriptid INTEGER NOT NULL," - "kmer INTEGER NOT NULL," - "c1_mean_intensity REAL," - "c2_mean_intensity REAL," - "c1_median_intensity REAL," - "c2_median_intensity REAL," - "c1_sd_intensity REAL," - "c2_sd_intensity REAL," - "c1_mean_dwell REAL," - "c2_mean_dwell REAL," - "c1_median_dwell REAL," - "c2_median_dwell REAL," - "c1_sd_dwell REAL," - "c2_sd_dwell REAL," - "intensity_pvalue REAL," - "dwell_pvalue REAL," - "UNIQUE (transcriptid, kmer)," - "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)" - ")") + # "kmer_stats" table: + table_def_kmer_stats = ["id INTEGER NOT NULL PRIMARY KEY", + "transcriptid INTEGER NOT NULL", + "kmer INTEGER NOT NULL", + "c1_mean_intensity REAL", + "c2_mean_intensity REAL", + "c1_median_intensity REAL", + "c2_median_intensity REAL", + "c1_sd_intensity REAL", + "c2_sd_intensity REAL", + "c1_mean_dwell REAL", + "c2_mean_dwell REAL", + "c1_median_dwell REAL", + "c2_median_dwell REAL", + "c1_sd_dwell REAL", + "c2_sd_dwell REAL", + "intensity_pvalue REAL", + "dwell_pvalue REAL", + "UNIQUE (transcriptid, kmer)", + "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)"] # TODO: are "c1" and "c2" (conditions) properly defined? - create_gmm_stats_query = ("CREATE TABLE IF NOT EXISTS gmm_stats (" - "kmer_statsid INTEGER NOT NULL UNIQUE," - "n_components INTEGER NOT NULL," - "cluster_counts VARCHAR," - "test_stat REAL," - "test_pvalue REAL," - "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)" - ")") - - create_tables_queries = {"parameters": create_parameters_query, - "transcripts": create_transcripts_query, - "whitelist": create_whitelist_query, - "kmer_stats": create_kmer_stats_query, - "gmm_stats": create_gmm_stats_query} + # "gmm_stats" table: + table_def_gmm_stats = ["kmer_statsid INTEGER NOT NULL UNIQUE", + "n_components INTEGER NOT NULL", + "cluster_counts VARCHAR", + "test_stat REAL", + "test_pvalue REAL", + "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)"] + + table_defs = {"parameters": table_def_parameters, + "transcripts": table_def_transcripts, + "whitelist": table_def_whitelist, + "kmer_stats": table_def_kmer_stats, + "gmm_stats": table_def_gmm_stats} def __insert_transcript_get_id(self, tx_name): try: From 5b35d301174a8a28513efc02684a1984d79144ea Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 14:01:33 +0100 Subject: [PATCH 30/49] TxComp: cosmetic changes --- nanocompore/TxComp.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index ae2b8f0..d740382 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -30,11 +30,11 @@ def txCompare(ref_id, sequence_context=0, min_coverage=20, ref=None, - sequence_context_weights="uniform", + sequence_context_weights="uniform", # or: "harmonic" allow_warnings=False): logger.debug("TxCompare") - if sequence_context_weights != "uniform" and sequence_context_weights != "harmonic": + if sequence_context_weights not in ["uniform", "harmonic"]: raise NanocomporeError("Invalid sequence_context_weights (uniform or harmonic)") n_lowcov = 0 @@ -104,9 +104,7 @@ def txCompare(ref_id, if fit_gmm and gmm_test: combine_adjacent_pvalues(results, "gmm_pvalue", 
sequence_context, weights) - params = {} - - return (results, params) + return results def combine_adjacent_pvalues(results, pvalue_key, sequence_context, weights): From 833cc2ef6947f5da202d4610ba477705f165e547 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 14:04:19 +0100 Subject: [PATCH 31/49] DataStore/SampComp: add DB columns for adj. p-values, adapt DB schema based on processing options --- nanocompore/DataStore.py | 49 ++++++++++++++++++++++++++++++---------- nanocompore/SampComp.py | 11 ++++++--- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 1e8cd32..2a91f5a 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -295,7 +295,6 @@ class DataStore_SampComp(DataStore): "readid INTEGER NOT NULL UNIQUE", # foreign key for "reads" table in EventAlign DB "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)"] - # TODO: add columns for adjusted p-values in tables below? # "kmer_stats" table: table_def_kmer_stats = ["id INTEGER NOT NULL PRIMARY KEY", "transcriptid INTEGER NOT NULL", @@ -314,6 +313,8 @@ class DataStore_SampComp(DataStore): "c2_sd_dwell REAL", "intensity_pvalue REAL", "dwell_pvalue REAL", + "adj_intensity_pvalue REAL", + "adj_dwell_pvalue REAL", "UNIQUE (transcriptid, kmer)", "FOREIGN KEY (transcriptid) REFERENCES transcripts(id)"] # TODO: are "c1" and "c2" (conditions) properly defined? @@ -324,13 +325,34 @@ class DataStore_SampComp(DataStore): "cluster_counts VARCHAR", "test_stat REAL", "test_pvalue REAL", + "adj_test_pvalue REAL", "FOREIGN KEY (kmer_statsid) REFERENCES kmer_stats(id)"] table_defs = {"parameters": table_def_parameters, "transcripts": table_def_transcripts, "whitelist": table_def_whitelist, - "kmer_stats": table_def_kmer_stats, - "gmm_stats": table_def_gmm_stats} + "kmer_stats": table_def_kmer_stats} + # table "gmm_stats" is only added when needed (see "__init__") + + def __init__(self, + db_path:str, + create_mode=DBCreateMode.MUST_EXIST, + with_gmm=True, + with_sequence_context=False): + super().__init__(db_path, create_mode) + self.__with_gmm = with_gmm + self.__with_sequence_context = with_sequence_context + if with_gmm: + table_defs["gmm_stats"] = table_def_gmm_stats + if with_sequence_context: # add additional columns for context p-values + table_defs["kmer_stats"] += ["intensity_pvalue_context REAL", + "dwell_pvalue_context REAL", + "adj_intensity_pvalue_context REAL", + "adj_dwell_pvalue_context REAL"] + if with_gmm: + table_defs["gmm_stats"] += ["test_pvalue_context REAL", + "adj_test_pvalue_context REAL"] + def __insert_transcript_get_id(self, tx_name): try: @@ -354,22 +376,25 @@ def store_test_results(self, tx_name, test_results): for kmer, res in test_results.items(): values = [tx_id, kmer] values += res["shift_stats"].values() - values.append(res.get("intensity_pvalue")) - values.append(res.get("dwell_pvalue")) + # insert 'None' (NULL) into adj. p-value columns: + values += [res.get("intensity_pvalue"), res.get("dwell_pvalue"), None, None] + if self.__with_sequence_context: + values += [res.get("intensity_pvalue_context"), res.get("dwell_pvalue_context"), None, None] try: self._cursor.execute("INSERT INTO kmer_stats VALUES (NULL" + ", ?" * len(values) + ")", values) except: logger.error(f"Error storing statistics for transcript '{tx_name}', kmer {kmer}") raise kmer_statsid = self._cursor.lastrowid - if "gmm_model" in res: + if self.__with_gmm: + # insert 'None' (NULL) into adj. 
p-value columns: + values = [kmer_statsid, res["gmm_model"].n_components, res["gmm_cluster_counts"], + res["gmm_test_stat"], res["gmm_pvalue"], None] + if self.__with_sequence_context: + values += [res["gmm_pvalue_context"], None] + qmarks = ", ".join(["?"] * len(values)) try: - self._cursor.execute("INSERT INTO gmm_stats VALUES (?, ?, ?, ?)", - (kmer_statsid, - res["gmm_model"].n_components, - res["gmm_cluster_counts"], - res["gmm_test_stat"], - res["gmm_pvalue"])) + self._cursor.execute(f"INSERT INTO gmm_stats VALUES ({qmarks})", values) except: logger.error(f"Error storing GMM stats for transcript '{tx_name}', kmer {kmer}") raise diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py index e67d6f4..a6854d9 100644 --- a/nanocompore/SampComp.py +++ b/nanocompore/SampComp.py @@ -28,7 +28,7 @@ os.environ["MKL_THREADING_LAYER"] = "sequential" os.environ["NUMEXPR_NUM_THREADS"] = "1" os.environ["OMP_NUM_THREADS"] = "1" -os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ["OPENBLAS_NUM_THREADS"] = "1" #~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~# class SampComp(object): @@ -142,7 +142,9 @@ def __init__(self, self.__output_db_path = output_db_path db_create_mode = DBCreateMode.OVERWRITE if overwrite else DBCreateMode.CREATE_MAYBE - with DataStore_SampComp(self.__output_db_path, db_create_mode) as db: + db = DataStore_SampComp(output_db_path, db_create_mode, with_gmm=fit_gmm, + with_sequence_context=(sequence_context > 0)) + with db: db.store_whitelist(whitelist) # TODO: move this to '__call__'? @@ -337,7 +339,10 @@ def __write_output_to_db(self, out_q, error_q): n_tx = 0 try: # Database was already created earlier to store the whitelist! - with DataStore_SampComp(self.__output_db_path, DBCreateMode.MUST_EXIST) as db: + db = DataStore_SampComp(self.__output_db_path, DBCreateMode.MUST_EXIST, + with_gmm=self.__fit_gmm, + with_sequence_context=(self.__sequence_context > 0)) + with as db: # Iterate over the counter queue and process items until all poison pills are found for _ in range(self.__nthreads): for ref_id, kmer_data, test_results in iter(out_q.get, None): From 2a35003453c5d8d97a30678e0ea2711b983d63e5 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 16:30:12 +0100 Subject: [PATCH 32/49] TxComp: combine collection of functions into class 'TxComp' --- nanocompore/TxComp.py | 793 ++++++++++++++++++++++-------------------- 1 file changed, 408 insertions(+), 385 deletions(-) diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index d740382..230cd32 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -20,394 +20,417 @@ from nanocompore.common import * -# TODO: wrap this in a class -def txCompare(ref_id, - kmer_data, - random_state, - univariate_test:str, - fit_gmm:bool, - gmm_test:str, - sequence_context=0, - min_coverage=20, - ref=None, - sequence_context_weights="uniform", # or: "harmonic" - allow_warnings=False): - logger.debug("TxCompare") - - if sequence_context_weights not in ["uniform", "harmonic"]: - raise NanocomporeError("Invalid sequence_context_weights (uniform or harmonic)") - - n_lowcov = 0 - # If we have less than 2 replicates in any condition skip anova and force logit method - # TODO: looking at the first kmer only may not be reliable - find a better way - if fit_gmm and (gmm_test == "anova") and not all([len(samples) > 1 for samples in - next(iter(kmer_data.values())).values()]): - logger.warning("Not enough replicates for 'anova' GMM test. 
Switching to 'logit' test.") - gmm_test = "logit" - - results = {} - for pos, pos_dict in kmer_data.items(): - logger.trace(f"Processing position {pos}") - # Filter out low coverage positions - if has_low_coverage(pos_dict, min_coverage): - logger.trace(f"Position {pos} has low coverage, skipping") - n_lowcov += 1 - continue - - # Perform stat tests - res = {} - condition_labels = tuple(pos_dict.keys()) - if len(condition_labels) != 2: - raise NanocomporeError("Need exactly two conditions for comparison") - condition1_intensity = np.concatenate([ rep['intensity'] for rep in pos_dict[condition_labels[0]].values() ]) - condition2_intensity = np.concatenate([ rep['intensity'] for rep in pos_dict[condition_labels[1]].values() ]) - condition1_dwell = np.concatenate([ rep['dwell'] for rep in pos_dict[condition_labels[0]].values() ]) - condition2_dwell = np.concatenate([ rep['dwell'] for rep in pos_dict[condition_labels[1]].values() ]) - - if univariate_test: - logger.trace(f"Running {univariate_test} test on position {pos}") - try: - pvalues = nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=univariate_test) - except: - raise NanocomporeError(f"Error running {univariate_test} test on transcript {ref_id}") - res["intensity_pvalue"] = pvalues[0] - res["dwell_pvalue"] = pvalues[1] - - if fit_gmm: - logger.trace(f"Fitting GMM on position {pos}") - try: - gmm_results = gmm_fit(pos_dict, test=gmm_test, allow_warnings=allow_warnings, random_state=random_state) - except: - raise NanocomporeError(f"Error running GMM test on transcript {ref_id}") - for key, value in gmm_results: - res["gmm_" + key] = value - - # Calculate shift statistics - logger.trace(f"Calculatign shift stats for {pos}") - res["shift_stats"] = shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell) - # Save results in main - logger.trace(f"Saving test results for {pos}") - results[pos] = res - - logger.debug(f"Skipped {n_lowcov} positions because not present in all samples with sufficient coverage") - - if sequence_context > 0: - if sequence_context_weights == "harmonic": - # Generate weights as a symmetrical harmonic series - weights = harmonic_series(sequence_context) +class TxComp(object): + """Compare transcript data from two samples using statistical methods""" + + def __init__(self, + random_state, + univariate_test:str, + fit_gmm:bool, + gmm_test:str, + sequence_context:int=0, + sequence_context_weighting:str="uniform", # or: "harmonic" + min_coverage:int=20, + allow_anova_warnings:bool=False): + self.__random_state = random_state + self.__univariate_test = univariate_test + self.__fit_gmm = fit_gmm + self.__gmm_test = gmm_test + self.__min_coverage = min_coverage + self.__sequence_context = sequence_context + if sequence_context > 0: + if sequence_context_weighting == "harmonic": + # Generate weights as a symmetrical harmonic series + self.__sequence_context_weights = harmonic_series(self.__sequence_context) + elif sequence_context_weighting == "uniform": + self.__sequence_context_weights = [1] * (2 * self.__sequence_context + 1) + else: + raise NanocomporeError("Invalid sequence context weighting ('uniform' or 'harmonic')") + self.__allow_anova_warnings = allow_anova_warnings + self.gmm_anova_failed = False + + + def __call__(self, ref_id, kmer_data): + """Perform comparisons for one transcript ('ref_id') given k-mer data""" + logger.debug("TxComp()") + + n_lowcov = 0 + # If we have less than 2 replicates in any condition skip anova 
and force logit method
+        # TODO: looking at the first kmer only may not be reliable - find a better way
+        if self.__fit_gmm and (self.__gmm_test == "anova") and \
+           not all([len(samples) > 1 for samples in next(iter(kmer_data.values())).values()]):
+            logger.warning("Not enough replicates for 'anova' GMM test. Switching to 'logit' test.")
+            self.__gmm_test = "logit"
+            self.gmm_anova_failed = True
+        else:
+            self.gmm_anova_failed = False
+
+        results = {}
+        for pos, pos_dict in kmer_data.items():
+            logger.trace(f"Processing position {pos}")
+            # Filter out low coverage positions
+            if self.__has_low_coverage(pos_dict):
+                logger.trace(f"Position {pos} has low coverage, skipping")
+                n_lowcov += 1
+                continue
+
+            # Perform stat tests
+            res = {}
+            condition_labels = tuple(pos_dict.keys())
+            if len(condition_labels) != 2:
+                raise NanocomporeError("Need exactly two conditions for comparison")
+            condition1_intensity = np.concatenate([rep['intensity'] for rep in pos_dict[condition_labels[0]].values()])
+            condition2_intensity = np.concatenate([rep['intensity'] for rep in pos_dict[condition_labels[1]].values()])
+            condition1_dwell = np.concatenate([rep['dwell'] for rep in pos_dict[condition_labels[0]].values()])
+            condition2_dwell = np.concatenate([rep['dwell'] for rep in pos_dict[condition_labels[1]].values()])
+
+            if self.__univariate_test:
+                logger.trace(f"Running {self.__univariate_test} test on position {pos}")
+                try:
+                    pvalues = self.__nonparametric_test(condition1_intensity, condition2_intensity,
+                                                        condition1_dwell, condition2_dwell)
+                except:
+                    raise NanocomporeError(f"Error running {self.__univariate_test} test on transcript {ref_id}")
+                res["intensity_pvalue"] = pvalues[0]
+                res["dwell_pvalue"] = pvalues[1]
+
+            if self.__fit_gmm:
+                logger.trace(f"Fitting GMM on position {pos}")
+                try:
+                    gmm_results = self.__gmm_fit(pos_dict)
+                except:
+                    raise NanocomporeError(f"Error running GMM test on transcript {ref_id}")
+                for key, value in gmm_results.items():
+                    res["gmm_" + key] = value
+
+            # Calculate shift statistics
+            logger.trace(f"Calculating shift stats for {pos}")
+            res["shift_stats"] = self.__shift_stats(condition1_intensity, condition2_intensity,
+                                                    condition1_dwell, condition2_dwell)
+            # Save results in main dict
+            logger.trace(f"Saving test results for {pos}")
+            results[pos] = res
+
+        
logger.debug(f"Skipped {n_lowcov} positions because not present in all samples with sufficient coverage") + + if self.__sequence_context > 0: + if self.__univariate_test: + self.__combine_adjacent_pvalues(results, "intensity_pvalue") + self.__combine_adjacent_pvalues(results, "dwell_pvalue") + if self.__fit_gmm and self.__gmm_test: + self.__combine_adjacent_pvalues(results, "gmm_pvalue") + + return results + + + def __combine_adjacent_pvalues(self, results, pvalue_key): + logger.debug(f"Calculating cross correlation matrix for '{pvalue_key}'") + # Collect pvalue list for test pval_list = [] - for pos in range(mid_pos - sequence_context, mid_pos + sequence_context + 1): - # If any of the positions is missing or any of the p-values in the context is NaN, consider it 1 - if (pos not in results) or np.isnan(results[pos][pvalue_key]): - pval_list.append(1) - else: # just extract the corresponding pvalue - pval_list.append(results[pos][pvalue_key]) - # Combine collected pvalues and add to dict - results[mid_pos][combined_label] = combine_pvalues_hou(pval_list, weights, corr_matrix) - - -def nonparametric_test(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell, method=None): - if method == "MW": - stat_test = lambda x, y: mannwhitneyu(x, y, alternative='two-sided') - elif method == "KS": - stat_test = ks_twosamp - elif method == "ST": - stat_test = lambda x, y: ttest_ind(x, y, equal_var=False) - else: - raise NanocomporeError("Invalid statistical method name (MW, KS, ST)") - - pval_intensity = stat_test(condition1_intensity, condition2_intensity)[1] - if pval_intensity == 0: - pval_intensity = np.finfo(np.float).tiny - - pval_dwell = stat_test(condition1_dwell, condition2_dwell)[1] - if pval_dwell == 0: - pval_dwell = np.finfo(np.float).tiny - return (pval_intensity, pval_dwell) - - -def gmm_fit(data, test, random_state, verbose=True, allow_warnings=False): - # Condition labels - condition_labels = tuple(data.keys()) - # List of sample labels - sample_labels = list(data[condition_labels[0]].keys()) + list(data[condition_labels[1]].keys()) - - if len(sample_labels) != len(set(sample_labels)): - raise NanocomporeError("Sample labels have to be unique and it looks like some are not.") - - # Dictionary Sample_label:Condition_label - sample_condition_labels = { sk:k for k,v in data.items() for sk in v.keys() } - if len(condition_labels) != 2: - raise NanocomporeError("GMM fitting only supports two conditions") - - # Merge the intensities and dwell times of all samples in a single array - global_intensity = np.concatenate(([v['intensity'] for v in data[condition_labels[0]].values()] + - [v['intensity'] for v in data[condition_labels[1]].values()]), axis=None) - global_dwell = np.concatenate(([v['dwell'] for v in data[condition_labels[0]].values()] + - [v['dwell'] for v in data[condition_labels[1]].values()]), axis=None) - global_dwell = np.log10(global_dwell) - - # Scale the intensity and dwell time arrays - X = StandardScaler().fit_transform([(i, d) for i, d in zip(global_intensity, global_dwell)]) - - # Generate an array of sample labels - Y = [k for k, v in data[condition_labels[0]].items() for _ in v['intensity']] + \ - [k for k, v in data[condition_labels[1]].items() for _ in v['intensity']] - - gmm_mod, gmm_type, gmm_ncomponents = fit_best_gmm(X, max_components=2, cv_types=['full'], random_state=random_state) - - if gmm_ncomponents == 2: - # Assign data points to the clusters - y_pred = gmm_mod.predict(X) - counters = dict() - # Count how many reads in each cluster for 
each sample - for lab in sample_labels: - counters[lab] = Counter(y_pred[[i == lab for i in Y]]) - cluster_counts = count_reads_in_cluster(counters) - if test == "anova": - pvalue, stat, details = gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings) - elif test == "logit": - pvalue, stat, details = gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels) - else: - pvalue = stat = details = cluster_counts = None - - return {"model": gmm_mod, "cluster_counts": cluster_counts, "pvalue": pvalue, "test_stat": stat, - "test_details": details} - - -def fit_best_gmm(X, random_state, max_components=2, cv_types=['spherical', 'tied', 'diag', 'full']): - # Loop over multiple cv_types and n_components and for each fit a GMM - # calculate the BIC and retain the lowest - lowest_bic = np.infty - bic = [] - n_components_range = range(1, max_components+1) - for cv_type in cv_types: - for n_components in n_components_range: - # Fit a Gaussian mixture with EM - gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type, random_state=random_state) - gmm.fit(X) - bic.append(gmm.bic(X)) - if bic[-1] < lowest_bic: - lowest_bic = bic[-1] - best_gmm = gmm - best_gmm_type = cv_type - best_gmm_ncomponents = n_components - return (best_gmm, best_gmm_type, best_gmm_ncomponents) - - -def gmm_anova_test(counters, sample_condition_labels, condition_labels, gmm_ncomponents, allow_warnings=False): - labels = [] - logr = [] - for sample, counter in counters.items(): - # Save the condition label the corresponds to the current sample - labels.append(sample_condition_labels[sample]) - # The Counter dictionaries in counters are not ordered - # The following line enforces the order and adds 1 to avoid empty clusters - ordered_counter = [counter[i] + 1 for i in range(gmm_ncomponents)] - total = sum(ordered_counter) - normalised_ordered_counter = [i / total for i in ordered_counter] - # Loop through ordered_counter and divide each value by the first - logr.append(np.log(normalised_ordered_counter[0] / (1 - normalised_ordered_counter[0]))) - logr = np.around(np.array(logr), decimals=9) - logr_s1 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[0]] - logr_s2 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[1]] - # If the SS for either array is 0, skip the anova test - if sum_of_squares(logr_s1 - np.mean(logr_s1)) == 0 and sum_of_squares(logr_s2 - np.mean(logr_s2)) == 0: - if not allow_warnings: - raise NanocomporeError("While doing the Anova test we found a sample with within variance = 0. 
Use --allow_warnings to ignore.") + for res_dict in results.values(): + pval_list.append(res_dict.get(pvalue_key)) + # Compute cross correlation matrix + corr_matrix = cross_corr_matrix(pval_list) + + logger.debug("Combine adjacent position pvalues with Hou's method position by position") + combined_label = f"{pvalue_key}_context_{sequence_context}" + # Iterate over each position in previously generated result dictionary + for mid_pos, res_dict in results.items(): + # If the mid p-value is NaN, also set the context p-value to NaN + if np.isnan(res_dict[pvalue_key]): + results[mid_pos][combined_label] = np.nan + continue + ## Otherwise collect adjacent p-values and combine them: + pval_list = [] + for pos in range(mid_pos - self.__sequence_context, mid_pos + self.__sequence_context + 1): + # If any of the positions is missing or any of the p-values in the context is NaN, consider it 1 + if (pos not in results) or np.isnan(results[pos][pvalue_key]): + pval_list.append(1) + else: # just extract the corresponding pvalue + pval_list.append(results[pos][pvalue_key]) + # Combine collected pvalues and add to dict + results[mid_pos][combined_label] = self.__combine_pvalues_hou(pval_list, corr_matrix) + + + def __nonparametric_test(self, condition1_intensity, condition2_intensity, + condition1_dwell, condition2_dwell): + if self.__univariate_test == "MW": + stat_test = lambda x, y: mannwhitneyu(x, y, alternative='two-sided') + elif self.__univariate_test == "KS": + stat_test = ks_twosamp + elif self.__univariate_test == "ST": + stat_test = lambda x, y: ttest_ind(x, y, equal_var=False) + else: + raise NanocomporeError("Invalid univariate test name (MW, KS, ST)") + + pval_intensity = stat_test(condition1_intensity, condition2_intensity)[1] + if pval_intensity == 0: + pval_intensity = np.finfo(np.float).tiny + + pval_dwell = stat_test(condition1_dwell, condition2_dwell)[1] + if pval_dwell == 0: + pval_dwell = np.finfo(np.float).tiny + return (pval_intensity, pval_dwell) + + + def __gmm_fit(self, data): + # Condition labels + condition_labels = tuple(data.keys()) + # List of sample labels + sample_labels = list(data[condition_labels[0]].keys()) + list(data[condition_labels[1]].keys()) + + if len(sample_labels) != len(set(sample_labels)): + raise NanocomporeError("Sample labels have to be unique and it looks like some are not.") + + # Dictionary Sample_label:Condition_label + sample_condition_labels = {sk:k for k,v in data.items() for sk in v.keys()} + if len(condition_labels) != 2: + raise NanocomporeError("GMM fitting only supports two conditions") + + # Merge the intensities and dwell times of all samples in a single array + global_intensity = np.concatenate(([v['intensity'] for v in data[condition_labels[0]].values()] + + [v['intensity'] for v in data[condition_labels[1]].values()]), axis=None) + global_dwell = np.concatenate(([v['dwell'] for v in data[condition_labels[0]].values()] + + [v['dwell'] for v in data[condition_labels[1]].values()]), axis=None) + global_dwell = np.log10(global_dwell) + + # Scale the intensity and dwell time arrays + X = StandardScaler().fit_transform([(i, d) for i, d in zip(global_intensity, global_dwell)]) + + # Generate an array of sample labels + Y = [k for k, v in data[condition_labels[0]].items() for _ in v['intensity']] + \ + [k for k, v in data[condition_labels[1]].items() for _ in v['intensity']] + + gmm_mod, gmm_type, gmm_ncomponents = self.__fit_best_gmm(X, max_components=2, cv_types=['full']) + + if gmm_ncomponents == 2: + # Assign data points to the clusters + 
y_pred = gmm_mod.predict(X)
+            counters = dict()
+            # Count how many reads in each cluster for each sample
+            for lab in sample_labels:
+                counters[lab] = Counter(y_pred[[i == lab for i in Y]])
+            cluster_counts = self.__count_reads_in_cluster(counters)
+            if self.__gmm_test == "anova":
+                pvalue, stat, details = self.__gmm_anova_test(counters, sample_condition_labels,
+                                                              condition_labels, gmm_ncomponents)
+            elif self.__gmm_test == "logit":
+                pvalue, stat, details = self.__gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels)
+        else:
+            pvalue = stat = details = cluster_counts = None
+
+        return {"model": gmm_mod, "cluster_counts": cluster_counts, "pvalue": pvalue, "test_stat": stat,
+                "test_details": details}
+
+
+    def __fit_best_gmm(self, X, max_components=2, cv_types=['spherical', 'tied', 'diag', 'full']):
+        # Loop over multiple cv_types and n_components and for each fit a GMM;
+        # calculate the BIC and retain the model with the lowest value
+        lowest_bic = np.infty
+        bic = []
+        n_components_range = range(1, max_components + 1)
+        for cv_type in cv_types:
+            for n_components in n_components_range:
+                # Fit a Gaussian mixture with EM
+                gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type,
+                                      random_state=self.__random_state)
+                gmm.fit(X)
+                bic.append(gmm.bic(X))
+                if bic[-1] < lowest_bic:
+                    lowest_bic = bic[-1]
+                    best_gmm = gmm
+                    best_gmm_type = cv_type
+                    best_gmm_ncomponents = n_components
+        return (best_gmm, best_gmm_type, best_gmm_ncomponents)
+
+
+    def __gmm_anova_test(self, counters, sample_condition_labels, condition_labels, gmm_ncomponents):
+        labels = []
+        logr = []
+        for sample, counter in counters.items():
+            # Save the condition label that corresponds to the current sample
+            labels.append(sample_condition_labels[sample])
+            # The Counter dictionaries in counters are not ordered
+            # The following line enforces the order and adds 1 to avoid empty clusters
+            ordered_counter = [counter[i] + 1 for i in range(gmm_ncomponents)]
+            total = sum(ordered_counter)
+            normalised_ordered_counter = [i / total for i in ordered_counter]
+            # Compute the log odds of the first cluster for this sample
+            logr.append(np.log(normalised_ordered_counter[0] / (1 - normalised_ordered_counter[0])))
+        logr = np.around(np.array(logr), decimals=9)
+        logr_s1 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[0]]
+        logr_s2 = [logr[i] for i, l in enumerate(labels) if l == condition_labels[1]]
+        # If the SS for either array is 0, skip the anova test
+        if sum_of_squares(logr_s1 - np.mean(logr_s1)) == 0 and sum_of_squares(logr_s2 - np.mean(logr_s2)) == 0:
+            if not self.__allow_anova_warnings:
+                raise NanocomporeError("While doing the Anova test we found a sample with within variance = 0. Use --allow_anova_warnings to ignore.")
+            else:
+                aov_table = "Within variance is 0"
+                aov_pvalue = np.finfo(np.float).tiny
+        else:
+            with warnings.catch_warnings():
+                # Convert warnings to errors in order to catch them
+                warnings.filterwarnings('error')
+                try:
+                    aov_table = f_oneway(logr_s1, logr_s2)
+                    aov_pvalue = aov_table.pvalue
+                except RuntimeWarning:
+                    if not self.__allow_anova_warnings:
+                        raise NanocomporeError("While doing the Anova test a runtime warning was raised. Use --allow_anova_warnings to ignore.")
+                    else:
+                        warnings.filterwarnings('default')
+                        aov_table = f_oneway(logr_s1, logr_s2)
+                        aov_pvalue = np.finfo(np.float).tiny
+        if aov_pvalue == 0:
+            raise NanocomporeError("The Anova test returned a p-value of 0. 
This is most likely an error somewhere") + # Calculate the delta log odds ratio, i.e. the difference of the means of the log odds ratios between the two conditions + aov_delta_logit = float(np.mean(logr_s1) - np.mean(logr_s2)) + aov_details = {'table': aov_table, 'log_ratios': logr} + return (aov_pvalue, aov_delta_logit, aov_details) + + + @staticmethod + def __gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): + Y = [sample_condition_labels[i] for i in Y] + y_pred = np.append(y_pred, [0, 0, 1, 1]) + Y.extend([condition_labels[0], condition_labels[1], condition_labels[0], condition_labels[1]]) + Y = pd.get_dummies(Y) + Y['intercept'] = 1 + logit = dm.Logit(y_pred, Y[['intercept', condition_labels[1]]]) with warnings.catch_warnings(): - # Convert warnings to errors in order to catch them warnings.filterwarnings('error') try: - aov_table = f_oneway(logr_s1, logr_s2) - aov_pvalue = aov_table.pvalue - except RuntimeWarning: - if not allow_warnings: - raise NanocomporeError("While doing the Anova test a runtime warning was raised. Use --allow_warnings to ignore.") - else: - warnings.filterwarnings('default') - aov_table = f_oneway(logr_s1, logr_s2) - aov_pvalue = np.finfo(np.float).tiny - if aov_pvalue == 0: - raise NanocomporeError("The Anova test returned a p-value of 0. This is most likely an error somewhere") - # Calculate the delta log odds ratio, i.e. the difference of the means of the log odds ratios between the two conditions - aov_delta_logit = float(np.mean(logr_s1) - np.mean(logr_s2)) - aov_details = {'table': aov_table, 'log_ratios': logr} - return (aov_pvalue, aov_delta_logit, aov_details) - - -def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels): - Y = [ sample_condition_labels[i] for i in Y] - y_pred=np.append(y_pred, [0,0,1,1]) - Y.extend([condition_labels[0], condition_labels[1], condition_labels[0], condition_labels[1]]) - Y = pd.get_dummies(Y) - Y['intercept']=1 - logit = dm.Logit(y_pred,Y[['intercept', condition_labels[1]]] ) - with warnings.catch_warnings(): - warnings.filterwarnings('error') - try: - logit_mod = logit.fit(disp=0) - logit_pvalue, logit_coef = logit_mod.pvalues[1], logit_mod.params[1] - except ConvergenceWarning: - logit_mod, logit_pvalue, logit_coef = None, 1, None - if logit_pvalue == 0: - logit_pvalue = np.finfo(np.float).tiny - logit_details = {'model': logit_mod} - return (logit_pvalue, logit_coef, logit_details) - - -def count_reads_in_cluster(counters): - cluster_counts = list() - for k, v in counters.items(): - cluster_counts.append("%s:%s/%s" % (k, v[0], v[1])) - cluster_counts="__".join(cluster_counts) - return cluster_counts - - -def shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell): - """Calculate shift statistics""" - shift_stats = OrderedDict([ - ('c1_mean_intensity', np.mean(condition1_intensity)), - ('c2_mean_intensity', np.mean(condition2_intensity)), - ('c1_median_intensity', np.median(condition1_intensity)), - ('c2_median_intensity', np.median(condition2_intensity)), - ('c1_sd_intensity', np.std(condition1_intensity)), - ('c2_sd_intensity', np.std(condition2_intensity)), - ('c1_mean_dwell', np.mean(condition1_dwell)), - ('c2_mean_dwell', np.mean(condition2_dwell)), - ('c1_median_dwell', np.median(condition1_dwell)), - ('c2_median_dwell', np.median(condition2_dwell)), - ('c1_sd_dwell', np.std(condition1_dwell)), - ('c2_sd_dwell', np.std(condition2_dwell)) - ]) - return shift_stats - - -def cross_corr_matrix(pvalues_vector, context=2): - """ Calculate the cross 
correlation matrix of the - pvalues for a given context. - """ - if len(pvalues_vector) < (context * 3) + 3: - raise NanocomporeError("Not enough p-values for a context of order %s" % context) - - pvalues_vector = np.array([i if not np.isnan(i) else 1 for i in pvalues_vector]) - if any(pvalues_vector == 0) or any(np.isinf(pvalues_vector)) or any(pvalues_vector > 1): - raise NanocomporeError("At least one p-value is invalid") - - matrix = [] - s = pvalues_vector.size - if all(p == 1 for p in pvalues_vector): - return np.ones((context * 2 + 1, context * 2 + 1)) - - for i in range(-context, context + 1): - row = [] - for j in range(-context, context + 1): - row.append(np.corrcoef((np.roll(pvalues_vector, i)[context:s - context]), - (np.roll(pvalues_vector, j)[context:s - context]))[0][1]) - matrix.append(row) - return np.array(matrix) - - -def combine_pvalues_hou(pvalues, weights, cor_mat): - """ Hou's method for the approximation for the distribution of the weighted - combination of non-independent or independent probabilities. - If any pvalue is nan, returns nan. - https://doi.org/10.1016/j.spl.2004.11.028 - pvalues: list of pvalues to be combined - weights: the weights of the pvalues - cor_mat: a matrix containing the correlation coefficients between pvalues - Test: when weights are equal and cor=0, hou is the same as Fisher - print(combine_pvalues([0.1,0.02,0.1,0.02,0.3], method='fisher')[1]) - print(hou([0.1,0.02,0.1,0.02,0.3], [1,1,1,1,1], np.zeros((5,5)))) - """ - if(len(pvalues) != len(weights)): - raise NanocomporeError("Can't combine pvalues is pvalues and weights are not the same length.") - if( cor_mat.shape[0] != cor_mat.shape[1] or cor_mat.shape[0] != len(pvalues)): - raise NanocomporeError("The correlation matrix needs to be squared, with each dimension equal to the length of the pvalued vector.") - if all(p == 1 for p in pvalues): - return 1 - if any((p == 0 or np.isinf(p) or p > 1) for p in pvalues): - raise NanocomporeError("At least one p-value is invalid") - - # Covariance estimation as in Kost and McDermott (eq:8) - # https://doi.org/10.1016/S0167-7152(02)00310-3 - cov = lambda r: (3.263*r)+(0.710*r**2)+(0.027*r**3) - k = len(pvalues) - cov_sum = np.float64(0) - sw_sum = np.float64(0) - w_sum = np.float64(0) - tau = np.float64(0) - for i in range(k): - for j in range(i + 1, k): - cov_sum += weights[i] * weights[j] * cov(cor_mat[i][j]) - sw_sum += weights[i]**2 - w_sum += weights[i] - # Calculate the weighted Fisher's combination statistic - tau += weights[i] * (-2 * np.log(pvalues[i])) - # Correction factor - c = (2 * sw_sum + cov_sum) / (2 * w_sum) - # Degrees of freedom - f = (4 * w_sum**2) / (2 * sw_sum + cov_sum) - # chi2.sf is the same as 1-chi2.cdf but is more accurate - combined_p_value = chi2.sf(tau/c, f) - # Return a very small number if pvalue = 0 - if combined_p_value == 0: - combined_p_value = np.finfo(np.float).tiny - return combined_p_value - - -def harmonic_series(sequence_context): - weights = [] - for i in range(-sequence_context, sequence_context + 1): - weights.append(1 / (abs(i) + 1)) - return weights - - -def sum_of_squares(x): - """ - Square each element of the input array and return the sum - """ - x = np.atleast_1d(x) - return np.sum(x * x) - - -def has_low_coverage(pos_dict, min_coverage): - for cond_dict in pos_dict.values(): - for sample_val in cond_dict.values(): - if sample_val["coverage"] < min_coverage: - return True - return False + logit_mod = logit.fit(disp=0) + logit_pvalue, logit_coef = logit_mod.pvalues[1], logit_mod.params[1] + except 
ConvergenceWarning: + logit_mod, logit_pvalue, logit_coef = None, 1, None + if logit_pvalue == 0: + logit_pvalue = np.finfo(np.float).tiny + logit_details = {'model': logit_mod} + return (logit_pvalue, logit_coef, logit_details) + + + @staticmethod + def __count_reads_in_cluster(counters): + cluster_counts = list() + for k, v in counters.items(): + cluster_counts.append("%s:%s/%s" % (k, v[0], v[1])) + cluster_counts = "_".join(cluster_counts) + return cluster_counts + + + @staticmethod + def __shift_stats(condition1_intensity, condition2_intensity, condition1_dwell, condition2_dwell): + """Calculate shift statistics""" + shift_stats = OrderedDict([ + ('c1_mean_intensity', np.mean(condition1_intensity)), + ('c2_mean_intensity', np.mean(condition2_intensity)), + ('c1_median_intensity', np.median(condition1_intensity)), + ('c2_median_intensity', np.median(condition2_intensity)), + ('c1_sd_intensity', np.std(condition1_intensity)), + ('c2_sd_intensity', np.std(condition2_intensity)), + ('c1_mean_dwell', np.mean(condition1_dwell)), + ('c2_mean_dwell', np.mean(condition2_dwell)), + ('c1_median_dwell', np.median(condition1_dwell)), + ('c2_median_dwell', np.median(condition2_dwell)), + ('c1_sd_dwell', np.std(condition1_dwell)), + ('c2_sd_dwell', np.std(condition2_dwell)) + ]) + return shift_stats + + + def __cross_corr_matrix(self, pvalues_vector): + """Calculate the cross correlation matrix of the pvalues for a given context.""" + context = self.__sequence_context + if len(pvalues_vector) < (context * 3) + 3: + raise NanocomporeError(f"Not enough p-values for a context of {context}") + + pvalues_vector = np.array([i if not np.isnan(i) else 1 for i in pvalues_vector]) + if any(pvalues_vector == 0) or any(np.isinf(pvalues_vector)) or any(pvalues_vector > 1): + raise NanocomporeError("At least one p-value is invalid") + + matrix = [] + s = pvalues_vector.size + if all(p == 1 for p in pvalues_vector): + return np.ones((context * 2 + 1, context * 2 + 1)) + + for i in range(-context, context + 1): + row = [] + for j in range(-context, context + 1): + row.append(np.corrcoef((np.roll(pvalues_vector, i)[context:s - context]), + (np.roll(pvalues_vector, j)[context:s - context]))[0][1]) + matrix.append(row) + return np.array(matrix) + + + def __combine_pvalues_hou(self, pvalues, cor_mat): + """ Hou's method for the approximation for the distribution of the weighted + combination of non-independent or independent probabilities. + If any pvalue is nan, returns nan. + https://doi.org/10.1016/j.spl.2004.11.028 + pvalues: list of pvalues to be combined + cor_mat: a matrix containing the correlation coefficients between pvalues + Test: when weights are equal and cor=0, hou is the same as Fisher + print(combine_pvalues([0.1,0.02,0.1,0.02,0.3], method='fisher')[1]) + print(hou([0.1,0.02,0.1,0.02,0.3], [1,1,1,1,1], np.zeros((5,5)))) + """ + weights = self.__sequence_context_weights + # TODO: are the following sanity checks necessary/useful? 
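+        # As a reference for the computation below (a sketch of Hou's statistic;
+        # names follow this code rather than the original paper):
+        #   tau = sum_i w_i * (-2 * ln(p_i))                 -- weighted Fisher combination
+        #   c   = (2 * sum_i w_i**2 + cov_sum) / (2 * sum_i w_i)
+        #   f   = 4 * (sum_i w_i)**2 / (2 * sum_i w_i**2 + cov_sum)
+        # where cov_sum = sum over pairs i<j of w_i * w_j * cov(r_ij); the
+        # combined p-value is then chi2.sf(tau / c, f).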
+ if len(pvalues) != len(weights): + raise NanocomporeError("Can't combine pvalues if pvalues and weights are not the same length.") + if cor_mat.shape[0] != cor_mat.shape[1] or cor_mat.shape[0] != len(pvalues): + raise NanocomporeError("The correlation matrix needs to be square, with each dimension equal to the length of the pvalued vector.") + if all(p == 1 for p in pvalues): + return 1 + if any((p == 0 or np.isinf(p) or p > 1) for p in pvalues): + raise NanocomporeError("At least one p-value is invalid") + + # Covariance estimation as in Kost and McDermott (eq:8) + # https://doi.org/10.1016/S0167-7152(02)00310-3 + cov = lambda r: (3.263*r)+(0.710*r**2)+(0.027*r**3) + k = len(pvalues) + cov_sum = np.float64(0) + sw_sum = np.float64(0) + w_sum = np.float64(0) + tau = np.float64(0) + for i in range(k): + for j in range(i + 1, k): + cov_sum += weights[i] * weights[j] * cov(cor_mat[i][j]) + sw_sum += weights[i]**2 + w_sum += weights[i] + # Calculate the weighted Fisher's combination statistic + tau += weights[i] * (-2 * np.log(pvalues[i])) + # Correction factor + c = (2 * sw_sum + cov_sum) / (2 * w_sum) + # Degrees of freedom + f = (4 * w_sum**2) / (2 * sw_sum + cov_sum) + # chi2.sf is the same as 1 - chi2.cdf but is more accurate + combined_p_value = chi2.sf(tau / c, f) + # Return a very small number if pvalue = 0 + if combined_p_value == 0: + combined_p_value = np.finfo(np.float).tiny + return combined_p_value + + + def __harmonic_series(self): + weights = [] + for i in range(-self.__sequence_context, self.__sequence_context + 1): + weights.append(1 / (abs(i) + 1)) + return weights + + + @staticmethod + def __sum_of_squares(x): + """ + Square each element of the input array and return the sum + """ + x = np.atleast_1d(x) + return np.sum(x * x) + + + def __has_low_coverage(self, pos_dict): + for cond_dict in pos_dict.values(): + for sample_val in cond_dict.values(): + if sample_val["coverage"] < self.__min_coverage: + return True + return False From 3ff803818e870b1233dbed436384ba8e9f4422bf Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 16:32:08 +0100 Subject: [PATCH 33/49] DataStore: small bug fixes (add 'self' for method calls) --- nanocompore/DataStore.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 2a91f5a..35e8516 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -343,15 +343,15 @@ def __init__(self, self.__with_gmm = with_gmm self.__with_sequence_context = with_sequence_context if with_gmm: - table_defs["gmm_stats"] = table_def_gmm_stats + self.table_defs["gmm_stats"] = self.table_def_gmm_stats if with_sequence_context: # add additional columns for context p-values - table_defs["kmer_stats"] += ["intensity_pvalue_context REAL", - "dwell_pvalue_context REAL", - "adj_intensity_pvalue_context REAL", - "adj_dwell_pvalue_context REAL"] + self.table_defs["kmer_stats"] += ["intensity_pvalue_context REAL", + "dwell_pvalue_context REAL", + "adj_intensity_pvalue_context REAL", + "adj_dwell_pvalue_context REAL"] if with_gmm: - table_defs["gmm_stats"] += ["test_pvalue_context REAL", - "adj_test_pvalue_context REAL"] + self.table_defs["gmm_stats"] += ["test_pvalue_context REAL", + "adj_test_pvalue_context REAL"] def __insert_transcript_get_id(self, tx_name): From 6270cdc1c636bb8cfc03068f808da712e0510058 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 16:32:48 +0100 Subject: [PATCH 34/49] SampComp: use new 'TxComp' class, simplify parameter 
handling
---
 nanocompore/SampComp.py | 57 +++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py
index a6854d9..bbbc7ae 100644
--- a/nanocompore/SampComp.py
+++ b/nanocompore/SampComp.py
@@ -19,7 +19,7 @@
 from nanocompore.common import *
 from nanocompore.DataStore import *
 from nanocompore.Whitelist import Whitelist
-from nanocompore.TxComp import txCompare
+from nanocompore.TxComp import TxComp
 from nanocompore.SampCompDB import SampCompDB
 import nanocompore as pkg
 
@@ -47,9 +47,9 @@ def __init__(self,
                  univariate_test:str = "KS", # or: "MW", "ST"
                  fit_gmm:bool = True,
                  gmm_test:str = "logit", # or: "anova"
-                 allow_warnings:bool = False,
+                 allow_anova_warnings:bool = False,
                  sequence_context:int = 0,
-                 sequence_context_weights:str = "uniform",
+                 sequence_context_weighting:str = "uniform",
                  min_coverage:int = 30,
                  min_ref_length:int = 100,
                  downsample_high_coverage:int = 5000,
@@ -85,12 +85,12 @@ def __init__(self,
             Fit a Gaussian mixture model (GMM) to the intensity/dwell-time distribution?
         * gmm_test
             Method to compare samples based on the GMM ('logit' or 'anova'), or empty for no comparison.
-        * allow_warnings
+        * allow_anova_warnings
             If True, runtime warnings during the ANOVA tests don't raise an error.
         * sequence_context
             Extend statistical analysis to contiguous adjacent bases if available.
-        * sequence_context_weights
-            type of weights to used for combining p-values. {uniform,harmonic}
+        * sequence_context_weighting
+            type of weighting used for combining p-values. {uniform,harmonic}
         * min_coverage
             minimal read coverage required in all samples.
         * min_ref_length
             minimal length of a reference transcript to include it in the analysis
@@ -141,9 +141,9 @@ def __init__(self,
             raise NanocomporeError("Whitelist is not valid")
 
         self.__output_db_path = output_db_path
+        self.__db_args = {"with_gmm": fit_gmm, "with_sequence_context": (sequence_context > 0)}
         db_create_mode = DBCreateMode.OVERWRITE if overwrite else DBCreateMode.CREATE_MAYBE
-        db = DataStore_SampComp(output_db_path, db_create_mode, with_gmm=fit_gmm,
-                                with_sequence_context=(sequence_context > 0))
+        db = DataStore_SampComp(output_db_path, db_create_mode, **self.__db_args)
         with db:
             db.store_whitelist(whitelist)  # TODO: move this to '__call__'? 
@@ -158,12 +158,6 @@ def __init__(self, self.__sample_dict = sample_dict self.__fasta_fn = fasta_fn self.__whitelist = whitelist - self.__univariate_test = univariate_test - self.__fit_gmm = fit_gmm - self.__gmm_test = gmm_test - self.__allow_warnings = allow_warnings - self.__sequence_context = sequence_context - self.__sequence_context_weights = sequence_context_weights self.__nthreads = nthreads - 2 self.__progress = progress @@ -172,6 +166,20 @@ def __init__(self, for samples in sample_dict.values(): self.__n_samples += len(samples) + # If statistical tests are requested, initialise the "TxComp" object: + if univariate_test or fit_gmm: + random_state = np.random.RandomState(seed=42) + self.__tx_compare = TxComp(random_state, + univariate_test=univariate_test, + fit_gmm=fit_gmm, + gmm_test=gmm_test, + sequence_context=sequence_context, + sequence_context_weighting=sequence_context_weighting, + min_coverage=self.__min_coverage, + allow_anova_warnings=allow_anova_warnings) + else: + self.__tx_compare = None + def __call__(self): """ @@ -258,18 +266,9 @@ def process_transcript(self, tx_id, whitelist_reads): logger.debug(f"Data loaded for transcript: {tx_id}") test_results = {} - if univariate_test or fit_gmm: - random_state = np.random.RandomState(seed=42) - test_results = txCompare(tx_id, - kmer_data, - random_state=random_state, - univariate_test=self.__univariate_test, - fit_gmm=self.__fit_gmm, - gmm_test=self.__gmm_test, - sequence_context=self.__sequence_context, - sequence_context_weights=self.__sequence_context_weights, - min_coverage= self.__min_coverage, - allow_warnings=self.__allow_warnings) + if self.__tx_compare: + test_results = self.__tx_compare(tx_id, kmer_data) + # TODO: check "gmm_anova_failed" state of TxComp object # Remove 'default_factory' functions from 'kmer_data' to enable pickle/multiprocessing kmer_data.default_factory = None @@ -339,10 +338,8 @@ def __write_output_to_db(self, out_q, error_q): n_tx = 0 try: # Database was already created earlier to store the whitelist! - db = DataStore_SampComp(self.__output_db_path, DBCreateMode.MUST_EXIST, - with_gmm=self.__fit_gmm, - with_sequence_context=(self.__sequence_context > 0)) - with as db: + db = DataStore_SampComp(self.__output_db_path, DBCreateMode.MUST_EXIST, **self.__db_args) + with db: # Iterate over the counter queue and process items until all poison pills are found for _ in range(self.__nthreads): for ref_id, kmer_data, test_results in iter(out_q.get, None): From bd3499bcb2ed9d751d88a90e7ffdf5f1ec237b1e Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 21 Jul 2021 17:37:32 +0100 Subject: [PATCH 35/49] TxComp/DataStore: bug fixes (use of 'sequence_context') --- nanocompore/DataStore.py | 21 ++++++++++++++------- nanocompore/TxComp.py | 15 ++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index 35e8516..47fcba5 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -345,13 +345,20 @@ def __init__(self, if with_gmm: self.table_defs["gmm_stats"] = self.table_def_gmm_stats if with_sequence_context: # add additional columns for context p-values - self.table_defs["kmer_stats"] += ["intensity_pvalue_context REAL", + # column definitions must go before table constraints! 
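+            # (the last two entries of the 'kmer_stats' definition are its table-level
+            #  constraints, so the new columns are spliced in just before them)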
+ constraints = self.table_defs["kmer_stats"][-2:] + self.table_defs["kmer_stats"] = (self.table_defs["kmer_stats"][:-2] + + ["intensity_pvalue_context REAL", "dwell_pvalue_context REAL", "adj_intensity_pvalue_context REAL", - "adj_dwell_pvalue_context REAL"] + "adj_dwell_pvalue_context REAL"] + + constraints) if with_gmm: - self.table_defs["gmm_stats"] += ["test_pvalue_context REAL", - "adj_test_pvalue_context REAL"] + constraints = self.table_defs["gmm_stats"][-1:] + self.table_defs["gmm_stats"] = (self.table_defs["gmm_stats"][:-1] + + ["test_pvalue_context REAL", + "adj_test_pvalue_context REAL"] + + constraints) def __insert_transcript_get_id(self, tx_name): @@ -388,10 +395,10 @@ def store_test_results(self, tx_name, test_results): kmer_statsid = self._cursor.lastrowid if self.__with_gmm: # insert 'None' (NULL) into adj. p-value columns: - values = [kmer_statsid, res["gmm_model"].n_components, res["gmm_cluster_counts"], - res["gmm_test_stat"], res["gmm_pvalue"], None] + values = [kmer_statsid, res["gmm_model"].n_components, res.get("gmm_cluster_counts"), + res.get("gmm_test_stat"), res.get("gmm_pvalue"), None] if self.__with_sequence_context: - values += [res["gmm_pvalue_context"], None] + values += [res.get("gmm_pvalue_context"), None] qmarks = ", ".join(["?"] * len(values)) try: self._cursor.execute(f"INSERT INTO gmm_stats VALUES ({qmarks})", values) diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index 230cd32..4565b4d 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -41,7 +41,7 @@ def __init__(self, if sequence_context > 0: if sequence_context_weighting == "harmonic": # Generate weights as a symmetrical harmonic series - self.__sequence_context_weights = harmonic_series(self.__sequence_context) + self.__sequence_context_weights = self.__harmonic_series() elif sequence_context_weighting == "uniform": self.__sequence_context_weights = [1] * (2 * self.__sequence_context + 1) else: @@ -128,23 +128,24 @@ def __combine_adjacent_pvalues(self, results, pvalue_key): # Collect pvalue list for test pval_list = [] for res_dict in results.values(): + # TODO: avoid 'None'/'np.nan' checks below by checking and replacing here? 
pval_list.append(res_dict.get(pvalue_key)) - # Compute cross correlation matrix - corr_matrix = cross_corr_matrix(pval_list) + # Compute cross correlation matrix + corr_matrix = self.__cross_corr_matrix(pval_list) logger.debug("Combine adjacent position pvalues with Hou's method position by position") - combined_label = f"{pvalue_key}_context_{sequence_context}" + combined_label = f"{pvalue_key}_context" # Iterate over each position in previously generated result dictionary for mid_pos, res_dict in results.items(): # If the mid p-value is NaN, also set the context p-value to NaN - if np.isnan(res_dict[pvalue_key]): + if (res_dict[pvalue_key] is None) or np.isnan(res_dict[pvalue_key]): results[mid_pos][combined_label] = np.nan continue ## Otherwise collect adjacent p-values and combine them: pval_list = [] for pos in range(mid_pos - self.__sequence_context, mid_pos + self.__sequence_context + 1): # If any of the positions is missing or any of the p-values in the context is NaN, consider it 1 - if (pos not in results) or np.isnan(results[pos][pvalue_key]): + if (pos not in results) or (results[pos][pvalue_key] is None) or np.isnan(results[pos][pvalue_key]): pval_list.append(1) else: # just extract the corresponding pvalue pval_list.append(results[pos][pvalue_key]) @@ -345,7 +346,7 @@ def __cross_corr_matrix(self, pvalues_vector): if len(pvalues_vector) < (context * 3) + 3: raise NanocomporeError(f"Not enough p-values for a context of {context}") - pvalues_vector = np.array([i if not np.isnan(i) else 1 for i in pvalues_vector]) + pvalues_vector = np.array([i if (i is not None) and not np.isnan(i) else 1 for i in pvalues_vector]) if any(pvalues_vector == 0) or any(np.isinf(pvalues_vector)) or any(pvalues_vector > 1): raise NanocomporeError("At least one p-value is invalid") From 39c5009b7e6f082e4cdf9dfc63db3bd7055c3ba9 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 23 Jul 2021 15:37:54 +0100 Subject: [PATCH 36/49] SampComp: add multiple testing correction, remove 'shelve' export --- nanocompore/SampComp.py | 113 +++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 49 deletions(-) diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py index bbbc7ae..2575b1c 100644 --- a/nanocompore/SampComp.py +++ b/nanocompore/SampComp.py @@ -14,6 +14,7 @@ from tqdm import tqdm import numpy as np from pyfaidx import Fasta +from statsmodels.stats.multitest import multipletests # Local package from nanocompore.common import * @@ -179,6 +180,10 @@ def __init__(self, allow_anova_warnings=allow_anova_warnings) else: self.__tx_compare = None + ## used to adjust p-values: + self.__univariate_test = univariate_test + self.__gmm_test = gmm_test if fit_gmm else "" + self.__sequence_context = (sequence_context > 0) def __call__(self): @@ -227,6 +232,14 @@ def __call__(self): logger.error("An error occured while trying to kill processes\n") raise E + # Adjust p-values for multiple testing: + if self.__univariate_test or self.__gmm_test: + logger.info("Running multiple testing correction") + self.__adjust_pvalues() + # context-based p-values are not independent tests, so adjust them separately: + if self.__sequence_context: + self.__adjust_pvalues(sequence_context=True) + def process_transcript(self, tx_id, whitelist_reads): """Process a transcript given filtered reads from Whitelist""" @@ -355,52 +368,54 @@ def __write_output_to_db(self, out_q, error_q): error_q.put(None) - def __write_output(self, out_q, error_q): - # Get results out of the out queue and write in shelve - 
pvalue_tests = set() - ref_id_list = [] - n_tx = n_pos = 0 - try: - with shelve.open(self.__db_fn, flag='n') as db, \ - tqdm(total=len(self.__whitelist), unit=" Processed References", - disable=not self.__progress) as pbar: - # Iterate over the counter queue and process items until all poison pills are found - for _ in range(self.__nthreads): - for ref_id, kmer_data, test_results in iter(out_q.get, None): - ref_id_list.append(ref_id) - logger.debug("Writer thread writing %s"%ref_id) - # Get pvalue fields available in analysed data - for res_dict in test_results.values(): - for res in res_dict.keys(): - if "pvalue" in res: - n_pos += 1 - pvalue_tests.add(res) - # Write results in a shelve db - db[ref_id] = (kmer_data, test_results) - pbar.update(1) - n_tx += 1 - - # Write list of refid - db["__ref_id_list"] = ref_id_list - - # Write metadata - db["__metadata"] = { - "package_name": pkg.__version__, - "package_version": pkg.__name__, - "timestamp": str(datetime.datetime.now()), - "comparison_methods": self.__comparison_methods, - "pvalue_tests": sorted(list(pvalue_tests)), - "sequence_context": self.__sequence_context, - "min_coverage": self.__min_coverage, - "n_samples": self.__n_samples} - - # Manage exceptions and add error trackback to error queue - except Exception: - logger.error("Error in Writer") - error_q.put(traceback.format_exc()) - - finally: - logger.debug(f"Wrote {n_tx} transcripts, {n_pos} valid positions") - logger.info(f"All done. Transcripts processed: {n_tx}") - # Kill error queue with poison pill - error_q.put(None) + # TODO: move this to 'DataStore_SampComp'? + def __adjust_pvalues(self, method="fdr_bh", sequence_context=False): + """Perform multiple testing correction of p-values and update database""" + db = DataStore_SampComp(self.__output_db_path, DBCreateMode.MUST_EXIST, **self.__db_args) + with db: + pvalues = [] + index = [] + # for "context-averaged" p-values, add a suffix to the column names: + col_suffix = "_context" if sequence_context else "" + if self.__univariate_test: + query = f"SELECT id, intensity_pvalue{col_suffix}, dwell_pvalue{col_suffix} FROM kmer_stats" + try: + for row in db.cursor.execute(query): + for pv_col in ["intensity_pvalue", "dwell_pvalue"]: + pv_col += col_suffix + pv = row[pv_col] + # "multipletests" doesn't handle NaN values well, so skip those: + if (pv is not None) and not np.isnan(pv): + pvalues.append(pv) + index.append({"table": "kmer_stats", "id_col": "id", + "id": row["id"], "pv_col": pv_col}) + except: + logger.error("Error reading p-values from table 'kmer_stats'") + raise + if self.__gmm_test: + pv_col = "test_pvalue" + col_suffix + query = f"SELECT kmer_statsid, {pv_col} FROM gmm_stats WHERE {pv_col} IS NOT NULL" + try: + for row in db.cursor.execute(query): + pv = row[pv_col] + # "multipletests" doesn't handle NaN values well, so skip those: + if not np.isnan(pv): # 'None' (NULL) values have been excluded in the query + pvalues.append(pv) + index.append({"table": "gmm_stats", "id_col": "kmer_statsid", + "id": row["kmer_statsid"], "pv_col": pv_col}) + except: + logger.error("Error reading p-values from table 'gmm_stats'") + raise + logger.debug(f"Number of p-values for multiple testing correction: {len(pvalues)}") + if not pvalues: + return + adjusted = multipletests(pvalues, method=method)[1] + assert len(pvalues) == len(adjusted) + # sqlite module can't handle numpy float64 values, so convert to floats using "tolist": + for ind, adj_pv in zip(index, adjusted.tolist()): + query = "UPDATE {table} SET adj_{pv_col} = ? 
WHERE {id_col} = {id}".format_map(ind) + try: + db.cursor.execute(query, (adj_pv, )) + except: + logger.error("Error updating adjusted p-value for ID {id} in table '{table}'".format_map(ind)) + raise From c6faa5e68497006997934e06f5715f0cfe11b267 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 23 Jul 2021 20:15:35 +0100 Subject: [PATCH 37/49] PostProcess: implement 'save_report' for SQLite data --- nanocompore/PostProcess.py | 149 ++++++++++++------------------------- 1 file changed, 49 insertions(+), 100 deletions(-) diff --git a/nanocompore/PostProcess.py b/nanocompore/PostProcess.py index 92a73df..3a67c82 100644 --- a/nanocompore/PostProcess.py +++ b/nanocompore/PostProcess.py @@ -5,14 +5,17 @@ from loguru import logger # Third party -from statsmodels.stats.multitest import multipletests +# ... +# Local package +from nanocompore.common import * +from nanocompore.DataStore import DataStore_EventAlign, DataStore_SampComp #~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~# class PostProcess(object): """Helper class for post-processing `SampComp` results""" - def __init___(self, sampcomp_db_path:str, eventalign_db_path:str, bed_path:str=None): + def __init__(self, sampcomp_db_path:str, eventalign_db_path:str, bed_path:str=None): self._sampcomp_db_path = sampcomp_db_path self._eventalign_db_path = eventalign_db_path self._bed_path = bed_path @@ -147,85 +150,53 @@ def save_report(self, output_fn:str=None, include_shift_stats:bool=True): else: raise NanocomporeError("output_fn needs to be a string or None") - shift_stat_columns = [] - if include_shift_stats: - shift_stat_columns = ["c1_mean_intensity", "c2_mean_intensity", - "c1_median_intensity", "c2_median_intensity", - "c1_sd_intensity", "c2_sd_intensity", - "c1_mean_dwell", "c2_mean_dwell", - "c1_median_dwell", "c2_median_dwell", - "c1_sd_dwell", "c2_sd_dwell"] - with DataStore_SampComp(self._sampcomp_db_path) as sc_db, \ DataStore_EventAlign(self._eventalign_db_path) as ea_db: - # Which statistical tests were performed? - query = "SELECT DISTINCT test FROM univariate_results" - univar_tests = [row["test"] for row in sc_db.cursor.execute(query)] - query = "SELECT DISTINCT test FROM gmm_results" - gmm_tests = [row["test"] for row in sc_db.cursor.execute(query)] - # Generate headers - headers = ['pos', 'chr', 'genomicPos', 'ref_id', 'strand', 'ref_kmer'] - for test in sorted(univar_tests): - headers += [f"{test}_dwell_pvalue", f"{test}_intensity_pvalue"] - if gmm_tests: - # TODO: what if GMM was fitted, but no test were performed? - headers += ["GMM_cov_type", "GMM_n_clust", "cluster_counts"] - if "logit" in gmm_tests: - headers += ["GMM_logit_pvalue", "Logit_LOR"] - if "anova" in gmm_tests: - headers += ["GMM_anova_pvalue", "Anova_delta_logit"] - # Write headers to file - fp.write('\t'.join([str(i) for i in headers]) + '\n') - - # Merge kmer information with transcript name: - columns = ["kmer_stats.id", "transcriptid", "kmer AS pos", "name AS ref_id"] + shift_stat_columns - columns = ", ".join(columns) - query = f"SELECT {columns} FROM kmer_stats LEFT JOIN transcripts ON transcriptid = transcripts.id ORDER BY transcriptid, kmer" + # do we have GMM results? 
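+            # ('sqlite_master' contains one row per schema object, so the query below
+            #  finds a row only if the 'gmm_stats' table was created for this run)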
+ query = "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'gmm_stats'" + sc_db.cursor.execute(query) + with_gmm = sc_db.cursor.fetchone() is not None + query = "SELECT * FROM kmer_stats LEFT JOIN transcripts ON transcriptid = transcripts.id" + if with_gmm: + query += " LEFT JOIN gmm_stats ON kmer_stats.id = gmm_stats.kmer_statsid" + query += " ORDER BY transcriptid, kmer" + first_row = True + shift_stat_columns = [] + univariate_pvalue_columns = [] + gmm_pvalue_columns = [] for row in sc_db.cursor.execute(query): - db_data = dict(row) - # Get p-values etc.: - id = db_data["id"] - if univar_tests: - query = f"SELECT test, intensity_pvalue, dwell_pvalue FROM univariate_results WHERE kmer_statsid = {id}" - for row2 in sc_db.cursor.execute(query): - test = row2["test"] - db_data[test + "_intensity_pvalue"] = row2["intensity_pvalue"] - db_data[test + "_dwell_pvalue"] = row2["dwell_pvalue"] - if gmm_tests: - query = f"SELECT test, test_pvalue, test_stat FROM gmm_results WHERE gmm_statsid = {id}" - for row2 in sc_db.cursor.execute(query): - test = row2["test"] - db_data[test + "_intensity_pvalue"] = row2["intensity_pvalue"] - db_data[test + "_dwell_pvalue"] = row2["dwell_pvalue"] - - - - # TODO: where does chromosome and genomic pos. information come from? - - - # We loop over the IDs so that ref_pos_list can be prefetched for each transcript - for cur_id in self.ref_id_list: - cur_ref_pos_list = self[cur_id] - for record in self.results[self.results.ref_id == cur_id ].itertuples(): - if "GMM" in self._metadata["comparison_methods"]: - record_txComp = cur_ref_pos_list[record.pos]['txComp'] - line = [] - for f in headers: - if f in record._fields: - line.append(getattr(record, f)) - elif f == "GMM_cov_type": - line.append(record_txComp['GMM_model']['model'].covariance_type) - elif f == "GMM_n_clust": - line.append(record_txComp['GMM_model']['model'].n_components) - elif f == "cluster_counts": - line.append(record_txComp['GMM_model']['cluster_counts']) - elif f == "Anova_delta_logit": - line.append(record_txComp['GMM_anova_model']['delta_logit']) - elif f == "Logit_LOR": - line.append(record_txComp['GMM_logit_model']['coef']) - else: line.append("NA") - fp.write('\t'.join([ str(i) for i in line ])+'\n') - fp.close() + # retrieve k-mer sequence: + ea_query = "SELECT sequence FROM kmers LEFT JOIN reads ON readid = reads.id WHERE transcriptid = ? AND position = ? LIMIT 1" + ea_db.cursor.execute(ea_query, (row["transcriptid"], row["kmer"])) + seq = ea_db.cursor.fetchone()[0] + out_dict = {"transcript": row["name"], + "position": row["kmer"], + "sequence": seq} + # TODO: add chromosome, genomic pos., strand information (from where?) 
+ if first_row: # check which columns we have (do this only once) + univariate_pvalue_columns = [col for col in row.keys() + if ("intensity_pvalue" in col) or ("dwell_pvalue" in col)] + if include_shift_stats: + shift_stat_columns = [col for col in row.keys() if col.startswith(("c1_", "c2_"))] + if with_gmm: + gmm_pvalue_columns = [col for col in row.keys() if "test_pvalue" in col] + + for col in shift_stat_columns: + out_dict[col] = row[col] + for col in univariate_pvalue_columns: + out_dict[col] = row[col] + if with_gmm: + out_dict["GMM_n_components"] = row["n_components"] + out_dict["GMM_cluster_counts"] = row["cluster_counts"] + out_dict["GMM_test_stat"] = row["test_stat"] + for col in gmm_pvalue_columns: + out_dict[col.replace("test", "GMM", 1)] = row[col] + + if first_row: # write header line + fp.write("\t".join(out_dict.keys()) + "\n") + # write output data: + fp.write("\t".join(str(x) for x in out_dict.values()) + "\n") + first_row = False def save_shift_stats(self, output_fn=None): @@ -256,25 +227,3 @@ def save_shift_stats(self, output_fn=None): line = [tx, pos, *ss.values()] fp.write('\t'.join([ str(i) for i in line ])+'\n') fp.close() - - - @staticmethod - def __multipletests_filter_nan(pvalues, method="fdr_bh"): - """ - Performs p-value correction for multiple hypothesis testing - using the method specified. The pvalues list can contain - np.nan values, which are ignored during p-value correction. - test: input=[0.1, 0.01, np.nan, 0.01, 0.5, 0.4, 0.01, 0.001, np.nan, np.nan, 0.01, np.nan] - out: array([0.13333333, 0.016 , nan, 0.016 , 0.5 , - 0.45714286, 0.016 , 0.008 , nan, nan, - 0.016 , nan]) - """ - if all([np.isnan(p) for p in pvalues]): - return pvalues - - pvalues_no_nan = [p for p in pvalues if not np.isnan(p)] - corrected_p_values = multipletests(pvalues_no_nan, method=method)[1] - for i, p in enumerate(pvalues): - if np.isnan(p): - corrected_p_values = np.insert(corrected_p_values, i, np.nan, axis=0) - return(corrected_p_values) From bc3e8ba5473a18b2f19c3a5d8d01635abf4f5061 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Fri, 23 Jul 2021 20:26:03 +0100 Subject: [PATCH 38/49] PostProcess: remove 'save_shift_stats' (now included in 'save_report') --- nanocompore/PostProcess.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/nanocompore/PostProcess.py b/nanocompore/PostProcess.py index 3a67c82..d9f1ae5 100644 --- a/nanocompore/PostProcess.py +++ b/nanocompore/PostProcess.py @@ -197,33 +197,3 @@ def save_report(self, output_fn:str=None, include_shift_stats:bool=True): # write output data: fp.write("\t".join(str(x) for x in out_dict.values()) + "\n") first_row = False - - - def save_shift_stats(self, output_fn=None): - """ - Save the mean, median and sd intensity and dwell time for each condition and for each position. - This can be used to evaluate the intensity of the shift for significant positions. - * output_fn - Path to file where to write the data. If None, data is returned to the standard output. 
- """ - if output_fn is None: - fp = sys.stdout - elif isinstance(output_fn, str): - try: - fp = open(output_fn, "w") - except: - raise NanocomporeError("Error opening output file %s" % output_fn) - else: - raise NanocomporeError("output_fn needs to be a string or None") - - headers = ['c1_mean_intensity', 'c2_mean_intensity', 'c1_median_intensity', 'c2_median_intensity', 'c1_sd_intensity', 'c2_sd_intensity', 'c1_mean_dwell', 'c2_mean_dwell', 'c1_median_dwell', 'c2_median_dwell', 'c1_sd_dwell', 'c2_sd_dwell'] - fp.write('\t'.join([ str(i) for i in ["ref_id", "pos"]+headers ])+'\n') - for tx, refpos in self: - for pos, refpos_list in enumerate(refpos): - if "txComp" in refpos_list: - ss = refpos_list['txComp']['shift_stats'] - if list(ss.keys()) != headers: - raise NanocomporeError("Mismatch in shift_stats headers") - line = [tx, pos, *ss.values()] - fp.write('\t'.join([ str(i) for i in line ])+'\n') - fp.close() From 479b4439d38e0dd889ec0134ceb40117c2055c85 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Thu, 12 Aug 2021 16:30:19 +0100 Subject: [PATCH 39/49] Eventalign_collapse: remove TSV output option, simplify parameters --- nanocompore/Eventalign_collapse.py | 122 +++++------------------------ 1 file changed, 21 insertions(+), 101 deletions(-) diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py index 7c6f6d6..1617ba8 100644 --- a/nanocompore/Eventalign_collapse.py +++ b/nanocompore/Eventalign_collapse.py @@ -37,28 +37,22 @@ class Eventalign_collapse (): def __init__(self, eventalign_fn:str, sample_name:str, - outpath:str="./", - outprefix:str="out", - write_db:bool = True, + output_db_path:str, overwrite:bool = False, n_lines:int=None, nthreads:int = 3, progress:bool = False): + # TODO: is 'overwrite' a useful option, as data from multiple samples needs to be accumulated in the same DB? """ Collapse the nanopolish eventalign events at kmer level * eventalign_fn Path to a nanopolish eventalign tsv output file, or a list of file, or a regex (can be gzipped) * sample_name The name of the sample being processed - * outpath - Path to the output folder (will be created if it does exist yet) - * outprefix - text outprefix for all the files generated - * write_db - Write output to database? (Otherwise to TSV file.) + * output_db_path + Path to the output (database) file * overwrite - If the output directory already exists, the standard behaviour is to raise an error to prevent overwriting existing data - This option ignore the error and overwrite data if they have the same outpath and outprefix. + Overwrite an existing output file? * n_lines Maximum number of read to parse. 
* nthreads @@ -77,10 +71,9 @@ def __init__(self, # Save args to self values self.__sample_name = sample_name - self.__outpath = outpath - self.__outprefix = outprefix - self.__write_db = write_db self.__eventalign_fn = eventalign_fn + self.__output_db_path = output_db_path + self.__overwrite = overwrite self.__n_lines = n_lines self.__nthreads = nthreads - 2 # subtract 1 for reading and 1 for writing self.__progress = progress @@ -106,11 +99,8 @@ def __call__(self): ps_list.append (mp.Process (target=self.__split_reads, args=(in_q, error_q))) for i in range (self.__nthreads): ps_list.append (mp.Process (target=self.__process_read, args=(in_q, out_q, error_q))) - if self.__write_db: - ps_list.append (mp.Process (target=self.__write_output_to_db, args=(out_q, error_q))) - # TODO: Check that sample_name does not exist already in DB - else: - ps_list.append(mp.Process(target=self.__write_output, args=(out_q, error_q))) + # TODO: Check that sample_name does not exist already in DB + ps_list.append(mp.Process(target=self.__write_output, args=(out_q, error_q))) # Start processes and monitor error queue try: @@ -143,7 +133,7 @@ def __call__(self): #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PRIVATE METHODS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - def __split_reads (self, in_q, error_q): + def __split_reads(self, in_q, error_q): """ Mono-threaded reader """ @@ -198,7 +188,7 @@ def __split_reads (self, in_q, error_q): logger.debug("Parsed Reads:{} Events:{}".format(n_reads, n_events)) - def __process_read (self, in_q, out_q, error_q): + def __process_read(self, in_q, out_q, error_q): """ Multi-threaded workers collapsing events at kmer level """ @@ -236,18 +226,18 @@ def __process_read (self, in_q, out_q, error_q): out_q.put(None) - def __write_output_to_db(self, out_q, error_q): + def __write_output(self, out_q, error_q): """ - Mono-threaded Writer + Single-threaded writer """ logger.debug("Start writing output to DB") - pr = profile.Profile() - pr.enable() + # pr = profile.Profile() + # pr.enable() n_reads = 0 - db_path = os.path.join(self.__outpath, self.__outprefix+"_nanocompore.db") + db_create_mode = DBCreateMode.OVERWRITE if self.__overwrite else DBCreateMode.CREATE_MAYBE try: - with DataStore_EventAlign(self.__db_path, DBCreateMode.CREATE_MAYBE) as datastore, \ + with DataStore_EventAlign(self.__output_db_path, db_create_mode) as datastore, \ tqdm (unit=" reads") as pbar: # Iterate over out queue until nthread poison pills are found for _ in range (self.__nthreads): @@ -263,83 +253,13 @@ def __write_output_to_db(self, out_q, error_q): logger.info ("Output reads written:{}".format(n_reads)) # Kill error queue with poison pill error_q.put(None) - pr.disable() - pr.dump_stats("prof") - - - def __write_output(self, out_q, error_q): - """ - Mono-threaded Writer - """ - logger.debug("Start writing output files") + # pr.disable() + # pr.dump_stats("prof") - byte_offset = n_reads = n_kmers = 0 - - # Init variables for index files - idx_fn = os.path.join(self.__outpath, self.__outprefix+"_eventalign_collapse.tsv.idx") - data_fn = os.path.join(self.__outpath, self.__outprefix+"_eventalign_collapse.tsv") - - try: - # Open output files and tqdm progress bar - with open (data_fn, "w") as data_fp, open (idx_fn, "w") as idx_fp, tqdm (unit=" reads", disable=not self.__progress) as pbar: - - # Iterate over out queue until nthread poison pills are found - for _ in range (self.__nthreads): - for read in iter (out_q.get, None): - read_res_d = read.get_read_results() - kmer_res_l = read.get_kmer_results() - n_reads+=1 - - # Define 
file header from first read and first kmer - if byte_offset == 0: - idx_header_list = list(read_res_d.keys())+["byte_offset","byte_len"] - idx_header_str = "\t".join(idx_header_list) - data_header_list = list(kmer_res_l[0].keys()) - data_header_str = "\t".join(data_header_list) - - # Write index file header - idx_fp.write ("{}\n".format(idx_header_str)) - - # Write data file header - byte_len = 0 - header_str = "#{}\t{}\n{}\n".format(read_res_d["read_id"], read_res_d["ref_id"], data_header_str) - data_fp.write(header_str) - byte_len+=len(header_str) - - # Write kmer data matching data field order - for kmer in kmer_res_l: - n_kmers += 1 - data_str = "\t".join([str(kmer[f]) for f in data_header_list]) + "\n" - data_fp.write(data_str) - byte_len += len(data_str) - - # Add byte - read_res_d["byte_offset"] = byte_offset - read_res_d["byte_len"] = byte_len-1 - idx_str = "\t".join([str(read_res_d[f]) for f in idx_header_list]) - idx_fp.write("{}\n".format(idx_str)) - - # Update pbar - byte_offset += byte_len - pbar.update(1) - - # Flag last line - data_fp.write ("#\n") - - # Manage exceptions and add error trackback to error queue - except Exception: - logger.error("Error in Writer") - error_q.put (NanocomporeError(traceback.format_exc())) - - finally: - logger.debug("Written Reads:{} Kmers:{}".format(n_reads, n_kmers)) - logger.info ("Output reads written:{}".format(n_reads)) - # Kill error queue with poison pill - error_q.put(None) #~~~~~~~~~~~~~~~~~~~~~~~~~~HELPER CLASSES~~~~~~~~~~~~~~~~~~~~~~~~~~# -class Read (): +class Read: """Helper class representing a single read""" def __init__ (self, read_id, ref_id, sample_name): @@ -413,7 +333,7 @@ def get_kmer_results (self): l = [kmer.get_results() for kmer in self.kmer_l] return l -class Kmer (): +class Kmer: """Helper class representing a single kmer""" def __init__ (self): From 543653da47abd79fb34e3713df8e5e541836fce1 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Thu, 12 Aug 2021 16:39:33 +0100 Subject: [PATCH 40/49] SampComp: remove irrelevant data from output queue tuple (thanks Tommaso!) --- nanocompore/SampComp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py index 2575b1c..e7e2c6f 100644 --- a/nanocompore/SampComp.py +++ b/nanocompore/SampComp.py @@ -333,7 +333,7 @@ def __process_references(self, in_q, out_q, error_q): # Add the current read details to queue logger.debug(f"Adding '{ref_id}' to out_q") - out_q.put((ref_id, results["kmer_data"], results["test_results"])) + out_q.put((ref_id, results["test_results"])) # Manage exceptions and add error trackback to error queue except Exception as e: @@ -355,7 +355,7 @@ def __write_output_to_db(self, out_q, error_q): with db: # Iterate over the counter queue and process items until all poison pills are found for _ in range(self.__nthreads): - for ref_id, kmer_data, test_results in iter(out_q.get, None): + for ref_id, test_results in iter(out_q.get, None): logger.debug("Writer thread storing transcript %s" % ref_id) db.store_test_results(ref_id, test_results) n_tx += 1 From 1721746144fb29e7b757bd4afdbf427afa52a735 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Mon, 16 Aug 2021 19:01:04 +0100 Subject: [PATCH 41/49] common: update function to build dict. 
with sample information
---
 nanocompore/common.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/nanocompore/common.py b/nanocompore/common.py
index 6d3f3ad..ca9b7e9 100644
--- a/nanocompore/common.py
+++ b/nanocompore/common.py
@@ -28,14 +28,11 @@ class NanocomporeWarning (Warning):
 
 #~~~~~~~~~~~~~~FUNCTIONS~~~~~~~~~~~~~~#
 
-def build_eventalign_fn_dict(file_list1, file_list2, label1, label2):
+def build_sample_dict(sample_list1, sample_list2, label1, label2):
     """
-    Build the eventalign_fn_dict from file lists and labels
+    Build dictionary with sample information from sample lists and condition labels
     """
-    d = OrderedDict()
-    d[label1] = {"{}_{}".format(label1, i): v for i, v in enumerate(file_list1.split(","),1)}
-    d[label2] = {"{}_{}".format(label2, i): v for i, v in enumerate(file_list2.split(","),1)}
-    return d
+    return {label1: sample_list1.split(","), label2: sample_list2.split(",")}
 
 def check_sample_dict(sample_dict):
     # Check general structure
@@ -46,7 +43,7 @@ def check_sample_dict(sample_dict):
     for condition, samples in sample_dict.items():
         if type(samples) is not list:
             raise NanocomporeError(f"Expected a list of sample names for condition '{condition}'. "
-                                   "Got a '{type(sample_dict)}'.")
+                                   f"Got a '{type(samples)}'.")
         if not samples:
             raise NanocomporeError(f"Empty sample list for condition '{condition}'.")
         if len(samples) == 1:

From fc3f0bb2476f72995ddfa13a01f488d6f58889bb Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Mon, 16 Aug 2021 19:01:34 +0100
Subject: [PATCH 42/49] DataStore: add to-do comment

---
 nanocompore/DataStore.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py
index 47fcba5..d986064 100644
--- a/nanocompore/DataStore.py
+++ b/nanocompore/DataStore.py
@@ -261,6 +261,7 @@ def get_samples(self, sample_dict=None):
                 raise NanocomporeError(f"Sample '{sample}' not present in database")
         return db_samples
 
+    # TODO: is this function never used?
     def store_sample_info(self, sample_dict):
         if not self._connection:
             raise NanocomporeError("Database connection not yet opened")

From 3219601a5274ee0148d1272975c22a146509871b Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Mon, 16 Aug 2021 19:03:34 +0100
Subject: [PATCH 43/49] main: update command line options

---
 nanocompore/Eventalign_collapse.py |   2 +-
 nanocompore/SampComp.py            |   2 +-
 nanocompore/__main__.py            | 328 ++++++++++++++++-------------
 3 files changed, 179 insertions(+), 153 deletions(-)

diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py
index 1617ba8..7575616 100644
--- a/nanocompore/Eventalign_collapse.py
+++ b/nanocompore/Eventalign_collapse.py
@@ -28,7 +28,7 @@
 os.environ["OMP_NUM_THREADS"] = "1"
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
 
-log_level_dict = {"debug":"DEBUG", "info":"INFO", "warning":"WARNING"}
+log_level_dict = {"debug": "DEBUG", "info": "INFO", "warning": "WARNING"}
 
 #logger.remove()
 
 #~~~~~~~~~~~~~~MAIN CLASS~~~~~~~~~~~~~~#
diff --git a/nanocompore/SampComp.py b/nanocompore/SampComp.py
index e7e2c6f..7303b4d 100644
--- a/nanocompore/SampComp.py
+++ b/nanocompore/SampComp.py
@@ -81,7 +81,7 @@ def __init__(self,
             Whitelist object previously generated with nanocompore Whitelist. If not given, will be automatically generated.
         * univariate_test
-            Statistical test to compare the two samples ('MW' for Mann-Whitney, 'KS' for Kolmogorov-Smirnov or 'ST' for Student's t), or empty for no test.
+            Statistical test to compare the two conditions ('MW' for Mann-Whitney, 'KS' for Kolmogorov-Smirnov or 'ST' for Student's t), or empty for no test.
         * fit_gmm
             Fit a Gaussian mixture model (GMM) to the intensity/dwell-time distribution?
         * gmm_test
diff --git a/nanocompore/__main__.py b/nanocompore/__main__.py
index 314906d..9bb46d3 100644
--- a/nanocompore/__main__.py
+++ b/nanocompore/__main__.py
@@ -23,73 +23,112 @@
 
 #~~~~~~~~~~~~~~MAIN PARSER ENTRY POINT~~~~~~~~~~~~~~#
 
-def main(args=None):
+def main():
     # General parser
     parser = argparse.ArgumentParser(description=package_description, formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument('--version', '-v', action='version', version='v'+package_version)
+    parser.add_argument('--version', '-v', action='version', version='v' + package_version)
     subparsers = parser.add_subparsers(dest='subcommand',
         description=textwrap.dedent("""
             nanocompore implements the following subcommands\n
-            \t* eventalign_collapse : Collapse the nanopolish eventalign output at kmers level and compute kmer level statistics\n
+            \t* eventalign_collapse : Collapse the nanopolish eventalign output at kmer level and compute kmer-level statistics\n
             \t* sampcomp : Compare 2 samples and find significant signal differences\n
-            \t* simreads : Simulate reads as a NanopolishComp like file from a fasta file and an inbuild model"""))
+            \t* simreads : Simulate reads as a NanopolishComp-like file from a FASTA file and a built-in model"""))
     subparsers.required = True
 
-    # Sampcomp subparser
+    # Eventalign_collapse subparser
+    parser_ec = subparsers.add_parser("eventalign_collapse", formatter_class=argparse.RawDescriptionHelpFormatter,
+                                      description=textwrap.dedent("""
+                                      Collapse the nanopolish eventalign output at kmer level and compute kmer-level statistics
+                                      * Minimal example:
+                                          nanocompore eventalign_collapse -i nanopolish_eventalign.tsv -s T1 -o eventalign_collapse.db\n"""))
+    parser_ec.set_defaults(func=eventalign_collapse_main)
+
+    parser_ec_in = parser_ec.add_argument_group("Input options")
+    parser_ec_in.add_argument("--input", "-i", default=0,
+                              help="Path to a nanopolish eventalign tsv file, or a list of files, or a regex (can be gzipped). It can be omitted if piped to standard input (default: piped to stdin)")
+    parser_ec_in.add_argument("--sample", "-s", default=None, required=True, help="Unique identifier of the sample")
+
+    parser_ec_out = parser_ec.add_argument_group("Output options")
+    parser_ec_out.add_argument("--output", "-o", default="eventalign_collapse.db",
+                               help="Path or filename of database output file (default: %(default)s)")
+
+    parser_ec_run = parser_ec.add_argument_group("Run options")
+    parser_ec_run.add_argument("--n_lines", "-l", default=None, type=int,
+                               help="Number of lines to parse (default: no limit)")
+
+    parser_ec_misc = parser_ec.add_argument_group("Other options")
+    parser_ec_misc.add_argument("--nthreads", "-t", default=3, type=int,
+                                help="Total number of threads. 
2 threads are reserved for the reader and the writer (default: %(default)s)") + + # SampComp subparser parser_sc = subparsers.add_parser('sampcomp', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(""" Compare 2 samples and find significant signal differences\n * Minimal example with file_list arguments\n - nanocompore sampcomp -1 f1.tsv,f2.tsv -2 f3.tsv,f4.tsv -f ref.fa -o results + nanocompore sampcomp -1 C1,C2 -2 T1,T2 -f ref.fa -o results * Minimal example with sample YAML file\n nanocompore sampcomp -y samples.yaml -f ref -o results""")) parser_sc.set_defaults(func=sampcomp_main) - parser_sc_sample_yaml = parser_sc.add_argument_group('YAML sample files', description="Option allowing to describe sample files in a YAML file") - parser_sc_sample_yaml.add_argument("--sample_yaml", "-y", default=None, type=str, metavar="sample_yaml", - help="YAML file containing the sample file labels. See formatting in documentation. (required if --file_list1 and --file_list2 not given)") - parser_sc_sample_args = parser_sc.add_argument_group('Arguments sample files', description="Option allowing to describe sample files directly as command line arguments") - parser_sc_sample_args.add_argument("--file_list1", "-1", default=None, type=str, metavar="/path/to/Condition1_rep1,/path/to/Condition1_rep2", - help="Comma separated list of NanopolishComp files for label 1. (required if --sample_yaml not given)") - parser_sc_sample_args.add_argument("--file_list2", "-2", default=None, type=str, metavar="/path/to/Condition2_rep1,/path/to/Condition2_rep2", - help="Comma separated list of NanopolishComp files for label 2. (required if --sample_yaml not given)") - parser_sc_sample_args.add_argument("--label1", type=str, metavar="Condition1", default="Condition1", - help="Label for files in --file_list1 (default: %(default)s)") - parser_sc_sample_args.add_argument("--label2", type=str, metavar="Condition2", default="Condition2", - help="Label for files in --file_list2 (default: %(default)s)") - parser_sc_io = parser_sc.add_argument_group('Input options') - parser_sc_io.add_argument("--fasta", "-f", type=str, required=True, - help="Fasta file used for mapping (required)") - parser_sc_io.add_argument("--bed", type=str, default=None, - help="BED file with annotation of transcriptome used for mapping (optional)") - parser_sc_filtering = parser_sc.add_argument_group('Transcript filtering options') - parser_sc_filtering.add_argument("--max_invalid_kmers_freq", type=float, default=0.1, - help="Max fequency of invalid kmers (default: %(default)s)") - parser_sc_filtering.add_argument("--min_coverage", type=int, default=30, - help="Minimum coverage required in each condition to do the comparison (default: %(default)s)") - parser_sc_filtering.add_argument("--downsample_high_coverage", type=int, default=5000, - help="Transcripts with high coverage will be downsampled (default: %(default)s)") - parser_sc_filtering.add_argument("--min_ref_length", type=int, default=100, - help="Minimum length of a reference transcript to include it in the analysis (default: %(default)s)") - parser_sc_testing = parser_sc.add_argument_group('Statistical testing options') - parser_sc_testing.add_argument("--comparison_methods", type=str, default="GMM,KS", - help="Comma separated list of comparison methods. Valid methods are: GMM,KS,TT,MW. 
(default: %(default)s)") - parser_sc_testing.add_argument("--sequence_context", type=int, default=0, choices=range(0,5), - help="Sequence context for combining p-values (default: %(default)s)") - parser_sc_testing.add_argument("--sequence_context_weights", type=str, default="uniform", choices=["uniform", "harmonic"], - help="Type of weights to use for combining p-values") - parser_sc_testing.add_argument("--pvalue_thr", type=float, default=0.05, + + # TODO: YAML input option still needed? + # parser_sc_sample_yaml = parser_sc.add_argument_group('YAML sample files', description="Option allowing to describe sample files in a YAML file") + # parser_sc_sample_yaml.add_argument("--sample_yaml", "-y", default=None, type=str, metavar="sample_yaml", + # help="YAML file containing the sample file labels. See formatting in documentation. (Required if --file_list1 and --file_list2 not given)") + + parser_sc_in = parser_sc.add_argument_group('Input options') + parser_sc_in.add_argument("--input", "-i", required=True, + help="Path to the input database, i.e. 'eventalign_collapse' output (required)") + parser_sc_in.add_argument("--fasta", "-f", required=True, + help="Fasta file used for mapping (required)") + parser_sc_in.add_argument("--bed", default=None, + help="BED file with annotation of transcriptome used for mapping (optional)") + parser_sc_in.add_argument("--samples1", "-1", required=True, metavar="C1,C2", + help="Comma-separated list of sample identifiers for condition 1 (e.g. control).") + parser_sc_in.add_argument("--samples2", "-2", required=True, metavar="T1,T2", + help="Comma-separated list of sample identifiers for condition 2 (e.g. treatment).") + # TODO: where are these labels used? + parser_sc_in.add_argument("--label1", metavar="Condition1", default="Control", + help="Label for condition 1 (default: %(default)s)") + parser_sc_in.add_argument("--label2", metavar="Condition2", default="Treatment", + help="Label for condition 2 (default: %(default)s)") + + parser_sc_out = parser_sc.add_argument_group("Output options") + parser_sc_out.add_argument("--output", "-o", default="sampcomp.db", + help="Path or filename of database output file (default: %(default)s)") + parser_sc_out.add_argument("--report", "-r", default="sampcomp.tsv", + help="Path or filename of report output file (default: %(default)s)") + + parser_sc_filter = parser_sc.add_argument_group("Transcript filtering options") + parser_sc_filter.add_argument("--max_invalid_kmers_freq", type=float, default=0.1, + help="Maximum fequency of invalid kmers (default: %(default)s)") + parser_sc_filter.add_argument("--min_coverage", type=int, default=30, + help="Minimum coverage required in each condition to perform the comparison (default: %(default)s)") + parser_sc_filter.add_argument("--downsample_high_coverage", type=int, default=5000, + help="Downsample transcripts with high coverage to this number of reads (default: %(default)s)") + parser_sc_filter.add_argument("--min_ref_length", type=int, default=100, + help="Minimum length of a reference transcript for inclusion in the analysis (default: %(default)s)") + + parser_sc_test = parser_sc.add_argument_group('Statistical testing options') + parser_sc_test.add_argument("--pvalue_threshold", "-p", type=float, default=0.05, help="Adjusted p-value threshold for reporting significant sites (default: %(default)s)") - parser_sc_testing.add_argument("--logit", action='store_true', - help="Use logistic regression testing downstream of GMM method. 
This is a legacy option and is now the deault.") - parser_sc_testing.add_argument("--anova", action='store_true', - help="Use Anova test downstream of GMM method (default: %(default)s)") - parser_sc_testing.add_argument("--allow_warnings", action='store_true', default=False, - help="If True runtime warnings during the ANOVA tests don't raise an error (default: %(default)s)") + parser_sc_test.add_argument("--univariate_test", choices=["KS", "MW", "ST", "none"], default="KS", + help="Univariate test for comparing kmer data between conditions. KS: Kolmogorov-Smirnov test, MW: Mann-Whitney test, ST: Student's t-test, none: no univariate test. (default: %(default)s)") + parser_sc_test.add_argument("--no_gmm", action="store_true", + help="Do not perform the GMM fit and subsequent test (see --gmm_test) (default: %(default)s)") + parser_sc_test.add_argument("--gmm_test", choices=["logit", "anova", "none"], default="logit", + help="Statistical test performed after GMM fitting (unless --no_gmm is used). (default: %(default)s)") + parser_sc_test.add_argument("--allow_warnings", action="store_true", + help="If True runtime warnings during the ANOVA tests (see --gmm_test) don't raise an error (default: %(default)s)") + parser_sc_test.add_argument("--sequence_context", type=int, default=0, choices=range(0,5), + help="Sequence context for combining p-values (default: %(default)s)") + parser_sc_test.add_argument("--sequence_context_weights", default="uniform", choices=["uniform", "harmonic"], + help="Type of position weighting to use for combining p-values (default: %(default)s)") + parser_sc_misc = parser_sc.add_argument_group('Other options') parser_sc_misc.add_argument("--nthreads", "-t", type=int, default=3, - help="Number of threads (default: %(default)s)") + help="Number of threads (default: %(default)s)") - # simreads subparser + # SimReads subparser parser_sr = subparsers.add_parser('simreads', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(""" Simulate reads as a NanopolishComp like file from a fasta file and an inbuild model\n @@ -98,9 +137,15 @@ def main(args=None): * Minimal example with alteration of model intensity loc parameter for 50% of the reads nanocompore simreads -f ref.fa -o results -n 50 --intensity_mod 2 --mod_reads_freq 0.5 --mod_bases_freq 0.2""")) parser_sr.set_defaults(func=simreads_main) - parser_sr_io = parser_sr.add_argument_group('Input options') - parser_sr_io.add_argument("--fasta", "-f", type=str, required=True, - help="Fasta file containing references to use to generate artificial reads") + + parser_sr_in = parser_sr.add_argument_group('Input options') + parser_sr_in.add_argument("--fasta", "-f", required=True, + help="FASTA file containing transcript sequences to use for artificial reads") + + parser_sr_out = parser_sr.add_argument_group("Output options") + parser_sr_out.add_argument("--output", "-o", default="out", + help="Prefix for output files (default: %(default)s)") + parser_sr_modify = parser_sr.add_argument_group('Signal modification options') parser_sr_modify.add_argument("--intensity_mod", type=float, default=0, help="Fraction of intensity distribution SD by which to modify the intensity distribution loc value (default: %(default)s)") @@ -110,66 +155,45 @@ def main(args=None): help="Frequency of reads to modify (default: %(default)s)") parser_sr_modify.add_argument("--mod_bases_freq", type=float, default=0.25, help="Frequency of bases to modify in each read (if possible) (default: %(default)s)") - 
parser_sr_modify.add_argument("--mod_bases_type", type=str, default="A", choices=["A","T","C","G"], + parser_sr_modify.add_argument("--mod_bases_type", default="A", choices=["A","T","C","G"], help="Base for which to modify the signal (default: %(default)s)") parser_sr_modify.add_argument("--mod_extend_context", type=int, default=2, help="number of adjacent base affected by the signal modification following an harmonic series (default: %(default)s)") parser_sr_modify.add_argument("--min_mod_dist", type=int, default=6, help="Minimal distance between 2 bases to modify (default: %(default)s)") parser_sr_misc = parser_sr.add_argument_group('Other options') - parser_sr_misc.add_argument("--run_type", type=str, default="RNA", choices=["RNA", "DNA"], + parser_sr_misc.add_argument("--run_type", default="RNA", choices=["RNA", "DNA"], help="Define the run type model to import (default: %(default)s)") parser_sr_misc.add_argument("--nreads_per_ref", "-n", type=int, default=100, help="Number of reads to generate per references (default: %(default)s)") parser_sr_misc.add_argument("--pos_rand_seed", type=int, default=42 , - help="Define a seed for randon position picking to get a deterministic behaviour (default: %(default)s)") + help="Define a seed for random position picking to get a deterministic behaviour (default: %(default)s)") parser_sr_misc.add_argument("--not_bound", action='store_true', default=False, help="Do not bind the values generated by the distributions to the observed min and max observed values from the model file (default: %(default)s)") - # Eventalign_collapse subparser - parser_ec = subparsers.add_parser("eventalign_collapse", formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent(""" - Collapse the nanopolish eventalign output at kmers level and compute kmer level statistics - * Minimal example - nanocompore eventalign_collapse -i nanopolish_eventalign.tsv -outprefix out\n""")) - parser_ec.set_defaults(func=eventalign_collapse_main) - parser_ec_io = parser_ec.add_argument_group("Input options") - parser_ec_io.add_argument("--eventalign", "-i", default=0, - help="Path to a nanopolish eventalign tsv output file, or a list of file, or a regex (can be gzipped). It can be ommited if piped to standard input (default: piped to stdin)") - parser_ec_io.add_argument("--sample_name", "-s", default=None, required=True, help="Unique identifier of the sample") - parser_ec_rp = parser_ec.add_argument_group("Run parameters options") - parser_ec_rp.add_argument("--n_lines", "-l", default=None , type=int , - help = "Number of lines to parse.(default: no limits") - parser_ec_misc = parser_ec.add_argument_group("Other options") - parser_ec_misc.add_argument("--nthreads", "-t", default=3, type=int, - help="Total number of threads. 2 threads are reserved for the reader and the writer (default: %(default)s)") - # Add common options for all parsers + for out_group in [parser_ec_out, parser_sc_out]: + out_group.add_argument("--outdir", "-d", default="", + help="Directory for output files. Will be preprended to --output if given. (default: %(default)s)") + out_group.add_argument("--overwrite", "-w", action="store_true", + help="Overwrite existing output files? 
(default: %(default)s)") for sp in [parser_sc, parser_sr, parser_ec]: - sp_output = sp.add_argument_group("Output options") - sp_output.add_argument("--outpath", "-o", type=str, default="./", - help="Path to the output folder (default: %(default)s)") - sp_output.add_argument("--outprefix", "-p", type=str, default="out", - help="text outprefix for all the files generated (default: %(default)s)") - sp_output.add_argument("--overwrite", "-w", action='store_true', default=False, - help="Use --outpath even if it exists already (default: %(default)s)") sp_verbosity = sp.add_argument_group("Verbosity options") - sp_verbosity.add_argument("--log_level", type=str, default="info", choices=["warning", "info", "debug"], - help="Set the log level (default: %(default)s)") - sp_verbosity.add_argument("--progress", default=False, action='store_true', - help="Display a progress bar during execution (default: %(default)s)") + sp_verbosity.add_argument("--log_level", default="info", choices=["warning", "info", "debug"], + help="Set the log level (default: %(default)s)") + sp_verbosity.add_argument("--progress", action="store_true", + help="Display a progress bar during execution (default: %(default)s)") - # Parse agrs and args = parser.parse_args() # Check if output folder already exists try: - mkdir(fn=args.outpath, exist_ok=args.overwrite) + mkdir(fn=args.outdir, exist_ok=True) except (NanocomporeError, FileExistsError) as E: - raise NanocomporeError("Could not create the output folder. Try using `--overwrite` option or use another directory") + raise NanocomporeError(f"Could not create the output folder: {args.outdir}") # Set logger - log_fn = os.path.join(args.outpath, args.outprefix+"_{}.log".format(vars(args)["subcommand"])) + log_fn = os.path.join(args.outdir, vars(args)["subcommand"] + ".log") set_logger(args.log_level, log_fn=log_fn) # Call relevant subfunction @@ -177,86 +201,88 @@ def main(args=None): #~~~~~~~~~~~~~~SUBCOMMAND FUNCTIONS~~~~~~~~~~~~~~# +def eventalign_collapse_main(args): + """""" + logger.warning("Running Eventalign_collapse") + + outpath = args.output + if args.outdir: + outpath = os.path.normpath(os.path.join(args.outdir, outpath)) + + # Init Eventalign_collapse + e = Eventalign_collapse(eventalign_fn = args.input, + sample_name = args.sample, + output_db_path = outpath, + overwrite = args.overwrite, + n_lines = args.n_lines, + nthreads = args.nthreads, + progress = args.progress) + + # Run eventalign_collapse + e() + def sampcomp_main(args): """""" logger.warning("Running SampComp") - # Load eventalign_fn_dict from a YAML file or assemble eventalign_fn_dict for the command line option - if args.sample_yaml: - eventalign_fn_dict = args.sample_yaml - elif args.file_list1 and args.file_list2: - eventalign_fn_dict = build_eventalign_fn_dict(args.file_list1, args.file_list2, args.label1, args.label2) - else: - raise NanocomporeError("Samples eventalign files have to be provided with either `--sample_yaml` or `--file_list1` and `--file_list2`") + outpath = args.output + if args.outdir: + outpath = os.path.normpath(os.path.join(args.outdir, outpath)) + + sample_dict = build_sample_dict(args.samples1, args.samples2, args.label1, args.label2) + + univar_test = args.univariate_test if args.univariate_test != "none" else None + gmm_test = args.gmm_test if args.gmm_test != "none" else None # Init SampComp - s = SampComp( - eventalign_fn_dict = eventalign_fn_dict, - max_invalid_kmers_freq = args.max_invalid_kmers_freq, - outpath = args.outpath, - outprefix = args.outprefix, - overwrite = 
args.overwrite, - fasta_fn = args.fasta, - bed_fn = args.bed, - nthreads = args.nthreads, - min_coverage = args.min_coverage, - min_ref_length = args.min_ref_length, - downsample_high_coverage = args.downsample_high_coverage, - comparison_methods = args.comparison_methods, - logit = True, - anova = args.anova, - allow_warnings = args.allow_warnings, - sequence_context = args.sequence_context, - sequence_context_weights = args.sequence_context_weights, - progress = args.progress) + s = SampComp(input_db_path = args.input, + output_db_path = outpath, + sample_dict = sample_dict, + fasta_fn = args.fasta, + overwrite = args.overwrite, + whitelist = None, + univariate_test = univar_test, + fit_gmm = not args.no_gmm, + gmm_test = gmm_test, + allow_anova_warnings = args.allow_warnings, + sequence_context = args.sequence_context, + sequence_context_weighting = args.sequence_context_weights, + min_coverage = args.min_coverage, + min_ref_length = args.min_ref_length, + downsample_high_coverage = args.downsample_high_coverage, + max_invalid_kmers_freq = args.max_invalid_kmers_freq, + nthreads = args.nthreads, + progress = args.progress) # Run SampComp - db = s() + s() # Save all reports - if(db): - db.save_all(pvalue_thr=args.pvalue_thr) + p = PostProcess(outpath, args.input, args.bed) + p.save_report(args.report) # TODO: update "save_all()" and call that instead + def simreads_main(args): """""" logger.warning("Running SimReads") # Run SimReads - SimReads( - fasta_fn = args.fasta, - outpath = args.outpath, - outprefix = args.outprefix, - overwrite = args.overwrite, - run_type = args.run_type, - nreads_per_ref = args.nreads_per_ref, - intensity_mod = args.intensity_mod, - dwell_mod = args.dwell_mod, - mod_reads_freq = args.mod_reads_freq, - mod_bases_freq = args.mod_bases_freq, - mod_bases_type = args.mod_bases_type, - mod_extend_context = args.mod_extend_context, - min_mod_dist = args.min_mod_dist, - pos_rand_seed = args.pos_rand_seed, - not_bound = args.not_bound, - progress = args.progress) - -def eventalign_collapse_main (args): - """""" - logger.warning("Running Eventalign_collapse") - - # Init Eventalign_collapse - e = Eventalign_collapse ( - eventalign_fn = args.eventalign, - sample_name = args.sample_name, - outpath = args.outpath, - outprefix = args.outprefix, - overwrite = args.overwrite, - n_lines = args.n_lines, - nthreads = args.nthreads, - progress = args.progress) - - # Run eventalign_collapse - e() + SimReads(fasta_fn = args.fasta, + outpath = args.outdir, + outprefix = args.output, + overwrite = args.overwrite, + run_type = args.run_type, + nreads_per_ref = args.nreads_per_ref, + intensity_mod = args.intensity_mod, + dwell_mod = args.dwell_mod, + mod_reads_freq = args.mod_reads_freq, + mod_bases_freq = args.mod_bases_freq, + mod_bases_type = args.mod_bases_type, + mod_extend_context = args.mod_extend_context, + min_mod_dist = args.min_mod_dist, + pos_rand_seed = args.pos_rand_seed, + not_bound = args.not_bound, + progress = args.progress) #~~~~~~~~~~~~~~CLI ENTRYPOINT~~~~~~~~~~~~~~# From e7ef11ed846b4e1ab47ac12b76f22e12cacbcea2 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 17 Aug 2021 10:56:14 +0100 Subject: [PATCH 44/49] main: fix PostProcess (TSV export) usage --- nanocompore/__main__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nanocompore/__main__.py b/nanocompore/__main__.py index 9bb46d3..3637ccf 100644 --- a/nanocompore/__main__.py +++ b/nanocompore/__main__.py @@ -19,6 +19,7 @@ from nanocompore.SampComp import SampComp from 
nanocompore.SimReads import SimReads from nanocompore.Eventalign_collapse import Eventalign_collapse +from nanocompore.PostProcess import PostProcess from nanocompore.common import * #~~~~~~~~~~~~~~MAIN PARSER ENTRY POINT~~~~~~~~~~~~~~# @@ -258,8 +259,12 @@ def sampcomp_main(args): s() # Save all reports + report_path = args.report + if args.outdir: + report_path = os.path.normpath(os.path.join(args.outdir, report_path)) + p = PostProcess(outpath, args.input, args.bed) - p.save_report(args.report) # TODO: update "save_all()" and call that instead + p.save_report(report_path) # TODO: update "save_all()" and call that instead def simreads_main(args): From f6d82ee13a1201b4bcf663616354db121d9fd419 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Tue, 17 Aug 2021 11:48:29 +0100 Subject: [PATCH 45/49] main: update CLI documentation (minimal examples), make report generation optional --- nanocompore/__main__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/nanocompore/__main__.py b/nanocompore/__main__.py index 3637ccf..4dcfcda 100644 --- a/nanocompore/__main__.py +++ b/nanocompore/__main__.py @@ -32,7 +32,7 @@ def main(): description=textwrap.dedent(""" nanocompore implements the following subcommands\n \t* eventalign_collapse : Collapse the nanopolish eventalign output at kmer level and compute kmer-level statistics\n - \t* sampcomp : Compare 2 samples and find significant signal differences\n + \t* sampcomp : Compare samples from two conditions and find significant signal differences\n \t* simreads : Simulate reads as a NanopolishComp-like file from a FASTA file and a built-in model""")) subparsers.required = True @@ -41,7 +41,7 @@ def main(): description=textwrap.dedent(""" Collapse the nanopolish eventalign output at kmer level and compute kmer-level statistics * Minimal example: - nanocompore eventalign_collapse -i nanopolish_eventalign.tsv -s T1 -o eventalign_collapse.db\n""")) + nanocompore eventalign_collapse -i nanopolish_eventalign.tsv -s T1""")) parser_ec.set_defaults(func=eventalign_collapse_main) parser_ec_in = parser_ec.add_argument_group("Input options") @@ -65,10 +65,8 @@ def main(): parser_sc = subparsers.add_parser('sampcomp', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(""" Compare 2 samples and find significant signal differences\n - * Minimal example with file_list arguments\n - nanocompore sampcomp -1 C1,C2 -2 T1,T2 -f ref.fa -o results - * Minimal example with sample YAML file\n - nanocompore sampcomp -y samples.yaml -f ref -o results""")) + * Minimal example:\n + nanocompore sampcomp -i eventalign_collapse.db -1 C1,C2 -2 T1,T2 -f ref.fa""")) parser_sc.set_defaults(func=sampcomp_main) # TODO: YAML input option still needed? 
@@ -259,6 +257,9 @@ def sampcomp_main(args): s() # Save all reports + if not args.report: + return + report_path = args.report if args.outdir: report_path = os.path.normpath(os.path.join(args.outdir, report_path)) From 75193d7e8965a08c9933ab134dbdedd283ad9a6e Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Wed, 25 Aug 2021 14:20:14 +0100 Subject: [PATCH 46/49] Eventalign_collapse: small optimization (input file reading) --- nanocompore/Eventalign_collapse.py | 35 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/nanocompore/Eventalign_collapse.py b/nanocompore/Eventalign_collapse.py index 7575616..6ad8439 100644 --- a/nanocompore/Eventalign_collapse.py +++ b/nanocompore/Eventalign_collapse.py @@ -80,7 +80,7 @@ def __init__(self, # Input file field selection typing and renaming self.__select_colnames = ["contig", "read_name", "position", "reference_kmer", "model_kmer", "event_length", "samples"] - self.__change_colnames = {"contig":"ref_id" ,"position":"ref_pos", "read_name":"read_id", "samples":"sample_list", "event_length":"dwell_time"} + self.__change_colnames = {"contig": "ref_id", "position": "ref_pos", "read_name": "read_id", "samples": "sample_list", "event_length": "dwell_time"} self.__cast_colnames = {"ref_pos":int, "dwell_time":np.float32, "sample_list":lambda x: [float(i) for i in x.split(",")]} @@ -143,38 +143,37 @@ def __split_reads(self, in_q, error_q): try: # Open input file with superParser + # TODO: benchmark performance compared to csv.DictReader (std. lib.) with SuperParser( fn = self.__eventalign_fn, select_colnames = self.__select_colnames, cast_colnames = self.__cast_colnames, change_colnames = self.__change_colnames, - n_lines=self.__n_lines) as sp: + n_lines = self.__n_lines) as sp: - for l in sp: - n_events+=1 - - # First event exception - if n_events==1: - cur_ref_id = l["ref_id"] - cur_read_id = l["read_id"] - event_l = [l] + # First line/event - initialise + l = next(iter(sp)) + # TODO: read ID should be unique, so no need to check transcript - correct? 
+ cur_read_id = l["read_id"]
+ event_l = [l]
+ n_events = 1

- # Same read/ref group = just append to current event group
- elif l["ref_id"] == cur_ref_id and l["read_id"] == cur_read_id:
+ # All following lines
+ for l in sp:
+ n_events += 1
+ # Same read = just append to current event group
+ if l["read_id"] == cur_read_id:
event_l.append(l)
- # If new read/ref group detected = enqueue previous event group and start new one
else:
- n_reads+=1
+ n_reads += 1
in_q.put(event_l)
-
- cur_ref_id = l["ref_id"]
cur_read_id = l["read_id"]
event_l = [l]

- # Last event line exception
+ # Last event/line
in_q.put(event_l)
- n_reads+=1
+ n_reads += 1

# Manage exceptions and add error trackback to error queue
except Exception:

From d967101bf51a3f8b32acb557e33e723f93e8c909 Mon Sep 17 00:00:00 2001
From: Hendrik Weisser
Date: Wed, 25 Aug 2021 14:21:07 +0100
Subject: [PATCH 47/49] SuperParser: add spaces (coding style)

---
 nanocompore/SuperParser.py | 40 +++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/nanocompore/SuperParser.py b/nanocompore/SuperParser.py
index ea8b528..e4cf46c 100644
--- a/nanocompore/SuperParser.py
+++ b/nanocompore/SuperParser.py
@@ -20,18 +20,20 @@ def __init__ (self,
 change_colnames={},
 cast_colnames={}):
 """
- Open a parser for field delimited file and return an iterator yield lines as namedtuples
- Transparently parse gziped and multiple file with the same header
+ Open a parser for field-delimited files and return an iterator yielding lines as dicts
+ Transparently parse gzipped files and multiple files with the same header
 * fn
 Path to a field delimited file or a list of files or a regex or a mix of everything (Can be gzipped)
 * select_colnames
- List of column names to use parse and return
+ List of column names to parse and return
 * sep
 field separator
 * comment
 skip any line starting with this string
+ * change_colnames
+ Dict with mapping (from: to) for changing column names
 * cast_colnames
- Dict corresponding to fields (based on colnames) to cast in a given python type
+ Dict corresponding to fields (based on colnames) to cast into a given python type
 """

 # Init logger and counter
@@ -43,7 +45,7 @@ def __init__ (self,
 self._n_lines = n_lines

 # Input file opening
- self.f_list = self._open_files (fn)
+ self.f_list = self._open_files(fn)

 # Define colnames based on file header. 
It needs to be the same for all the files to parse fn, fp = self.f_list[0] @@ -88,16 +90,16 @@ def __init__ (self, #~~~~~~~~~~~~~~MAGIC AND PROPERTY METHODS~~~~~~~~~~~~~~# - def __len__ (self): + def __len__(self): size = 0 for fn, fp in self.f_list: size+= int(os.path.getsize(fn)) return size-self._header_len - def __enter__ (self): + def __enter__(self): return self - def close (self): + def close(self): for i, j in self.counter.most_common(): logger.debug("{}: {}".format(i, j)) for fn, fp in self.f_list: @@ -110,26 +112,26 @@ def close (self): def __exit__(self, exception_type, exception_val, trace): self.close() - def __iter__ (self): + def __iter__(self): # Iterate over files for fn, fp in self.f_list: logger.debug("Starting to parse file {}".format(fn)) # Iterate over line in file for line in fp: - self.counter["Lines Parsed"]+=1 + self.counter["Lines Parsed"] += 1 if self._comment and line.startswith(self._comment): - self.counter["Comment lines skipped"]+=1 + self.counter["Comment lines skipped"] += 1 continue try: line = self._parse_line(line) - self.counter["Lines successfully parsed"]+=1 + self.counter["Lines successfully parsed"] += 1 yield line # early stopping condition if self._n_lines and self.counter["Lines successfully parsed"] == self._n_lines: return except (SuperParserError, TypeError) as E: - self.counter["Malformed or Invalid Lines"]+=1 + self.counter["Malformed or Invalid Lines"] += 1 logger.debug(E) logger.debug("File {}: Invalid line {}".format(fn, line)) logger.debug("End of file: {}".format(fn)) @@ -137,12 +139,12 @@ def __iter__ (self): #~~~~~~~~~~~~~~PRIVATE METHODS~~~~~~~~~~~~~~# - def _get_first_line_header (self, fp): + def _get_first_line_header(self, fp): header_line = next(fp) header_list = header_line.rstrip().split(self._sep) return header_list - def _parse_line (self, line): + def _parse_line(self, line): # Split line line = line.rstrip().split(self._sep) @@ -167,12 +169,12 @@ def _parse_line (self, line): # Return parsed line as a dict return line_d - def _open_files (self, fn_list): + def _open_files(self, fn_list): """Transparently open files, lists, regex, gzipped or not""" f_list = [] # Standard input - if fn_list is 0: + if fn_list == 0: fn = "stdin" fp = open(0) return [(fn,fp)] @@ -184,7 +186,7 @@ def _open_files (self, fn_list): if isinstance(fn_list, (list, tuple, set)): for fn_regex in fn_list: for fn in iglob(fn_regex): - self.counter["Input files"]+=1 + self.counter["Input files"] += 1 if fn.endswith(".gz"): logger.debug("Opening file {} in gzip mode".format(fn)) fp = gzip.open(fn, "rt") @@ -192,9 +194,7 @@ def _open_files (self, fn_list): logger.debug("Opening file {} in normal mode".format(fn)) fp = open(fn, "r") f_list.append((fn,fp)) - return f_list - else: raise SuperParserError ("Invalid file type") From 5980637ed78fa42814f5187237bbed69c5644003 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Thu, 26 Aug 2021 12:51:18 +0100 Subject: [PATCH 48/49] Whitelist: add to-do comment --- nanocompore/Whitelist.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nanocompore/Whitelist.py b/nanocompore/Whitelist.py index 70deb43..00e82d0 100755 --- a/nanocompore/Whitelist.py +++ b/nanocompore/Whitelist.py @@ -199,6 +199,7 @@ def __select_ref(self, min_ref_length, downsample_high_coverage): """Select ref_id with a minimal coverage in both sample + downsample if needed""" + # TODO: replace 'OrderedDict' with 'dict' for improved performance? 
valid_ref_reads = OrderedDict() c = Counter() with Fasta(self._fasta_fn) as fasta: @@ -222,7 +223,7 @@ def __select_ref(self, # If all valid add to new dict logger.trace(f"ref_id {ref_id} has enough coverage in all samples: keeping it") - valid_ref_reads [ref_id] = valid_dict + valid_ref_reads[ref_id] = valid_dict # Save extra info for debug c["valid_ref_id"] += 1 From 314bf1f573ea862a5e15be2937d351745622a782 Mon Sep 17 00:00:00 2001 From: Hendrik Weisser Date: Thu, 26 Aug 2021 12:55:04 +0100 Subject: [PATCH 49/49] DataStore: reduce file size of 'eventalign_collapse' output DB (by about 20%) --- nanocompore/DataStore.py | 63 +++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/nanocompore/DataStore.py b/nanocompore/DataStore.py index d986064..73547c9 100644 --- a/nanocompore/DataStore.py +++ b/nanocompore/DataStore.py @@ -5,7 +5,7 @@ import os import sqlite3 import contextlib -from itertools import zip_longest +from itertools import zip_longest, product # Third party from loguru import logger @@ -103,22 +103,33 @@ class DataStore_EventAlign(DataStore): "FOREIGN KEY(sampleid) REFERENCES samples(id)", "FOREIGN KEY(transcriptid) REFERENCES transcripts(id)"] + # "kmer_sequences" table: + table_def_kmer_seqs = ["id INTEGER NOT NULL PRIMARY KEY", + "sequence VARCHAR NOT NULL UNIQUE"] + + # "kmer_status" table: + table_def_kmer_status = ["id INTEGER NOT NULL PRIMARY KEY", + "status VARCHAR NOT NULL UNIQUE"] + # "kmers" table: + # TODO: is combination of "readid" and "position" unique per kmer? + # if so, use those as combined primary key (for access efficiency)? table_def_kmers = ["id INTEGER NOT NULL PRIMARY KEY", "readid INTEGER NOT NULL", "position INTEGER NOT NULL", - "sequence INTEGER NOT NULL", - "num_events INTEGER NOT NULL", - "num_signals INTEGER NOT NULL", - "status VARCHAR NOT NULL", + "sequenceid INTEGER", + # "sequence VARCHAR NOT NULL", + # "num_events INTEGER NOT NULL", + # "num_signals INTEGER NOT NULL", + "statusid INTEGER NOT NULL", "dwell_time REAL NOT NULL", - "NNNNN_dwell_time REAL NOT NULL", - "mismatch_dwell_time REAL NOT NULL", + # "NNNNN_dwell_time REAL NOT NULL", + # "mismatch_dwell_time REAL NOT NULL", "median REAL NOT NULL", "mad REAL NOT NULL", - "FOREIGN KEY(readid) REFERENCES reads(id)"] - # TODO: 'sequence' is stored redundantly - move it to a separate table - # TODO: encode 'status' as int to save space (foreign key referencing a table with all possible statuses) + "FOREIGN KEY(readid) REFERENCES reads(id)", + "FOREIGN KEY(sequenceid) REFERENCES kmer_sequences(id)", + "FOREIGN KEY(statusid) REFERENCES kmer_status(id)"] # "samples" table: table_def_samples = ["id INTEGER NOT NULL PRIMARY KEY", @@ -130,10 +141,32 @@ class DataStore_EventAlign(DataStore): "name VARCHAR NOT NULL UNIQUE"] table_defs = {"reads": table_def_reads, + "kmer_sequences": table_def_kmer_seqs, + "kmer_status": table_def_kmer_status, "kmers": table_def_kmers, "samples": table_def_samples, "transcripts": table_def_transcripts} + status_mapping = {"valid": 0, "NNNNN": 1, "mismatch": 2} + sequence_mapping = {} # filled by "__init__" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + ## set up mapping table for sequences: + self.sequence_mapping = {} + seq_prod = product(["A", "C", "G", "T"], repeat=5) + for i, seq in enumerate(seq_prod): + self.sequence_mapping["".join(seq)] = i + + def _init_db(self): + super()._init_db() + ## fill "kmer_status" and "kmer_sequences" tables: + self._cursor.executemany("INSERT INTO 
kmer_status VALUES (?, ?)", + [(i, x) for x, i in self.status_mapping.items()]) + self._cursor.executemany("INSERT INTO kmer_sequences VALUES (?, ?)", + [(i, x) for x, i in self.sequence_mapping.items()]) + self._connection.commit() + def store_read(self, read): """ Insert data corresponding to a read into the DB. @@ -168,10 +201,12 @@ def __store_kmer(self, kmer, read_id): """ res = kmer.get_results() # needed for 'median' and 'mad' values try: - self._cursor.execute("INSERT INTO kmers VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - (read_id, res["ref_pos"], res["ref_kmer"], res["num_events"], - res["num_signals"], res["status"], res["dwell_time"], - res["NNNNN_dwell_time"], res["mismatch_dwell_time"], res["median"], res["mad"])) + status_id = self.status_mapping[res["status"]] + # in case of unexpected kmer seq., this should give None (NULL in the DB): + seq_id = self.sequence_mapping.get(res["ref_kmer"]) + self._cursor.execute("INSERT INTO kmers VALUES(NULL, ?, ?, ?, ?, ?, ?, ?)", + (read_id, res["ref_pos"], seq_id, status_id, + res["dwell_time"], res["median"], res["mad"])) except Exception: logger.error("Error inserting kmer into database") raise Exception
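
Note on the storage pattern in this last patch: the ~20% size reduction comes from normalizing low-cardinality VARCHAR columns ('status' and 'sequence') into small lookup tables and storing only an integer foreign key per row, so each distinct string appears in the database exactly once. Below is a minimal, runnable sketch of the same idea against an illustrative, cut-down schema (the table layout and 'position' column are stand-ins, not the actual nanocompore tables):

    import sqlite3

    # Illustrative sketch of the lookup-table encoding, not nanocompore code:
    # a repeated string column is replaced by an integer foreign key.
    connection = sqlite3.connect(":memory:")
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE kmer_status (id INTEGER NOT NULL PRIMARY KEY, "
                   "status VARCHAR NOT NULL UNIQUE)")
    cursor.execute("CREATE TABLE kmers (id INTEGER NOT NULL PRIMARY KEY, "
                   "position INTEGER NOT NULL, statusid INTEGER NOT NULL, "
                   "FOREIGN KEY(statusid) REFERENCES kmer_status(id))")

    # Fill the lookup table once, mirroring 'status_mapping' above:
    status_mapping = {"valid": 0, "NNNNN": 1, "mismatch": 2}
    cursor.executemany("INSERT INTO kmer_status VALUES (?, ?)",
                       [(i, s) for s, i in status_mapping.items()])

    # Each data row stores a small integer instead of repeating the string:
    cursor.execute("INSERT INTO kmers VALUES (NULL, ?, ?)",
                   (101, status_mapping["mismatch"]))
    connection.commit()

    # A JOIN recovers the original string when reading the data back:
    row = cursor.execute("SELECT k.position, s.status FROM kmers k "
                         "JOIN kmer_status s ON k.statusid = s.id").fetchone()
    print(row)  # -> (101, 'mismatch')

The k-mer sequences use the same pattern; because every possible 5-mer over A/C/G/T can be enumerated up front (itertools.product with repeat=5, as in '__init__' above), each sequence gets a fixed ID between 0 and 1023, so the writer never needs to query the lookup table to encode a k-mer.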