From afac4f01a3fff5dfa0a3a54fc608fda1a0e610e9 Mon Sep 17 00:00:00 2001 From: Adam Taranto Date: Sat, 14 Sep 2024 20:22:45 +1000 Subject: [PATCH 1/3] Add methods for removing records from count table. --- src/lib.rs | 83 ++++++++++++++++++++++-- src/python/tests/test_remove.py | 108 +++++++++++++++++++++++++++----- 2 files changed, 171 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0c3afba..674e034 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -98,15 +98,86 @@ impl KmerCountTable { hash_keys.iter().map(|&key| self.get_hash(key)).collect() } - // TODO: Add method "drop" - // remove kmer from table + /// Drop a k-mer from the count table by its string representation + pub fn drop(&mut self, kmer: String) -> PyResult<()> { + // Ensure that the k-mer length matches the table's ksize + if kmer.len() as u8 != self.ksize { + // Return an error if the lengths do not match + Err(PyValueError::new_err( + "kmer size does not match count table ksize", + )) + } else { + // Compute the hash of the k-mer using the same method used for counting + let hashval = self.hash_kmer(kmer).unwrap(); + + // Attempt to remove the k-mer's hash from the counts HashMap + if self.counts.remove(&hashval).is_some() { + // If the k-mer was successfully removed, return Ok + debug!("K-mer with hashval {} removed from table", hashval); + Ok(()) + } else { + // If the k-mer was not found, return Ok without an error + debug!("K-mer with hashval {} not found in table", hashval); + Ok(()) + } + } + } - // TODO: Add method "drop_hash" - // remove hash from table + /// Drop a k-mer from the count table by its hash value + pub fn drop_hash(&mut self, hashval: u64) -> PyResult<()> { + // Attempt to remove the hash value from the counts HashMap + if self.counts.remove(&hashval).is_some() { + // If the hash value was successfully removed, log and return Ok + debug!("Hash value {} removed from table", hashval); + Ok(()) + } else { + // If the hash value was not found, log and return Ok without error + debug!("Hash value {} not found in table", hashval); + Ok(()) + } + } - // TODO: Add "mincut". Remove counts below a minimum cutoff. + /// Remove all k-mers with counts less than a given threshold + pub fn mincut(&mut self, min_count: u64) -> PyResult { + // Create a vector to store the keys (hashes) to be removed + let mut to_remove = Vec::new(); - // TODO: Add "maxcut". Remove counts above an maximum cutoff. + // Iterate over the HashMap and identify keys with counts less than the threshold + for (&hash, &count) in self.counts.iter() { + if count < min_count { + to_remove.push(hash); + } + } + + // Remove the identified keys from the counts HashMap + for &hash in &to_remove { + self.counts.remove(&hash); + } + + // Return the number of k-mers removed + Ok(to_remove.len() as u64) + } + + /// Remove all k-mers with counts greater than a given threshold + pub fn maxcut(&mut self, max_count: u64) -> PyResult { + // Create a vector to store the keys (hashes) to be removed + let mut to_remove = Vec::new(); + + // Iterate over the HashMap and identify keys with counts greater than the threshold + for (&hash, &count) in self.counts.iter() { + if count > max_count { + to_remove.push(hash); + } + } + + // Remove the identified keys from the counts HashMap + for &hash in &to_remove { + self.counts.remove(&hash); + } + + // Return the number of k-mers removed + Ok(to_remove.len() as u64) + } // TODO: Serialize the KmerCountTable instance to a JSON string. diff --git a/src/python/tests/test_remove.py b/src/python/tests/test_remove.py index b129063..d91267a 100644 --- a/src/python/tests/test_remove.py +++ b/src/python/tests/test_remove.py @@ -1,19 +1,99 @@ -import oxli import pytest -from test_basic import create_sample_kmer_table +import oxli + +@pytest.fixture +def setup_kmer_table(): + """Fixture to set up a KmerCountTable with ksize=4 and some initial k-mers""" + kct = oxli.KmerCountTable(ksize=4) + kct.count("AAAA") # Hash of canonical form will be used (AAAA) + kct.count("CCCC") # CCCC + kct.count("ATAT") # ATAT + kct.count("GGGG") # Should map to CCCC + kct.count("TTTT") # Should map to AAAA + kct.count("CCCC") # Increment count for CCCC/GGGG + # AAAA/TTTT = 2 + # ATAT = 1 + # CCCC/GGGG = 3 + return kct + +def test_drop(setup_kmer_table): + """ + Test the drop method to remove a k-mer by its string representation. + Edge case: Dropping a k-mer that doesn't exist. + """ + kct = setup_kmer_table + + # Drop "GGGG" which exists, and check it's removed + kct.drop("GGGG") + assert kct.get("GGGG") == 0, "Expected 'GGGG' to be removed." + + # Drop "AAAA", should remove both "AAAA" and "TTTT" (same canonical form) + kct.drop("AAAA") + assert kct.get("AAAA") == 0, "Expected 'AAAA' (and 'TTTT') to be removed." + + # Edge case: Drop a k-mer that doesn't exist, e.g., "GGGA" + kct.drop("GGGA") # "GGGA" not present in the table + assert kct.get("GGGA") == 0 # "GGGA" not present in the table + + # Raise error if kmer longer than table k len. + with pytest.raises(ValueError): + kct.drop("GGGAA") # "GGGAA" longer than table k + +def test_drop_hash(setup_kmer_table): + """ + Test the drop_hash method to remove a k-mer by its hash. + Edge case: Dropping a hash that doesn't exist. + """ + kct = setup_kmer_table + + # Drop by the hash for "CCCC", and check it's removed + hashval = kct.hash_kmer("CCCC") + kct.drop_hash(hashval) + assert kct.get_hash(hashval) == 0, "Expected 'CCCC' and 'GGGG' to be removed." + assert kct.get("CCCC") == 0, "Expected 'CCCC' to be removed." + assert kct.get("GGGG") == 0, "Expected 'GGGG' to be removed." + + # Edge case: Drop a hash that doesn't exist + non_existent_hash = 999999999 + kct.drop_hash(non_existent_hash) # Should not raise an error + assert kct.get_hash(non_existent_hash) == 0, "Expected non-existent hash removal to succeed." + +def test_mincut(setup_kmer_table): + """ + Test the mincut method to remove all k-mers with counts less than a given threshold. + Edge cases: Threshold is higher than all counts, no k-mers to remove. + """ + kct = setup_kmer_table -def test_drop(): - '''Remove kmer by name.''' - pass + # Set a threshold that only removes k-mers with counts < 2 + removed = kct.mincut(3) + assert removed == 2, "Expected 2 k-mers to be removed ('ATAT' and 'AAAA/TTTT')." + assert kct.get("GGGG") == 3, "Expected 'GGGG/CCCC' to remain." + + # Edge case: Threshold is higher than all k-mer counts (remove everything) + removed = kct.mincut(10) + assert removed == 1, "Expected all remaining k-mers to be removed ('GGGG/CCCC')." + assert len(kct.hashes) == 0, "Expected no k-mers left after removing all." -def test_drop_hash(): - '''Remove record by hash.''' - pass +def test_maxcut(setup_kmer_table): + """ + Test the maxcut method to remove all k-mers with counts greater than a given threshold. + Edge case: Threshold is lower than all counts, no k-mers to remove. + """ + kct = setup_kmer_table -def test_mincut(): - '''Remove all records with counts < threshold. ''' - pass + # Set a threshold that only removes k-mers with counts > 1 (GGGG) + removed = kct.maxcut(2) + assert removed == 1, "Expected 'CCCC/GGGG' to be removed." + assert kct.get("GGGG") == 0, "Expected 'CCCC/GGGG' to be removed." + assert kct.get("AAAA") == 2, "Should not remove kmers with exact maxcut value, only greater." -def test_maxcut(): - '''Remove all records with counts > threshold. ''' - pass + # Edge case: Threshold is higher than all k-mer counts (remove none) + removed = kct.maxcut(10) + assert removed == 0, "Expected no k-mers to be removed since all counts are < 10." + assert len(kct.hashes) == 2, "Expected 2 records with counts < 10 to remain in the table." + + # Edge case: Threshold is lower than all k-mer counts (remove all) + removed = kct.maxcut(0) + assert removed == 2, "Expected no k-mers to be removed since all counts are > 0." + assert len(kct.hashes) == 0, "Expected 0 records with counts < 1 to remain in the table." \ No newline at end of file From 0134c0953a0ec5d2d45408f4e6f7aad8dc3bee7b Mon Sep 17 00:00:00 2001 From: Adam Taranto Date: Sat, 14 Sep 2024 20:23:37 +1000 Subject: [PATCH 2/3] Ruff format --- src/python/tests/test_remove.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/python/tests/test_remove.py b/src/python/tests/test_remove.py index d91267a..88712e5 100644 --- a/src/python/tests/test_remove.py +++ b/src/python/tests/test_remove.py @@ -1,6 +1,7 @@ import pytest import oxli + @pytest.fixture def setup_kmer_table(): """Fixture to set up a KmerCountTable with ksize=4 and some initial k-mers""" @@ -16,6 +17,7 @@ def setup_kmer_table(): # CCCC/GGGG = 3 return kct + def test_drop(setup_kmer_table): """ Test the drop method to remove a k-mer by its string representation. @@ -33,12 +35,13 @@ def test_drop(setup_kmer_table): # Edge case: Drop a k-mer that doesn't exist, e.g., "GGGA" kct.drop("GGGA") # "GGGA" not present in the table - assert kct.get("GGGA") == 0 # "GGGA" not present in the table - + assert kct.get("GGGA") == 0 # "GGGA" not present in the table + # Raise error if kmer longer than table k len. with pytest.raises(ValueError): kct.drop("GGGAA") # "GGGAA" longer than table k + def test_drop_hash(setup_kmer_table): """ Test the drop_hash method to remove a k-mer by its hash. @@ -56,7 +59,10 @@ def test_drop_hash(setup_kmer_table): # Edge case: Drop a hash that doesn't exist non_existent_hash = 999999999 kct.drop_hash(non_existent_hash) # Should not raise an error - assert kct.get_hash(non_existent_hash) == 0, "Expected non-existent hash removal to succeed." + assert ( + kct.get_hash(non_existent_hash) == 0 + ), "Expected non-existent hash removal to succeed." + def test_mincut(setup_kmer_table): """ @@ -69,12 +75,13 @@ def test_mincut(setup_kmer_table): removed = kct.mincut(3) assert removed == 2, "Expected 2 k-mers to be removed ('ATAT' and 'AAAA/TTTT')." assert kct.get("GGGG") == 3, "Expected 'GGGG/CCCC' to remain." - + # Edge case: Threshold is higher than all k-mer counts (remove everything) removed = kct.mincut(10) assert removed == 1, "Expected all remaining k-mers to be removed ('GGGG/CCCC')." assert len(kct.hashes) == 0, "Expected no k-mers left after removing all." + def test_maxcut(setup_kmer_table): """ Test the maxcut method to remove all k-mers with counts greater than a given threshold. @@ -86,14 +93,20 @@ def test_maxcut(setup_kmer_table): removed = kct.maxcut(2) assert removed == 1, "Expected 'CCCC/GGGG' to be removed." assert kct.get("GGGG") == 0, "Expected 'CCCC/GGGG' to be removed." - assert kct.get("AAAA") == 2, "Should not remove kmers with exact maxcut value, only greater." + assert ( + kct.get("AAAA") == 2 + ), "Should not remove kmers with exact maxcut value, only greater." # Edge case: Threshold is higher than all k-mer counts (remove none) removed = kct.maxcut(10) assert removed == 0, "Expected no k-mers to be removed since all counts are < 10." - assert len(kct.hashes) == 2, "Expected 2 records with counts < 10 to remain in the table." - + assert ( + len(kct.hashes) == 2 + ), "Expected 2 records with counts < 10 to remain in the table." + # Edge case: Threshold is lower than all k-mer counts (remove all) removed = kct.maxcut(0) assert removed == 2, "Expected no k-mers to be removed since all counts are > 0." - assert len(kct.hashes) == 0, "Expected 0 records with counts < 1 to remain in the table." \ No newline at end of file + assert ( + len(kct.hashes) == 0 + ), "Expected 0 records with counts < 1 to remain in the table." From ab33957c929250b9262f7f3fd4f47cfdc4a88098 Mon Sep 17 00:00:00 2001 From: Adam Taranto Date: Wed, 18 Sep 2024 12:03:02 +1000 Subject: [PATCH 3/3] Don't check for correct kmer len in drop(). Let hash_kmer() do it. --- src/lib.rs | 29 ++++++++++------------------- src/python/tests/test_remove.py | 4 ---- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 674e034..41a640a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,26 +100,17 @@ impl KmerCountTable { /// Drop a k-mer from the count table by its string representation pub fn drop(&mut self, kmer: String) -> PyResult<()> { - // Ensure that the k-mer length matches the table's ksize - if kmer.len() as u8 != self.ksize { - // Return an error if the lengths do not match - Err(PyValueError::new_err( - "kmer size does not match count table ksize", - )) + // Compute the hash of the k-mer using the same method used for counting + let hashval = self.hash_kmer(kmer)?; + // Attempt to remove the k-mer's hash from the counts HashMap + if self.counts.remove(&hashval).is_some() { + // If the k-mer was successfully removed, return Ok + debug!("K-mer with hashval {} removed from table", hashval); + Ok(()) } else { - // Compute the hash of the k-mer using the same method used for counting - let hashval = self.hash_kmer(kmer).unwrap(); - - // Attempt to remove the k-mer's hash from the counts HashMap - if self.counts.remove(&hashval).is_some() { - // If the k-mer was successfully removed, return Ok - debug!("K-mer with hashval {} removed from table", hashval); - Ok(()) - } else { - // If the k-mer was not found, return Ok without an error - debug!("K-mer with hashval {} not found in table", hashval); - Ok(()) - } + // If the k-mer was not found, return Ok without an error + debug!("K-mer with hashval {} not found in table", hashval); + Ok(()) } } diff --git a/src/python/tests/test_remove.py b/src/python/tests/test_remove.py index 88712e5..3ae8b80 100644 --- a/src/python/tests/test_remove.py +++ b/src/python/tests/test_remove.py @@ -37,10 +37,6 @@ def test_drop(setup_kmer_table): kct.drop("GGGA") # "GGGA" not present in the table assert kct.get("GGGA") == 0 # "GGGA" not present in the table - # Raise error if kmer longer than table k len. - with pytest.raises(ValueError): - kct.drop("GGGAA") # "GGGAA" longer than table k - def test_drop_hash(setup_kmer_table): """