From 42eb297278b256ddf01c140fde8ebd3708f09d39 Mon Sep 17 00:00:00 2001
From: Jonas Schaub <44881147+JonasSchaub@users.noreply.github.com>
Date: Mon, 2 Dec 2024 16:41:53 +0100
Subject: [PATCH] WIP, adjusts the amine and cyclic amines cleavage rules so
 that they do not match cleavage results of previous rounds and lead to a
 forever loop; adds test structure described in RDKit doc and test structure
 from the RECAP paper, which helped elucidating the infinity loop bug

---
 .../model/fragmentation/algorithm/RECAP.java  | 38 ++++++++----
 .../fragmentation/algorithm/RECAPTest.java    | 62 ++++++++++++++++++-
 2 files changed, 86 insertions(+), 14 deletions(-)
diff --git a/src/main/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAP.java b/src/main/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAP.java
index 4012150b..47a7d177 100644
--- a/src/main/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAP.java
+++ b/src/main/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAP.java
@@ -127,12 +127,13 @@ private String getSmirksCode() {
     //TODO make individual rules able to be turned off and on?
     //TODO give option to add rules
     //TODO check whether atoms, bonds, etc are copied or whether they are new
-
+    //TODO add methods that return meaningful fragment sets, not the whole hierarchy
+    //TODO discard unused methods
     /**
      *
      */
     public RECAP() {
-
+        //TODO initialise default minimum fragment size of 5
     }
 
     /**
@@ -159,12 +160,12 @@ public List<IAtomContainer> fragment(IAtomContainer mol, boolean includeIntermed
         return includeIntermediates? state.applyTransformationsWithAllIntermediates(mol, minimumFragmentSize) : state.applyTransformationsSinglePass(mol, minimumFragmentSize);
     }
 
-    public HierarchyNode buildHierarchy(IAtomContainer mol) {
+    public HierarchyNode buildHierarchy(IAtomContainer mol) throws CDKException {
         State state = new State();
         return state.buildHierarchy(mol, 5);
     }
 
-    public HierarchyNode buildHierarchy(IAtomContainer mol, int minimumFragmentSize) {
+    public HierarchyNode buildHierarchy(IAtomContainer mol, int minimumFragmentSize) throws CDKException {
         State state = new State();
         return state.buildHierarchy(mol, minimumFragmentSize);
     }
@@ -260,6 +261,12 @@ private void collectAllDescendants(List<HierarchyNode> childrenList, boolean onl
      * final: Prevents the class from being extended.
      */
     private static final class State {
+        /*
+        * Notes on SMIRKS/SMARTS:
+        * - when using the any-atom "*", be aware that it also matches pseudo (R)
+        *   atoms; so think about avoiding this by adding ";!#0" in the any-atom
+        *   definition
+        * */
         /**
          * 1 = Amide -> aliphatic C that is NOT connected to two N (as not to
          * match urea) (index 1), connected via a non-ring double bond to an
@@ -270,7 +277,9 @@ private static final class State {
          * result is an aldehyde and an amine, not a carboxylic acid or ester
          * and an amine -> note also that the atoms can potentially be in a
          * ring, just not the bonds
+         * TODO: insert ";!$([#7][#0]);!$([#7]([#0])[#0])" was inserted to avoid matching pseudo atoms that resulted from a previous cleavage?
          */
+        //private final CleavageRule amide = new CleavageRule("[C;!$(C([#7])[#7]):1](=!@[O:2])!@[#7;+0;!D1;!$([#7][#0]);!$([#7]([#0])[#0]):3]", "*[C:1]=[O:2].*[#7:3]", "Amide");
         private final CleavageRule amide = new CleavageRule("[C;!$(C([#7])[#7]):1](=!@[O:2])!@[#7;+0;!D1:3]", "*[C:1]=[O:2].*[#7:3]", "Amide");
         /**
          * 2 = Ester -> aliphatic C (index 1), connected via a non-ring double
@@ -283,6 +292,7 @@ private static final class State {
          */
         private final CleavageRule ester = new CleavageRule("[C:1](=!@[O:2])!@[O;+0:3]", "*[C:1]=[O:2].[O:3]*", "Ester");
         //TODO does this also work for tertiary amines? I guess it matches multiple times?
+        //TODO this does not align with the RECAP paper definition of the amine cleavage rule, i.e. it does not match the example structure; see also additional rule "cyclic amines" below
         /**
          * 3 = Amine -> aliphatic N with a neutral charge and a degree of NOT 1
          * that is also NOT connected to a C connected via a double bond to N,
@@ -292,9 +302,10 @@ private static final class State {
          * 2, each connected to any other atom -> note that the amine / N is
          * discarded! -> note also that the atoms can potentially be in a ring,
          * just not the bonds -> simpler alternative would be (without excluding
-         * any sort of amides): [N;!D1](!@[*:1])!@[*:2]>>*[*:1].[*:2]*
+         * any sort of amines): [N;!D1](!@[*:1])!@[*:2]>>*[*:1].[*:2]*
+         * ";!#0" was added for the two any-atoms to avoid matching pseudo atoms that resulted from a previous cleavage
          */
-        private final CleavageRule amine = new CleavageRule("[N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*:1])-!@[*:2]", "*[*:1].[*:2]*", "Amine");
+        private final CleavageRule amine = new CleavageRule("[N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*;!#0:1])-!@[*;!#0:2]", "*[*:1].[*:2]*", "Amine");
         /**
          * 4 = Urea -> aliphatic or aromatic(!) N with a neutral charge and a
          * degree of 2 or 3 (index 1), connected via a non-ring bond to an
@@ -376,8 +387,9 @@ private static final class State {
          * bond to any atom (index 2) reacts to the N connected to any atom and
          * the other atom connected to any atom note that no assumption is made
          * as to how the structure was synthesized
+         * ";!#0" was inserted to avoid matching pseudo atoms that resulted from a previous cleavage
          */
-        private final CleavageRule cyclicAmines = new CleavageRule("[#7;R;D3;+0:1]-!@[*:2]", "*[#7:1].[*:2]*", "Cyclic amines");
+        private final CleavageRule cyclicAmines = new CleavageRule("[#7;R;D3;+0:1]-!@[*;!#0:2]", "*[#7:1].[*:2]*", "Cyclic amines");
         //TODO this is not part of the original RECAP, make it optional?
         /**
          * S2 = Aromatic nitrogen - aromatic carbon -> aromatic N with a neutral
@@ -398,13 +410,13 @@ private static final class State {
                 this.aromaticNitrogenToAromaticCarbon};
 
         /**
-         * 
+         *
          *
          * @param inputMol
          * @param minimumFragmentSize
          * @return
          */
-        private HierarchyNode buildHierarchy(IAtomContainer inputMol, int minimumFragmentSize) {
+        private HierarchyNode buildHierarchy(IAtomContainer inputMol, int minimumFragmentSize) throws CDKException {
             HierarchyNode inputMolNode = new HierarchyNode(inputMol);
             Queue<HierarchyNode> queue = new LinkedList<>();
             queue.add(inputMolNode);
@@ -413,8 +425,7 @@ private HierarchyNode buildHierarchy(IAtomContainer inputMol, int minimumFragmen
                 for (CleavageRule rule : this.cleavageRules) {
                     if (rule.getEductPattern().matches(currentNode.getStructure())) {
                         //mode unique returns as many products as there are splittable bonds, so one product for every bond split
-                        Iterable<IAtomContainer> products = rule.getTransformation().apply(currentNode.getStructure(), Transform.Mode.Unique);
-                        for (IAtomContainer product : products) {
+                        for (IAtomContainer product : rule.getTransformation().apply(currentNode.getStructure(), Transform.Mode.Unique)) {
                             List<IAtomContainer> parts = new ArrayList<>();
                             boolean containsForbiddenFragment = false;
                             for (IAtomContainer part : ConnectivityChecker.partitionIntoMolecules(product).atomContainers()) {
@@ -425,7 +436,10 @@ private HierarchyNode buildHierarchy(IAtomContainer inputMol, int minimumFragmen
                                 }
                             }
                             if (!containsForbiddenFragment) {
+                                //Logger.getLogger(RECAP.class.getName()).log(Level.INFO, "Transformation rule " + rule.getName() + " matched and produced valid fragments.");
+                                //Logger.getLogger(RECAP.class.getName()).log(Level.INFO, "Educt " + (new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols)).create(currentNode.getStructure()));
                                 for (IAtomContainer part : parts) {
+                                    //Logger.getLogger(RECAP.class.getName()).log(Level.INFO, "Product " + (new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols)).create(part));
                                     HierarchyNode newNode = new HierarchyNode(part);
                                     newNode.getParents().add(currentNode);
                                     currentNode.getChildren().add(newNode);
@@ -686,7 +700,7 @@ private boolean isFragmentForbidden(IAtomContainer mol, int minimumFragmentSize)
                 return false;
             }
         }
-
+        //TODO: in the CDK integration, use the corresponding methods of functionalgroupsfinder or put them in a central utility class
         /**
          * Checks whether the given atom is a pseudo atom. Very strict, any atom
          * whose atomic number is null or 0, whose symbol equals "R" or "*", or that
diff --git a/src/test/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAPTest.java b/src/test/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAPTest.java
index ab86ad97..7d8f98ba 100644
--- a/src/test/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAPTest.java
+++ b/src/test/java/de/unijena/cheminf/mortar/model/fragmentation/algorithm/RECAPTest.java
@@ -33,13 +33,15 @@
 import org.openscience.cdk.graph.CycleFinder;
 import org.openscience.cdk.graph.Cycles;
 import org.openscience.cdk.interfaces.IAtomContainer;
-import org.openscience.cdk.isomorphism.Transform;
 import org.openscience.cdk.smiles.SmiFlavor;
 import org.openscience.cdk.smiles.SmilesGenerator;
 import org.openscience.cdk.smiles.SmilesParser;
-import org.openscience.cdk.smirks.Smirks;
 import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
 
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 class RECAPTest extends RECAP {
     @Test
     void test1() throws Exception {
@@ -91,4 +93,60 @@ void test1() throws Exception {
         Assertions.assertTrue(node.getParents().isEmpty());
     }
 
+    @Test
+    void rdkitDocExampleTest() throws Exception {
+        SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
+        IAtomContainer mol = smiPar.parseSmiles("C1CC1Oc1ccccc1-c1ncc(OC)cc1");
+        AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
+        CycleFinder cycles = Cycles.cdkAromaticSet();
+        Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
+        cycles.find(mol);
+        arom.apply(mol);
+        RECAP recap = new RECAP();
+        HierarchyNode node = recap.buildHierarchy(mol);
+        SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
+        Set<String> directChildrenSmilesSet = HashSet.newHashSet(node.getChildren().size());
+        for (HierarchyNode child : node.getChildren()) {
+            directChildrenSmilesSet.add(smiGen.create(child.getStructure()));
+        }
+        Assertions.assertEquals(4, directChildrenSmilesSet.size());
+        //corresponds to SMILES codes given in RDKit doc
+        Assertions.assertTrue(directChildrenSmilesSet.containsAll(List.of("*C1CC1", "*c1ncc(OC)cc1", "*c1ccccc1-c2ncc(OC)cc2", "*c1ccccc1OC2CC2")));
+        Set<String> allDescendantsSmilesSet = HashSet.newHashSet(node.getAllDescendants().size());
+        for (HierarchyNode child : node.getAllDescendants()) {
+            allDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
+        }
+        Assertions.assertEquals(5, allDescendantsSmilesSet.size());
+        //corresponds to SMILES codes given in RDKit doc
+        Assertions.assertTrue(allDescendantsSmilesSet.containsAll(List.of("*C1CC1", "*c1ncc(OC)cc1", "*c1ccccc1*", "*c1ccccc1-c2ncc(OC)cc2", "*c1ccccc1OC2CC2")));
+        Set<String> terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
+        for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
+            terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
+        }
+        Assertions.assertEquals(3, terminalDescendantsSmilesSet.size());
+        //corresponds to SMILES codes given in RDKit doc
+        Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*C1CC1", "*c1ncc(OC)cc1", "*c1ccccc1*")));
+    }
+
+    @Test
+    void recapPaperExampleTest() throws Exception {
+        SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
+        IAtomContainer mol = smiPar.parseSmiles("FC1=CC=C(OCCCN2CCC(NC(C3=CC(Cl)=C(N)C=C3OC)=O)C(OC)C2)C=C1");
+        AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
+        CycleFinder cycles = Cycles.cdkAromaticSet();
+        Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
+        cycles.find(mol);
+        arom.apply(mol);
+        RECAP recap = new RECAP();
+        HierarchyNode node = recap.buildHierarchy(mol);
+        SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
+        Set<String> terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
+        for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
+            terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
+        }
+        Assertions.assertEquals(4, terminalDescendantsSmilesSet.size());
+        //corresponds to RECAP paper, except for the Fluorphenole described there,
+        // but it is not described how this came to be
+        Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*c1ccc(F)cc1", "*CCC*", "*NC1CCN(*)CC1OC", "*C(=O)c1cc(Cl)c(N)cc1OC")));
+    }
 }