Skip to content

Commit

Permalink
WIP, added more tests taken from RDKit
Browse files Browse the repository at this point in the history
  • Loading branch information
JonasSchaub committed Dec 4, 2024
1 parent 42eb297 commit 757bffe
Show file tree
Hide file tree
Showing 2 changed files with 240 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,15 @@ private String getSmirksCode() {
}
}
//TODO implement tests from RECAP paper and RDKit
//TODO RDKit generate a mapping of SMILES code to hierarchy node to reduce the search space (deduplication)
// and be able to add more molecules and their fragments into the map (but not the hierarchy)!
//TODO make individual rules able to be turned off and on?
//TODO give option to add rules
//TODO check whether atoms, bonds, etc are copied or whether they are new
//TODO add methods that return meaningful fragment sets, not the whole hierarchy
//TODO how to ensure fragments appearing multiple times in the molecule appear multiple times in a deduplicated fragment set?
//TODO discard unused methods
//TODO track information which cleavage rules were applied?
/**
*
*/
Expand Down Expand Up @@ -272,12 +276,13 @@ private static final class State {
* match urea) (index 1), connected via a non-ring double bond to an
* aliphatic O (index 2), connected via a non-ring bond to N with a
* neutral charge and a degree of not 1, can be aliphatic or aromatic
* (index 3) -> reacts to C index 1 connected to O index 2 and to any
* other atom and N index 3 connected to any other atom -> note that the
* (index 3) -> reacts to C index 1 connected to O index 2 and to an R
* atom and N index 3 connected to an R atom -> note that the
* result is an aldehyde and an amine, not a carboxylic acid or ester
* and an amine -> note also that the atoms can potentially be in a
* ring, just not the bonds
* TODO: insert ";!$([#7][#0]);!$([#7]([#0])[#0])" was inserted to avoid matching pseudo atoms that resulted from a previous cleavage?
* TODO: insert ";!$([#7][#0]);!$([#7]([#0])[#0])" to avoid matching pseudo atoms that resulted from a previous cleavage?
* TODO: transform this into carboxy acid and amine? Right now, we get aldehyde and amine
*/
//private final CleavageRule amide = new CleavageRule("[C;!$(C([#7])[#7]):1](=!@[O:2])!@[#7;+0;!D1;!$([#7][#0]);!$([#7]([#0])[#0]):3]", "*[C:1]=[O:2].*[#7:3]", "Amide");
private final CleavageRule amide = new CleavageRule("[C;!$(C([#7])[#7]):1](=!@[O:2])!@[#7;+0;!D1:3]", "*[C:1]=[O:2].*[#7:3]", "Amide");
Expand Down Expand Up @@ -305,7 +310,8 @@ private static final class State {
* any sort of amines): [N;!D1](!@[*:1])!@[*:2]>>*[*:1].[*:2]*
* ";!#0" was added for the two any-atoms to avoid matching pseudo atoms that resulted from a previous cleavage
*/
private final CleavageRule amine = new CleavageRule("[N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*;!#0:1])-!@[*;!#0:2]", "*[*:1].[*:2]*", "Amine");
//TODO test this further
private final CleavageRule amine = new CleavageRule("[N;!D1;+0;!$(N-C=[#7,#8,#15,#16]);!$()](-!@[*;!#0:1])-!@[*;!#0:2]", "*[*:1].[*:2]*", "Amine");
/**
* 4 = Urea -> aliphatic or aromatic(!) N with a neutral charge and a
* degree of 2 or 3 (index 1), connected via a non-ring bond to an
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

class RECAPTest extends RECAP {
@Test
void test1() throws Exception {
void testMinimumFragmentSizeChanges() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("C1CC1Oc1ccccc1-c1ncc(OC)cc1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
Expand All @@ -66,6 +66,7 @@ void test1() throws Exception {
for (HierarchyNode childThirdLevel : childSecondLevel.getChildren()) {
i++;
System.out.println(i + ":\t\t\t " + smiGen.create(childThirdLevel.getStructure()));
Assertions.assertTrue(childThirdLevel.isTerminal());
}
}
}
Expand Down Expand Up @@ -93,6 +94,12 @@ void test1() throws Exception {
Assertions.assertTrue(node.getParents().isEmpty());
}

/**
* Corresponds to the example given in the doc of the RDKit implementation
* and their first test.
*
* @throws Exception
*/
@Test
void rdkitDocExampleTest() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
Expand All @@ -105,20 +112,23 @@ void rdkitDocExampleTest() throws Exception {
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);

Set<String> directChildrenSmilesSet = HashSet.newHashSet(node.getChildren().size());
for (HierarchyNode child : node.getChildren()) {
directChildrenSmilesSet.add(smiGen.create(child.getStructure()));
}
Assertions.assertEquals(4, directChildrenSmilesSet.size());
//corresponds to SMILES codes given in RDKit doc
Assertions.assertTrue(directChildrenSmilesSet.containsAll(List.of("*C1CC1", "*c1ncc(OC)cc1", "*c1ccccc1-c2ncc(OC)cc2", "*c1ccccc1OC2CC2")));

Set<String> allDescendantsSmilesSet = HashSet.newHashSet(node.getAllDescendants().size());
for (HierarchyNode child : node.getAllDescendants()) {
allDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
}
Assertions.assertEquals(5, allDescendantsSmilesSet.size());
//corresponds to SMILES codes given in RDKit doc
Assertions.assertTrue(allDescendantsSmilesSet.containsAll(List.of("*C1CC1", "*c1ncc(OC)cc1", "*c1ccccc1*", "*c1ccccc1-c2ncc(OC)cc2", "*c1ccccc1OC2CC2")));

Set<String> terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
Expand Down Expand Up @@ -149,4 +159,223 @@ void recapPaperExampleTest() throws Exception {
// but it is not described how this came to be
Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*c1ccc(F)cc1", "*CCC*", "*NC1CCN(*)CC1OC", "*C(=O)c1cc(Cl)c(N)cc1OC")));
}
/**
* Corresponds to RDKit implementation test 2.
*
* @throws Exception
*/
@Test
void testDipropylEther() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("CCCOCCC");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
// with the default minimum fragment size = 5, this molecule should not be cleaved
Assertions.assertTrue(node.getChildren().isEmpty());
node = recap.buildHierarchy(mol, 3);
SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
int i = 0;
System.out.println(i + ": " + smiGen.create(node.getStructure()));
for (HierarchyNode childFirstLevel : node.getChildren()) {
i++;
System.out.println(i + ":\t " + smiGen.create(childFirstLevel.getStructure()));
}
// now, it is cleaved into two times *CCC
Assertions.assertEquals(2, node.getChildren().size());
}
/**
*
*/
@Test
void test2PhenylPyridine() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("c1ccccc1-c1ncccc1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
Assertions.assertEquals(2, node.getChildren().size());
}
/**
*
*/
@Test
void testTriphenoxymethane() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("c1ccccc1OC(Oc1ccccc1)Oc1ccccc1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
Set<String> terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
}
Assertions.assertEquals(2, terminalDescendantsSmilesSet.size());
// major difference to RDKit implementation which filters also small
// non-terminal(!) alkyl fragments (so forbids *C(*)*); it reports
// fragments *c1ccccc1 and *C(*)Oc1ccccc1
Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*C(*)*", "*c1ccccc1")));
}
/**
*
*/
@Test
void test1Butylpiperidine() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("C1CCCCN1CCCC");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
//another difference from the RDKit implementation which filters all
// simple alkyls up to propyl (not butyl like in the RECAP paper!) if no
// minimum fragment size is defined (default set to 0)
Assertions.assertEquals(0, node.getChildren().size());
}
/**
*
*/
@Test
void testMinFragmentSize() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("CCCOCCC");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
//no cleavage with default minimum fragment size 5 carbons
Assertions.assertTrue(node.getChildren().isEmpty());
node = recap.buildHierarchy(mol, 3);
//but successful cleavage with min fragment size of 3
Assertions.assertEquals(2, node.getChildren().size());
SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
Assertions.assertEquals("*CCC", smiGen.create(node.getChildren().getFirst().getStructure()));
//one side only ethyl now
mol = smiPar.parseSmiles("CCOCCC");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
cycles.find(mol);
arom.apply(mol);
node = recap.buildHierarchy(mol, 3);
//no cleavage even with minimum fragment size 3 carbons because one
// fragment would be forbidden
Assertions.assertTrue(node.getChildren().isEmpty());
mol = smiPar.parseSmiles("CCCOCCOC");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
cycles.find(mol);
arom.apply(mol);
node = recap.buildHierarchy(mol, 2);
//one cleavage with minimum fragment size 2 carbons is possible now but
// the other ether stays intact, *CCC and *CCOC
Assertions.assertEquals(2, node.getChildren().size());
//all in agreement with RDKit
}
/**
*
*/
@Test
void noMatchTest() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("C1CC1C(=O)CC1OC1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
Assertions.assertTrue(node.getChildren().isEmpty());

mol = smiPar.parseSmiles("C1CCC(=O)NC1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
cycles.find(mol);
arom.apply(mol);
node = recap.buildHierarchy(mol);
Assertions.assertTrue(node.getChildren().isEmpty());
}
/**
*
*/
@Test
void testAmideRule() throws Exception {
SmilesParser smiPar = new SmilesParser(DefaultChemObjectBuilder.getInstance());
IAtomContainer mol = smiPar.parseSmiles("C1CC1C(=O)NC1OC1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
CycleFinder cycles = Cycles.cdkAromaticSet();
Aromaticity arom = new Aromaticity(ElectronDonation.cdk(), cycles);
cycles.find(mol);
arom.apply(mol);
RECAP recap = new RECAP();
HierarchyNode node = recap.buildHierarchy(mol);
SmilesGenerator smiGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
Set<String> terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
}
Assertions.assertEquals(2, terminalDescendantsSmilesSet.size());
Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*C(=O)C1CC1", "*NC1OC1")));

// N is now of degree three because of additional methyl group
//TODO the products also match the amine rule
mol = smiPar.parseSmiles("C1CC1C(=O)N(C)C1OC1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
cycles.find(mol);
arom.apply(mol);
node = recap.buildHierarchy(mol);
terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
}
Assertions.assertEquals(2, terminalDescendantsSmilesSet.size());
Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*C(=O)C1CC1", "*N(C)C1OC1")));

//TODO this also matches aromatic nitrogen to aliphatic carbon and cyclic amine, is this a problem?
mol = smiPar.parseSmiles("C1CC1C(=O)n1cccc1");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
cycles.find(mol);
arom.apply(mol);
node = recap.buildHierarchy(mol);
terminalDescendantsSmilesSet = HashSet.newHashSet(node.getOnlyTerminalDescendants().size());
for (HierarchyNode child : node.getOnlyTerminalDescendants()) {
terminalDescendantsSmilesSet.add(smiGen.create(child.getStructure()));
}
Assertions.assertEquals(2, terminalDescendantsSmilesSet.size());
Assertions.assertTrue(terminalDescendantsSmilesSet.containsAll(List.of("*C(=O)C1CC1", "*n1cccc1")));


mol = smiPar.parseSmiles("CC(=O)N");
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
cycles.find(mol);
arom.apply(mol);
node = recap.buildHierarchy(mol);
Assertions.assertTrue(node.getChildren().isEmpty());

//CC(=O)NC
//C(=O)NCCNC(=O)CC

// int i = 0;
// System.out.println(i + ": " + smiGen.create(node.getStructure()));
// for (HierarchyNode childFirstLevel : node.getChildren()) {
// i++;
// System.out.println(i + ":\t " + smiGen.create(childFirstLevel.getStructure()));
// }
}
}

0 comments on commit 757bffe

Please sign in to comment.