diff --git a/.github/workflows/anaconda.yaml b/.github/workflows/anaconda.yaml new file mode 100644 index 0000000..f5d09ae --- /dev/null +++ b/.github/workflows/anaconda.yaml @@ -0,0 +1,54 @@ +name: Anaconda Build + +on: ['push', 'pull_request', 'workflow_dispatch'] + +jobs: + anaconda_build: + name: Anaconda Build + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Create build environment + uses: conda-incubator/setup-miniconda@v3 + with: + activate-environment: gc-meox-tms-build + auto-update-conda: true + environment-file: conda/environment-build.yaml + python-version: '3.8' + - name: Show conda config + shell: bash -l {0} + run: | + conda info + conda list + conda config --show-sources + conda config --show + conda env list + - name: Python info + shell: bash -l {0} + run: | + which python + python --version + - name: Show environment variables + shell: bash -l {0} + run: | + env | sort + - name: Build the conda package + shell: bash -l {0} + run: | + export BUILDDIR=$RUNNER_TEMP/gc-meox-tms/_build + [ "$RUNNER_OS" = "Windows" ] && export BUILDDIR=$RUNNER_TEMP\\gc-meox-tms\\_build + conda config --set anaconda_upload no + conda build --numpy 1.18.1 --no-include-recipe \ + --channel bioconda --channel conda-forge \ + --croot ${BUILDDIR} \ + ./conda + - name: Upload package artifact from build + uses: actions/upload-artifact@v4 + with: + name: conda-package-artifact + path: ${{ runner.temp }}/gc-meox-tms/_build + retention-days: 1 diff --git a/.github/workflows/project_automation.yml b/.github/workflows/project_automation.yml new file mode 100644 index 0000000..93766fb --- /dev/null +++ b/.github/workflows/project_automation.yml @@ -0,0 +1,11 @@ +name: Add issues to project + +on: + issues: + types: + - opened + +jobs: + call-workflow: + uses: recetox/galaxytools/.github/workflows/add_issue_to_project.yml@master + secrets: inherit diff --git a/.github/workflows/publish_pypi.yaml 
b/.github/workflows/publish_pypi.yaml new file mode 100644 index 0000000..bd77da4 --- /dev/null +++ b/.github/workflows/publish_pypi.yaml @@ -0,0 +1,26 @@ +name: Publish to PyPI + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + python -m build --sdist --wheel + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1.8 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/python-package-conda.yaml b/.github/workflows/python-package-conda.yaml new file mode 100644 index 0000000..fa27e35 --- /dev/null +++ b/.github/workflows/python-package-conda.yaml @@ -0,0 +1,32 @@ +name: Python Package using Conda + +on: ['push', 'pull_request', 'workflow_dispatch'] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Add conda to PATH + run: | + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + conda update conda + conda env update --file conda/environment-dev.yaml --name base + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml diff --git a/.github/workflows/python-package.yaml b/.github/workflows/python-package.yaml new file mode 100644 index 0000000..f28e3da --- /dev/null +++ b/.github/workflows/python-package.yaml @@ -0,0 +1,68 @@ +name: Python Package + +on: ['push', 'pull_request', 'workflow_dispatch'] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-dev.txt + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml + - uses: actions/upload-artifact@v4 + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' + with: + name: coverage-report + path: | + coverage.xml + xunit-result.xml + + sonar-cloud: + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Pull coverage report + uses: actions/download-artifact@v4 + with: + name: coverage-report + path: ${{ github.workspace }} + - name: Adjust coverage source path + run: sed -i "s+$PWD/++g" coverage.xml + - name: SonarCloud Scan + if: env.SONAR_TOKEN != null + uses: sonarsource/sonarcloud-github-action@master + with: + args: > + -Dsonar.projectKey=RECETOX_gc-meox-tms + -Dsonar.organization=recetox + -Dsonar.host.url=https://sonarcloud.io + -Dsonar.python.version="3.10" + -Dsonar.sources=gc_meox_tms/ + -Dsonar.tests=tests/ + -Dsonar.python.coverage.reportPaths=coverage.xml + -Dsonar.python.xunit.reportPath=xunit-result.xml + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..027ec31 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,28 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [1.0.1] + +### Added +* documented functions with docstrings to make use of Python's built-in `help()` function [#23](https://github.com/RECETOX/gc-meox-tms/pull/23) +* added developer documentation [#23](https://github.com/RECETOX/gc-meox-tms/pull/23) +* added `Publish to PyPi` GitHub Actions workflow [#24](https://github.com/RECETOX/gc-meox-tms/pull/24) + +## [1.0.0] + +### Added +* added **Anaconda build**, **Python Package with pip** (inc. **SonarCloud**), and **Python Package with Conda** GH Actions. +* added test coverage for main and IO functionality +* added conda dev environment and conda meta.yaml recipe +* added CHANGELOG.md + +### Changed +* changed package structure +* divided main functionality, IO handling, and CLI into designated modules + +### Fixed +* fixed not working examples in IPython notebook diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index cdd1af7..0000000 --- a/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM ubuntu:20.04 - -USER root -ENV DEBIAN_FRONTEND=noninteractive -ENV TZ=Europe/Prague - -RUN apt update -RUN apt install -y python3-rdkit -RUN apt install -y python3-notebook -RUN apt install -y python3-pil - -RUN apt install -y python3-pip -RUN pip3 install py3dmol - -RUN apt install -y git -RUN mkdir -p /usr/local/share/jupyter/nbextensions && cd /usr/local/share/jupyter/nbextensions && git clone https://github.com/lambdalisue/jupyter-vim-binding vim_binding && jupyter nbextension enable vim_binding/vim_binding --sys-prefix - -COPY dist/*.tar.gz /tmp -RUN pip3 install /tmp/gc-meox-tms*tar.gz - -ENV HOME=/work diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..32ad627 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 RECETOX + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile deleted file mode 100644 index a4925f2..0000000 --- a/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -image=ljocha/gc-derivatization -port=9000 - -package-build: - python3 -m build - -docker-build: - docker build -t ${image} . - -docker-run: - docker run -p ${port}:${port} -u $(shell id -u) -w /work -v ${PWD}:/work ${image} jupyter notebook --ip 0.0.0.0 --port ${port} - -docker-bash: - docker run -ti -p ${port}:${port} -u $(shell id -u) -w /work -v ${PWD}:/work ${image} bash - - - diff --git a/README-package.md b/README-package.md deleted file mode 100644 index 13cf17a..0000000 --- a/README-package.md +++ /dev/null @@ -1,22 +0,0 @@ -# In silico derivatization - -Package to perform in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010): - -* Metoxymation: ketone R(C=O)R' and aldehyde karboxyl groups are substituted with C=NO[CH3] -* Trimethylsilylation: in -OH, -SH, -NH2, -NHR, =NH, the hydrogen is substituted with -SiMe3 - -The substitution needn't happen always, their probability currently hardcoded in the package. 
-Typically, multiple substitution attempts are run on each input molecule, and all distinct results are gathered. - -Known limitation is metoxymation on cycles which should be broken. This is not implemented yet. - -Package provides functions: -* `is_derivatized()` checks whether the molecule contains MeOX or TMS groups that are likely to be result of derivatization -* `remove_derivatization_groups()` removes the suspected groups, reconstructing the original molecule -* `add_derivatization_groups()` does the substitution above - -All the functions can accept either `mol: rdkit.Chem.rdchem.Mol` or `smiles: str` argument. All return `rdkit.Chem.rdchem.Mol`. - -The typical useage is wrapped in the `gc-meox-tms.py` driver script. - -See also https://github.com/ljocha/gc-derivatization for example use in Jupyter notebook. diff --git a/README.md b/README.md index f614506..b089a74 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,137 @@ # In silico derivatization -The main functionality is provided as a python package, see [README-package.md](README-package.md) +## Overview -The example Jupyter notebook reads a list of SMILES (text file, one molecule per line), and performs the derivatisation, also inspecting its results. +This package performs in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010): -The final outputs are two files: +* Methoximation: ketone R(C=O)R' and aldehyde (-HC=O) carbonyl groups +are substituted with -C=NOCH3 +* Trimethylsilylation: the acidic hydrogen in -OH, -SH, -COOH, -NH2, -NHR, =NH is substituted with -Si(CH3)3 +The substitution doesn't always have to happen; its probability is currently hardcoded in the package. +Typically, multiple substitution attempts are run on each input molecule, and all distinct results are gathered. 
-* `derivs_struct.tsv` with columns (all SMILES): - * original - * with derivatization groups stripped - * column #2 derivatized (multiple times) according to the above rules -* `derivs_flat.txt` -- the above with all the smiles flattened, one per line +Known limitation is methoximation on cycles which should be broken. This is not implemented yet. +## Installation + +There are a few ways to install `gc-meox-tms`: + +1. Install in a new `conda` environment (recommended): +```shell +$ conda create -n gc-meox-tms -c bioconda gc-meox-tms +$ conda activate gc-meox-tms +``` + +2. Install from `pip`: +```shell +$ pip install gc-meox-tms +``` + +3. From source by cloning the repository and installing the package with `pip` as follows: +```shell +$ git clone https://github.com/RECETOX/gc-meox-tms.git + +# install the package: +$ python -m pip install ./gc-meox-tms + +# if you want to run examples in the Jupyter notebook, install with this command: +$ python -m pip install "./gc-meox-tms[eda]" +``` + +## Usage + +### Command-Line Tool + +`gc-meox-tms` can be used as a command line tool to produce all MeOX/TMS derivatives of given compounds. To use it via +the command line you will need one or more `txt` files with chemical compounds represented as SMILES +(one SMILES per line). The tool can output results in flat `txt` format (one compound per line) or tab-separated `tsv` +format (all derivatives of a given molecule per line). +```shell +$ python -m gc_meox_tms \ +-f <flat-output.txt> \ +-t <structured-output.tsv> \ +<input-smiles.txt> [<more-input-smiles.txt> ...] +``` +More parameters can be specified, such as number of cores or repeats. 
For more information run: +```shell +$ python -m gc_meox_tms --help +``` + +### Python Package + +Package provides functions: +* `is_derivatized()` checks whether the molecule contains MeOX or TMS groups that are likely to be the result of derivatization +* `remove_derivatization_groups()` removes the suspected groups, reconstructing the original molecule +* `add_derivatization_groups()` does the substitution above + +```python3 +from gc_meox_tms import add_derivatization_groups, is_derivatized, remove_derivatization_groups +from rdkit.Chem import MolToSmiles + +# Example compounds in SMILES format +compounds = ["CC=O", "CC=NOC", "CCO[Si](C)(C)C"] + +# Check derivatization +[is_derivatized(smiles=smiles) for smiles in compounds] +>>> [False, True, True] + +# Remove derivatization groups from derivatized molecules +underivatized = [remove_derivatization_groups(smiles=smiles) for smiles in compounds[1:]] +print([MolToSmiles(mol) for mol in underivatized]) +>>> ["CC=O", "CCO"] + +# Convert molecules back to derivatized forms +rederivatized = [add_derivatization_groups(mol=mol) for mol in underivatized] +print([MolToSmiles(mol) for mol in rederivatized]) +>>> ['CC=NOC', 'CCO[Si](C)(C)C'] +``` +Note that your results may differ from those presented since `add_derivatization_groups` is not deterministic. If you rerun +the function enough times you will get all possible derivatizations. The number of reruns to obtain all possible derivatives +is individual for each compound (depends on possible conversion degrees etc.). + +See also the Jupyter notebook in the `example/` directory for more examples. + +## Developer documentation + +### Installation +Create a virtual environment of your choice (e.g., conda or venv). +The development version can be installed with conda or pip as follows: +```shell +# 1. Fork and clone the repository +$ git clone https://github.com/<your-username>/gc-meox-tms.git +$ cd gc-meox-tms + +# 2a. 
To create a conda env run from the package directory: +$ conda env create -f conda/environment-dev.yaml +$ conda activate gc-meox-tms-dev + +# 2b. Alternatively, install using python venv: +$ python3 -m venv gc-meox-tms-dev +$ source gc-meox-tms-dev/bin/activate +$ pip install -e .[dev] +``` + +### Contributing +Before opening a PR make sure all the tests are passing by running `pytest` from within the package directory: +```shell +$ pytest +``` +It may happen that some tests which are dependent on probabilistic logic may fail. If that occurs, +try rerunning the tests. Usually one rerun is enough. + +We strongly advise you to add new tests for the functionality that you want to contribute. If you want to check whether +your changes are covered with tests, run `$ pytest --cov` and examine the output to see what parts may need better test coverage. + +Run linter, to make sure all is nicely formatted: +```shell +$ flake8 + +# if you use venv, exclude venv directory from linting +$ flake8 --exclude 'gc-meox-tms-dev' +``` + +Lastly make sure the Python imports are in a proper order: +```shell +$ isort gc_meox_tms +``` diff --git a/bin/gc-meox-tms.py b/bin/gc-meox-tms.py deleted file mode 100755 index 2f314f0..0000000 --- a/bin/gc-meox-tms.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import fileinput - -from gc_meox_tms import remove_derivatization_groups,add_derivatization_groups -from concurrent.futures import ProcessPoolExecutor -from rdkit import Chem - -def process_one_mol(n_mol): - mol,n = n_mol - return ( - mol[0], - Chem.MolToSmiles(remove_derivatization_groups(mol[1])), - { Chem.MolToSmiles(add_derivatization_groups(mol[1])) for _ in range(n) } - ) - -def doit(): - p = argparse.ArgumentParser() - p.add_argument('-n','--ncpu',type=int,action='store',help='# of cores to use',default=1) - p.add_argument('-r','--repeat',type=int,action='store',help='# of repeated attempts to derivatize (may return different results)',default=42) - 
p.add_argument('-k','--keep',action='store_true',help='keep input and stripped derivatization SMILES in output',default=True) - p.add_argument('-f','--flat',type=str,action='store',help='flat output file, one SMILES per line') - p.add_argument('-t','--tsv',type=str,action='store',help='structured output tsv file (original, stripped derivatization, added derivatizations') - p.add_argument('infiles',nargs='+',type=str,action='store',help='input files') - - opt = p.parse_args() - - insmi = list(filter(lambda p: p[1], [(line.rstrip(),Chem.MolFromSmiles(line)) for line in fileinput.input(files=opt.infiles)])) - n_mols = list(zip(insmi, [opt.repeat] * len(insmi))) - - with ProcessPoolExecutor(max_workers=opt.ncpu) as executor: - out = executor.map(process_one_mol,n_mols) - - if opt.flat: - with open(opt.flat,"w") as flat: - if opt.keep: - for orig,removed,added in out: - for one in { orig, removed, *added }: - flat.write(one + "\n") - else: - for orig,removed,added in out: - flat.write("\n".join(added) + "\n") - - if opt.tsv: - with open(opt.tsv,"w") as tsv: - tsv.write("orig\tderiv. removed\tderiv. 
added ...\n") - for orig,removed,added in out: - tsv.write("\t".join([orig,removed,*added]) + "\n") - - - -if __name__ == '__main__': - doit() diff --git a/conda/environment-build.yaml b/conda/environment-build.yaml new file mode 100644 index 0000000..070f4bc --- /dev/null +++ b/conda/environment-build.yaml @@ -0,0 +1,8 @@ +name: gc-meox-tms-build +channels: + - conda-forge + - defaults +dependencies: + - conda-build + - conda-verify + - python >=3.8 \ No newline at end of file diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml new file mode 100644 index 0000000..9fc8b89 --- /dev/null +++ b/conda/environment-dev.yaml @@ -0,0 +1,11 @@ +name: gc-meox-tms-dev +channels: + - conda-forge + - defaults +dependencies: + - python >=3.8 + - pip + - rdkit + - pip: + - -r ../requirements.txt + - -r ../requirements-dev.txt diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 0000000..b5595a0 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,41 @@ +{% set name = "gc-meox-tms" %} +{% set version = "1.0.1" %} + +package: + name: {{ name|lower }} + version: {{ version }} + +source: + path: .. + +build: + number: 0 + noarch: python + script: {{ PYTHON }} -m pip install . + +requirements: + host: + - python >=3.8 + - pip + run: + - python >=3.8 + - rdkit + +test: + imports: + - gc_meox_tms + +about: + home: https://github.com/RECETOX/{{ name }} + license: MIT + summary: In-silico MeOX/TMS derivatization of chemical compounds + description: This package performs in-silico methoximation (MeOX) and trimethylsilylation (TMS) of chemical compounds + from SMILES strings or RDKit molecule objects. It can also identify whether a given compound is already derivatized + by the MeOX or TMS method. 
+ dev_url: https://github.com/RECETOX/{{ name }} + +extra: + recipe-maintainers: + - RECETOX/conda + - hechth + - maximskorik \ No newline at end of file diff --git a/derivatization.ipynb b/derivatization.ipynb deleted file mode 100644 index e9e0ede..0000000 --- a/derivatization.ipynb +++ /dev/null @@ -1,425 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In-silico derivatization\n", - "\n", - "The notebook reads a list of SMILES (text file, one molecule per line), and performs in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010):\n", - "\n", - "* Metoxymation: ketone R(C=O)R' and aldehyde karboxyl groups are substituted with C=NO[CH3]\n", - "* Trimethylsilylation: in -OH, -SH, -NH2, -NHR, =NH, the hydrogen is substituted with -SiMe3\n", - "\n", - "The probability of all the substitutions can be adjusted, they needn't happen always. Multiple substitution attempts are run on each input molecule, and all distinct results are returned.\n", - "\n", - "Known limitation is metoxymation on cycles which should be broken. 
This is not implemented yet.\n", - "\n", - "The final outputs are two files:\n", - "\n", - "* `derivs_struct.tsv` with columns (all SMILES):\n", - " * original\n", - " * with derivatization groups stripped\n", - " * column #2 derivatized (multiple times) according to the above rules\n", - "* `derivs_flat.txt` -- the above with all the smiles flattened, one per line\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import what we need and setup the environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from rdkit import Chem\n", - "from rdkit.Chem import AllChem\n", - "#from rdkit.Chem.Draw import IPythonConsole\n", - "from copy import deepcopy\n", - "import random\n", - "\n", - "#IPythonConsole.drawOptions.addAtomIndices = True\n", - "#IPythonConsole.molSize = 200,200\n", - "\n", - "random.seed(42)\n", - "\n", - "import multiprocessing\n", - "from concurrent.futures import ProcessPoolExecutor\n", - "cpus = multiprocessing.cpu_count()\n", - "print('# cpus (including HT, typically): ', cpus)\n", - "\n", - "# don't run on HT cores, it just makes congestion\n", - "cpus //= 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import our payload\n", - "\n", - "from gc_meox_tms import is_derivatized, remove_derivatization_groups, add_derivatization_groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utility function for 3D rendering" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import py3Dmol\n", - "\n", - "def draw3d(m,dimensions=(500,300),p=None):\n", - " AllChem.EmbedMultipleConfs(m, clearConfs=True, numConfs=50)\n", - " opt = AllChem.MMFFOptimizeMoleculeConfs(m)\n", - " conf = min(range(len(opt)),key = lambda x: opt[x][1] if opt[x][0] == 0 else float(\"inf\") )\n", - " \n", - " mb = 
Chem.MolToMolBlock(m,confId=conf)\n", - " if p is None:\n", - " p = py3Dmol.view(width=dimensions[0],height=dimensions[1])\n", - " p.removeAllModels()\n", - " p.addModel(mb,'sdf')\n", - " p.setStyle({'stick':{}})\n", - " p.setBackgroundColor('0xeeeeee')\n", - " p.zoomTo()\n", - " return p.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Simple checks on manual inputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for s in ['CCC(=NOC)C', 'CCC=NOC', 'C=NOC', 'CSi(C)(C)C']:\n", - " print(s,is_derivatized(smiles='CCC(=NOC)C'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "remove_derivatization_groups(smiles='CCC(=N)C')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "m=Chem.MolFromSmiles('CCC=NOC')\n", - "remove_derivatization_groups(m)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "remove_derivatization_groups(smiles='C[Si](C)(C)OCCCO[Si](C)(C)C')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "m=remove_derivatization_groups(smiles='CON=CC(O)C=NOC')\n", - "m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "add_derivatization_groups(m)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read the input file\n", - "\n", - "The file is parsed line by line, errors are reported and ignored otherwise. 
\n", - "\n", - "The result is `mol[]`, a list of pairs (_original SMILES_, _RDKit molecule_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#smi_file='NIST_Si_100.txt'\n", - "#smi_file='NIST_Si_all.txt'\n", - "#smi_file='NIST_SMILES.txt'\n", - "smi_file='NIST_195_200.txt'\n", - "with open(smi_file) as f:\n", - " mols = list(filter(lambda p: p[1], [ (smi.rstrip(), Chem.MolFromSmiles(smi)) for smi in f ]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Essential statistics\n", - "\n", - "Count occurrences of (one-),di-,tri-methylsilane, TMS attached to -O, -N, -S, and methoximine. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SiMe1=Chem.MolFromSmarts('[Si][CH3]')\n", - "SiMe2=Chem.MolFromSmarts('[Si]([CH3])[CH3]')\n", - "SiMe3=Chem.MolFromSmarts('[Si]([CH3])([CH3])[CH3]')\n", - "ONSSi=Chem.MolFromSmarts('[O,N,S][Si]([CH3])([CH3])[CH3]')\n", - "\n", - "print('# total',len(mols))\n", - "with_sime1 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe1),mols))\n", - "print(\"# with SiMe:\", len(with_sime1))\n", - "with_sime2 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe2),mols))\n", - "print(\"# with SiMe2:\", len(with_sime2))\n", - "with_sime3 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe3),mols))\n", - "print(\"# with SiMe3:\", len(with_sime3))\n", - "with_onssi = list(filter(lambda m: m[1].HasSubstructMatch(ONSSi),mols))\n", - "print(\"# with ONSSi:\", len(with_onssi))\n", - "\n", - "MeOX=Chem.MolFromSmarts('C=NO[CH3]')\n", - "with_meox = list(filter(lambda m: m[1].HasSubstructMatch(MeOX),mols))\n", - "print(\"# with MeOX:\", len(with_meox))\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inspect whatever from the sorted categories" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "with_sime2[70][1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "draw3d(with_sime2[70][1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "draw3d(with_onssi[52][1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with_meox[4][1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('NIST_ONSSiMe3.txt','w') as f:\n", - " for m in with_onssi:\n", - " f.write(m[0]+'\\n')\n", - " \n", - "with open('NIST_SiMe3.txt','w') as f:\n", - " for m in with_sime3:\n", - " f.write(m[0]+'\\n')\n", - " \n", - "with open('NIST_MeOX.txt','w') as f:\n", - " for m in with_meox:\n", - " f.write(m[0]+'\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#test_smi='CCO[Si](C)(C)C'\n", - "#test_smi='C[Si](C)(C)OCC-N[Si](C)(C)C'\n", - "#test_m = Chem.MolFromSmiles(test_smi)\n", - "test_m = with_onssi[35][1]\n", - "Chem.AddHs(test_m)\n", - "test_m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_n = remove_derivatization_groups(test_m)\n", - "Chem.AddHs(test_n)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_d = add_derivatization_groups(test_n)\n", - "test_d" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "draw3d(test_d)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the in-silico derivatization\n", - "\n", - "Iterate over the `mol[]` list (read from file above), remove derivatization groups from each entry, and try derivatization several times to leverage from the probabilistic behaviour). 
Assemble the results.\n", - "\n", - "This can be time consuming, expect about 5,000 entries per minute per core. Memory consumption can also grow to several GB." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "def process_one_mol(mol):\n", - " return (\n", - " mol[0],\n", - " Chem.MolToSmiles(remove_derivatization_groups(mol[1])),\n", - " { Chem.MolToSmiles(add_derivatization_groups(mol[1])) for _ in range(42) }\n", - " )\n", - " \n", - "with ProcessPoolExecutor(max_workers=cpus) as executor:\n", - " out = executor.map(process_one_mol,mols)\n", - " \n", - "out = list(out)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Write the main outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('derivs_struct.tsv','w') as tsv:\n", - " tsv.write(\"orig\\tderiv. removed\\tderiv. added ...\\n\")\n", - " for orig,removed,added in out:\n", - " tsv.write(\"\\t\".join([orig,removed,*added]) + \"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('derivs_flat.txt','w') as flat:\n", - " for orig,removed,added in out:\n", - " for one in { orig, removed, *added }:\n", - " flat.write(one + \"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/example/NIST_195_200.txt b/example/NIST_195_200.txt new file mode 100644 index 
0000000..4f0e1f4 --- /dev/null +++ b/example/NIST_195_200.txt @@ -0,0 +1,4 @@ +CC(=O)N([Si](C)(C)C)[Si](C)(C)C +CCC(=NOC)C +CCC=NOC +C=NOC \ No newline at end of file diff --git a/example/README.md b/example/README.md new file mode 100644 index 0000000..dec97c3 --- /dev/null +++ b/example/README.md @@ -0,0 +1,13 @@ +# In silico derivatization example + +The example Jupyter notebook `derivatization.ipynb` reads a list of SMILES (text file, one molecule per line), and performs the derivatization, also inspecting its results. + +The final outputs are two files: + +* `derivs_struct.tsv` with columns (all SMILES): + * original + * with derivatization groups stripped + * column #2 derivatized (multiple times) according to the above rules +* `derivs_flat.txt` -- the above with all the smiles flattened, one per line + + diff --git a/example/derivatization.ipynb b/example/derivatization.ipynb new file mode 100644 index 0000000..2367adc --- /dev/null +++ b/example/derivatization.ipynb @@ -0,0 +1,703 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# In-silico derivatization\n", + "\n", + "The notebook reads a list of SMILES (text file, one molecule per line), and performs in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010):\n", + "\n", + "* Methoximation: ketone R(C=O)R' and aldehyde (-HC=O) carbonyl groups are substituted with -C=NOCH3\n", + "* Trimethylsilylation: the acidic hydrogen in -OH, -SH, -COOH, -NH2, -NHR, =NH is substituted with -Si(CH3)3\n", + "\n", + "The probability of all the substitutions can be adjusted, they needn't happen always. Multiple substitution attempts are run on each input molecule, and all distinct results are returned.\n", + "\n", + "Known limitation is methoximation on cycles which should be broken. 
This is not implemented yet.\n", + "\n", + "The final outputs are two files:\n", + "\n", + "* `derivs_struct.tsv` with columns (all SMILES):\n", + " * original\n", + " * with derivatization groups stripped\n", + " * column #2 derivatized (multiple times) according to the above rules\n", + "* `derivs_flat.txt` – the above with all the smiles flattened, one per line\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Import what we need and setup the environment" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# cpus (including HT, typically): 24\n" + ] + } + ], + "source": [ + "from rdkit import Chem\n", + "from rdkit.Chem.Draw.IPythonConsole import ShowMols\n", + "import random\n", + "\n", + "random.seed(42)\n", + "\n", + "import multiprocessing\n", + "from concurrent.futures import ProcessPoolExecutor\n", + "cpus = multiprocessing.cpu_count()\n", + "print('# cpus (including HT, typically): ', cpus)\n", + "\n", + "# don't run on HT cores, it just makes congestion\n", + "cpus //= 2" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# import our payload\n", + "try:\n", + " import py3Dmol\n", + "except ImportError as e:\n", + " !pip install py3dmol\n", + "\n", + "from gc_meox_tms import is_derivatized, remove_derivatization_groups, add_derivatization_groups, process_one_mol\n", + "from plotting import draw3d" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Simple checks on manual inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "CCC(=NOC)C True\n", + "CCC=NOC True\n", + "C=NOC False\n", + "C[Si](C)(C)C False\n" + ] + } + ], + "source": [ + "for s in ['CCC(=NOC)C', 'CCC=NOC', 'C=NOC', 'C[Si](C)(C)C']:\n", + " print(s, is_derivatized(smiles=s))" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAIAAAC7/QjhAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3deVRTZ/oH8DcQtsi+KZEISlgEioSlKiKbWBSXyqmUxa2rdjvaQ2trT6el7Zy2Om0tTttpHdvjuLQu9Weto9IpuIKoCLIoioKyFhCVRQQM2/398XbucNgEJblJ3u/n+Ie53PvmSXhyv+HNvTcijuMIAAAAq/SELgAAAEBICEIAAGAaghAAAJiGIAQAAKYhCAEAgGkIQgAAYBqCEAAAmIYgBAAApiEIAQCAaQhCAABgGoIQAACYhiAEAACmIQgBAIBpCEIAAGAaghAAAJiGIAQAAKYhCAEAgGkIQgAAYBqCEAAAmIYgBAAApiEIAQCAaQhCAABgGoIQAACYhiAEAACmIQgBAIBpCEIAAGAaghAAAJiGIAQAAKYhCAEAgGkIQgAAYJpOBWFTU9P27dszMjKELkT3cRz3xx9/CF2FmqCv1IapvgLNoTtBuG7dOjs7uxUrVoSEhPj4+Fy/fl3oinRWbm7uzJkzQ0NDlUql0LWoHPpKbZjqK9AsnPbLz88fP348fTj6+vr0P3p6eikpKV1dXUJXp1P++OOPFStWiEQiQoiDg8PFixeFrkiF0Fdqw1RfgQbS7iC8c+fO6tWr6U5KJBLFxMQolcpDhw7Z2trS3dbkyZN/++03ocvUBUqlMiUlxdzcnBBiYGCwevXq5uZmoYtSFfSV2jDVV6CxtDUIOzo6Nm/eTHdMBgYGK1eurKqq6r3Cvn37Jk6cSHdb8+fPv3HjhlCl6oCDBw9OmjSJfzJLS0uFrkhV0FfqxE5fgYbTyiBMS0vz8vKir5/IyMjBJlLom00zMzNCiKGh4erVq+/evavmUrXdlStX5syZQ59qDw+PI0eOCF2RCqGv1IapvgLNp2VBeO3atfnz59PXj6ur6969ex+4yR9//LFy5Uo9PT1CiFQq3bx5c3d3txpK1XZ0elAsFhNCrK2tU1JSOjs7hS5KVdBXasNUX4G20JogbGxsfPvtt42MjAghpqamycnJ9+/fH/7m2dnZQUFBdE8XEBBw+vRp1ZWq7To7O/npQbFYvHLlyvr6eqGLUhX0ldow1VegXbQgCLu7u7dt2zZ27Fh6zN6yZcvq6uoeYpyenp69e/dOmDCBHgERGxtbUVEx6tVqu/T0dG9vb7pnnzVrVmFhodAVqQr6Sp3Y6SvQRpoehCdOnPD19aWvn9DQ0Ly8vEcc8N69e8nJycbGxoSQMWPGJCcnt7e3j0qp2q6kpCQ2NpY+1XK5fDjTg9oLfaU2TPUVaCnNDcKqqqply5bRU4scHR23bdvW09MzWoNXVlYuW7aMvjhlMtm2bdtGa2RtRHfidHqQ7sRHND2oXdBXasNUX4FW08QgbG1tTU5ONjExIYRIJJLk5OS2tjZV3NGxY8d8fHzobis8PLygoEAV96LJek8PikSiZcuW1dbWCl2UqqCv1IapvgIdoFlBSD9ucXJy4j9uKS8vV+k90lesvb09/0HRzZs3VXqPmuPcuXPTpk2j++vHH3/8zJkzQlekKugrdWKnr0
BnaFAQ5uTkzJgxg75+/P39MzIy1HbXDQ0Nb7/9tqGhISHE0tJy/fr1SqVSbfeuftXV1fz04Pjx40d3elDToK/Uhqm+Al2iEUFYU1PDn5Ll4OAg1ClZxcXF8+bNo3tMNze3Q4cOqb8GVWttbV2/fr2pqSkhxMTE5O23325paRG6KFVBX6kNU30FukfgINTAKw2mpaV5enrylxcpKioStp5RdPDgQWdnZ/rQ5s+fX1ZWJnRFqoK+Uid2+gp0lZBBqLFXGuzo6EhJSbGwsOB3o01NTUIX9UjoF9zQp9rPz+/UqVNCV6RC6Cu1YaqvQIcJE4RacaXB27dv819BYGNjo6VfvqMbj2KY0FdqoxuPAoBSdxBq3ZUGL1y4wL/nVSgUJ0+eFLqi4dK9vz+GgL5SG6b6ChihviDU6isN9vkURPO/fCctLW3y5Mk6+YlUH+grdWKnr4ApagrC9PT0xx57jL5+tPRKg21tbX2Oi9PML98pLi6Ojo6mT7W7u/vhw4eFrkiF0Fdqw1RfAWtUHoQ6dqVBTT5TqvdZa1ZWVrp91hr6Sm2Y6itgkwqDUIevNHju3Lnp06fTvXBgYGBWVpaw9dDrmNjZ2ZH/XsdEi6YHRwp9pTZM9RWwTCVBSF8/48aNI7p7pcGenh4NeYxHjx5l5MqW6Ct1YqevAEY/CJm60qCwX75TWlrKTw/q/HcdoK/Udu9M9RUAN7pBqMmfc6iU+j+vYurb79BX6CsAlRqdIMSVBjmOO3r0KH8EY0REhIqmkvpMncXGxlZWVqrijjQB+opDXwGo3igEIa40yKPntNGDC1RxTlt2drZGHUyhUugrHvoKQKUeKQhxpcEBNTQ08Fc5sbKyGpWrnPSeHpRKpbo9PYi+GhD6CkBFHjIIcaXBB7py5crcuXMf/QRkesK1mZmZhp9wPSrQVw+EvgIYdSMOQlxpcEQOHjzo4uJCd1uRkZGXL18e6eYTJ07kpwc1/xJcDw19NSLoK4BRNLIg1OHvVFMduosf6XfjXbhwISQkhD7V2nVR5oeAvnoI6CuA0TLcIMSVBh8R/bZ0Oulna2s7xKQfU9OD6KtHhL4CeHQPDkJcaXAU5ebmBgcH8+/H+xwGQr9XnZ4tIBaLdXt6EH01itBXAI9iqCDElQZVxMTEhPzX2LFjMzIyOI7bsGEDPZGZWrFihdBlqgr6SkUY7yuAhzZoEF69etXb25u+eGbNmnXx4kV1lqXb6KF6a9as4XdPEomE/sfS0vKFF14ghCQlJQldpkqgr1SH5b4CeBR6ZBCOjo53796lVxpMT0/nd14wWj766KMzZ844OTkRQtra2kQi0fz586uqqjw8PIQuTYXQV6rGZl8BPArxYD+QSCSHDx92dXWl33cDqjBt2rTy8vKtW7fW19fHxMS4ubkJXZHKoa/UgMG+AngUgwYhIQTv1tXj2WefFboEtUJfqQdrfQXw0AadGgUAAGABghAAAJiGIAQAAKYhCAEAgGkIQgAAYBqCEAAAmIYgBAAApiEIAQCAaQhCAABgGoIQAACYhiAEAACmIQgBAIBpCEIAAGAaghAAAJiGIAQAAKYhCAEAgGkIQgAAYBqCEAAAmIYgBAAApiEIAQCAaQhCAABgGoIQAACYhiAEAACmIQgBAIBpYqELYJG1tauBQatINPC7EAMDc2trdyMjazVXBdoOfQXwcBCEAmhoyG1pIRw38E87O19saHhRqVRvTaD90FcADwdTowAAwDQEIQAAMA1BCAAATEMQAgAA0xCEAADANAQhAAAwDUEIAABMQxACAADTEIQAAMA0BCEAADANQQgAAExDEAIAANMQhAAAwDQEIQAAMA1BCAAATEMQAgAA0xCEAADANAQhAAAwDUEIAABMQxACAADTEIQAAMA0BCEAADANQQgAAExDEAIAANMQhAAAwDQEIQAAMA1BCAAATEMQAgAA0xCEAADANLHQBbBo1y7S2UkkkoF/umgRcXEhcrl6awLth74CeDgijuOErg
EAAEAw+ItQrerryYEDhBCip0dWrCAGBgOv9n//R+7cIdbWZPHiP5eUl5PjxwkhJDSUTJo06Ph5eSQ/nxBCFi0iVlajWTloMvQVwCPhQI3OnuUI+fPf+vWDrubryxHC+fj8b8nevX9utXPnUOP/5S9/rlZYOGo1g+ZDXwE8ChwsI5iPPiJlZUIXAToHfQUwUghCwbS1kVdfFboI0DnoK4CRQhAKIziYEEJSU8n+/UKXAjoEfQXwEBCEwnj+eeLpSQghr71GmpuFrgZ0BfoK4CEgCIWhr0/WryeEkNpa8sEHAhcDOgN9BfAQEISCWbCAzJ1LCCFffUXy8oSuBnQF+gpgpBCEQtq4kRgYkO5u8tJLpKdH6GpAV6CvAEYEQSgkDw/y+uuEEJKdTb77blibPP88MTUd9N+GDSqtF7QD+gpgRBCEAvvgA+LkRAgh77xDamoevL5SSVpbB/3X2anqekE7oK8Ahg+XWBOYREK++IIsXkzu3iXr1pHt2x+w/tq1JCxs0J/u2EF27x7V+kA7oa8Ahg9BKLynniLR0eTIEbJzJ1m58s9TwQYzZQqJjh70p2fOjHp1oK3QVwDDhKlRjbBpEzE2JhxHXn8dRzfAqEFfAQwHglAjyOXkrbcIISQ3l+zaJXQ1oCvQVwDDgSDUFO+8Q1xcCCHkL38hSqXQ1YCuQF8BPBCCUFMYG5NvviGEkPJycuWK0NWArkBfATwQglCDREWRmBihiwCdg74CGBqCULNs2kRMTYUuAnQO+gpgCAhCzSKTkXffFboI0DnoK4Ah4DxCtZLJ/vxyAD+/QddJSiJiMenuJnZ2/1vo60s2biSEEH//ocaPjibW1oQQIpWOQrWgLdBXAI9CxHGc0DUAAAAIBlOjAADANAQhAAAwDUEIAABMQxACAADTEIQAAMA0BCEAADANQQgAAExDEAIAANMQhAAAwDQEIQAAMA1BCAAATEMQAgAA0xCEAADANAQhAAAwDUEIAABMQxACAADTEIQAAMA0BCEAADANQQgAAExDEAIAANMQhAAAwDQEIQAAMA1BCAAATEMQAgAA0xCEAADANAQhAAAwDUEIAABMQxACAADTEIQAAMA0BCEAADANQQgAAEwTC13A/5SVlRUWFnZ2drq4uEyZMkVPb+CQvnz5MiHE09NziKHOnz9vb2/v5OQ00hoSEhKam5uPHDky0g0f6I033jh69Gh+fv6ojJadnT116tTi4mJ3d/dRGVCjNDU1FRYW9l/u7Ow8YcIE9dcjIGNj488///y1114b5vp5eXkWFhaTJk1SaVUAOkYjgrChoeGZZ57597//bWFhIZFIamtrPT09d+zY4efn13/ld999VyQS7d+/f4gB4+Lili9f/sEHH6iqYlCl/Pz8hQsX0v+3tLQYGxsbGBgQQt577721a9cKWpqme/7550NCQlJSUoQuBECbCB+EPT09MTExN27cOH36dFBQECGksrJy+fLlTzzxRGFhoVQq7bP+rl27HjjmxYsX6a4TtFFYWNjdu3cJIV1dXQYGBhs3bnz55ZeFLko7ZGZm6uvrC10FgJYRPgjT0tJOnTqVmppKU5AQMmHChH379k2cODElJeVvf/vb0aNH5XJ5fX398ePHQ0JClEolISQ0NJSuXFVVlZmZWVNT4+zsrFQqFQrF5MmT09LSJk2a5OPjk5GRMXbsWAsLiwMHDrS1tYWFhSkUCrrh7du3f//99+rqaltb2+jo6HHjxg1RZGdn54EDB+bNm5eXl5eVlWVhYbF48WJra+usrKysrCxra+vY2FgzMzO6cnt7e1paWklJiVQqDQsLc3Bw6D9gSUlJRkbGvXv3/Pz8goOD+eW5ubm5ubktLS3h4eF+fn6lpaUFBQVPPfUU/Wl1dfWZM2eefPLJPqPdvXs3LS2toqLCyclpzpw5Y8aMGelvQVvs378/PDz83Llzly5dWrhwYV1dnZ6eHv8Echy3f//+wMDAPjOoFRUVpaWlgYGBP/74I8dxr7
zyCiGkrq7uxIkTNTU17u7uUVFRYrG4qqrq8uXLERERv/zyS1lZmVwuX7RoUU9Pz6+//lpWVubu7r5gwQKRSETHvHnz5okTJ6qrq11cXJ544gmJREIIOXv2bHt7e3h4OF2np6dn//79U6dOlclknZ2daWlpxcXF48aNi4yMtLe3H+wxVldXZ2ZmVlVVSaXSnp4efnl3d/fRo0eLiopsbW1nzZpF3yP2eUJKS0vt7e0DAgIKCwsrKyvnz5/Pb3748GEXFxcPD4/+93jv3r1jx46Vlpba2tqamZmZmprOnj27sLBQqVQ6ODjs27dPKpU+/fTThJDr169nZmY2NjZ6enpGRESIxWJCyKFDhyZNmsR/WpGenm5tbe3n55ednW1iYiKRSNLS0vT19cPCwlxdXUf4CwdQC05ob7zxhomJSXd3d5/l0dHRvr6+HMcFBgbOmDHDwsLC399/y5YtixYtiomJoets3brV0NDQ399/yZIlISEhhJDNmzdzHDdx4sTk5GSO4+bMmRMaGiqTyeLi4sLDw/X19X/77TeO49rb2+3s7MLCwpYtW+bv729qalpUVMRxXHx8/Ny5c/sX2dzcTAiJiory9PRcunSpTCZzdXVds2aNl5cXvTl58uT79+9zHFdTUyOXy6VSaVxcnI+Pz5gxY9LT0zmOS0pKmjJlCh3t+++/19fXDw8Pj42NNTAwePfdd+nytWvX6unpRUVFzZs3z9DQ8Lvvvvv73/9uZGTEl/Hzzz8TQhoaGs6dO0cIKS4u5jiOJq5cLl+yZImjo6OXlxetRAd0dnYSQv7xj3/wSyQSycyZM+3s7BQKxcmTJ5OTk21tbTs6OuhPs7KyRCLRjRs3+oyzbds2Gg+enp6RkZEcx506dcrMzEyhUCQkJFhZWdFf+p49eywtLadNmxYREZGQkGBsbLxkyZKgoCD+5iuvvEIHpJt7e3vHx8c7OjpOnDixsrKS47hNmzZJJJKWlhZ+NT09vaqqqqamJoVCMXbs2MTERE9PT1tbW7p+fz/++KORkZFCoViyZEloaKhIJPrqq684jmttbQ0KCrKxsUlISJgyZYqFhcXVq1f7PyEKhWLNmjUcx/3666+EkJKSEjpsZWWlSCQ6ffp0/3ssKCiQSqUTJkyIj4+PiooyNzd/6qmnOI576623vLy87O3tfX19V61axXHc9u3bxWLx9OnTY2Njrayspk+f3traynGcVCr94IMP+AH9/PxefvlljuOWLVvm4eFhZ2cXHx8/bdo0sVi8e/fu4f/2AdRG+CB8+umn3d3d+y9/5ZVXrK2tOY4LDAz08PBobGyky/kgLCsrMzAwSEpK6unpoT8aMAidnZ1ramroCtOmTYuNjaX/b2tro//p7u6Wy+WrV6/mHhSEzz33HA3svLw8Qsizzz5Lb9JDYNLS0jiOe/bZZ6VS6Z07dziO6+rqWrhwoZOTU3d3Nx+Et2/fNjY2fu+99+jImzZtMjExuX37dn5+vp6e3g8//ECXnz17tqmpaThBOH/+fB8fHxp+lZWVhoaGW7duHfGvQSMNGIRz5szp7OykN0tKSkQiUWpqKr25Zs2aoKCg/uNs27aNELJv3z5+iZub24IFC2jnnDp1ihCSmZm5Z88eQsiuXbvoOp9++mnvm+vXrzcyMurq6uI4ztXVlS+jsbHR2dk5ISGB47ibN2+KxWJ+k9deey0sLIzjuHXr1llbW9fW1nIc197eLpPJkpKS+tdZXV1tZGT06quv8i1tZGREg/Djjz82NTUtKyujT4u7u/sLL7zQ/wnhg7Cjo8PGxuaTTz6hyz///PMJEybww/b22GOPPf7443x4R0VF8UEokUguXLhAlzc1NY0ZM4a+TDiOKy0tNTEx+etf/8oNGYQSiYQP46VLl9rY2CiVyv41AAhL+KlRkUjEcVz/5RzH8QeOzpw509LSss8KJ06c6OrqSk5O5merBuTv789PTrq5ud24cYP+38TEpKKi4sqVKy0tLebm5hUVFQ
8sdfbs2bQkOsMTERFBb7q5uRFCamtrCSGZmZlxcXHW1taEEH19/Zdeeik6OrqyspIfJDc39/79+/r6+v/85z8JIXfu3Glvb7906VJOTo5EInnmmWfoalOnTn1gPdTp06dnzJhB9/WEEBsbm/Pnz/Pj6J45c+bQGTlCiFwuDwgI2LVr15w5c3p6evbt27du3brBNuTnCW/dunXt2rXg4OAtW7YQQjiOE4vF2dnZ48ePJ4TMnTuXrkZ/rZGRkfxNpVLZ0NDAcVxJScmnn35Ky7C0tExMTNy+fTshxN7eftasWXv27ImPj6fzosnJyYSQ06dPOzk5HTx4kA5lb29//vz5/hWeOnVKqVR++OGH/Vs6KytLJpP9/vvv9KatrS0/Qu8nhGdgYLB48eI9e/a88847hJCff/45MTGx/7C1tbUXL17ct2+fqalp/3rGjx/Pf5SQl5fX2tr60ksv0ZsuLi5RUVEZGRkDPdP/4+/vL5fL6f+feeaZnTt30knmobcCUDPhg9DR0fHgwYMcx/V5lVZUVMhksiE2LC8vt7W1NTc3H/596enp0dBVKpVLly49fPhwQECAnZ1dXV0d3QkOfxxCCJ/fvW/W1dX1/vhn7Nix5L8ZSdXV1RFCKisra2pq6JKVK1eamprW1tba2toOdtLIYO7fv9/Y2NjS0pKbm0uXLFiwwMfHZ0SDaLXExMT333//u+++O3/+fF1dXWxs7AM3ob+C27dv80/ac8895+zsTP8A5fVpSHqT4zj626S/WWrcuHF1dXW0hxMTE1988cWGhobCwsJbt27Rz3fr6uqMjY35u/P39++9Oa+8vNzc3NzGxmbAmnt6evgRvLy8Htj5iYmJmzdvLioqMjc3z87Opm+8+t8jIWTixIlDD0X++6T1edSZmZlDb9X7OaTvZRsaGh54XwBqJnwQzp49+4svvkhPT589eza/sLGxMSMjY+hjBe3t7Zuamjo6OgwNDQkhffZiQ/v+++9TU1OvXLlCzzV88sknB/yr9CHIZLLq6mr+Jv1bsPexG46OjoSQtWvX9jlsITMz8+bNm/zDofT09HofLtGfsbGxnZ1dRETE+++/Pyr1a534+Pg333zzyJEjx44di4iIGDBg+qBvsOLj4xMSEnov37t373DukW5eVVXFL6Fv2uhOPyYm5uWXX/7111/Pnz8fFRVFU00mk1lbW2/evHnoke3t7e/du9fe3m5iYkII6erq4n/7Mpmsra3tgSP0NnPmTGdn571795qamnp4eAz49oi+abt16xa/pKOjY+hHTWc7CCGVlZW0sR/YpVRxcTEhhP8DEUBzCH9lmdmzZ0+fPn3VqlVFRUV0SUNDw9KlS8Vi8Zo1a4bYMCwsrKenh981fPnll8O/05qaGisrK5pJVVVV/F0/usjIyN27d9OJ1vb29pSUFC8vr94ngQQGBlpaWm7YsKG7u5sQwnEcfVc+b9687u7uL774gq6mVCpbW1sdHR07OzuvXbtGCLl///6Ap45ERkb+8MMP9A07fThdXV2j9XA037hx48LDw3/66adffvmlT7ANxtLSMjAwcNOmTffu3aNLysrKhn+P9JDIr7/+urW1lRBSXV29c+fOJ554gv7UzMxswYIFP/300/79++Pj4+nCyMjIQ4cO8VcJaGhoaGpq6j9ySEiIvr7+N998Q29+/fXX/Nu7yMjIY8eOnT17lt5sbm5+4J9WIpEoLi5u9+7ddF50wHWcnJzkcvm3335Lu/H8+fOnT58ecE2FQmFjY/PZZ5/RNXNycn7//Xf6qMePH8+/gs6cOXP16tX+m9fX13/66afBwcF2dnZDlw2gfsL/Rainp3fgwIG4uDhvb29nZ2dTU9Pi4mKZTJaamtr/JMLePD09P/7446SkpO3bt3d0dIzonIGEhIRNmzYFBgba2dldu3Zt6DnYEfnwww9zcnK8vb0VCsX169c5jjt06FDvCSIzM7OdO3cmJiaePHnS2dn5+vXr1tbWFy5ckMvln3
322Ztvvrlz505bW9uioqKNGzc+/fTT3t7ewcHBCoWiqKiI/7yqt40bN86dO9fDw8PX17e5ufn69evHjx/39/cfrUek+ZYsWfLcc88ZGhrGxMQMc5OtW7dGR0e7uLh4eXnV1tbeunWLXrFomP71r39FR0fL5XJXV9eCggIPD4+PP/6Y/2liYuKTTz4pkUj4E12SkpKysrKmTp3q5+fX3d196dKlLVu29I9tV1fX9evXr1u3bvfu3V1dXSYmJvS9GiFk1apVGRkZoaGhfn5+IpGosLDwyy+/fPHFF4euMzExccOGDYSQn376acAVxGLxt99+GxMT4+bmJpVKy8vL+XvsQyKR7NixIz4+3tXV1cHB4cKFCwsXLqQnorz++utLly6dOnWqvr5+c3Ozr68vv9WZM2d8fHysrKzy8/OdnJx27NgxdMEAghj4QBVB5OXl5efnd3R0uLm5zZgxg58hzM7ONjMzmzx5Mr1ZUFBACJkyZQq9eePGjQsXLtjY2MyYMcPIyGjLli0vvPBCZmamg4ODi4tLXl6evr4+PylUVFTU3t4eEBBACKmurv7Pf/5jbm4eERHR3t5++/ZtX1/fixcvdnV18QcI8Lq6uk6cOPHYY4/Rmbeenp5jx455eXnRw3D63OQ4Lj8//+rVq1KpNCAggJ5hdu3atYaGhmnTptEBm5qacnJybt68KZfLAwMD+Y8G6+vrCwoK2tvbvb296YWy6FmJTU1NwcHB48ePp3vD9vb2s2fPBgcH08G7u7vz8vKuXbtmbW0dFBQ0os9NNRnHcWlpaZ6envze+fjx43K5vM8bl7a2tgULFpibm//yyy8DjlNbW1tUVMQf3ETdv38/Nze3vLzcwcGBNs/NmzcvXrwYFhZGjz2pr68vLCwMDQ2lF2e4detWQUFBSEgI7cz29vbc3Nzq6mpXV1eFQtF75I6OjlOnTllYWAQGBvYuo6io6NKlSxKJJDAwcIjzVsvLy3NycmxtbYOCgi5dumRnZ8c/3uLi4sLCQhMTE4VCQZ+TPk/I2bNnrayseh+Ncvz4cX19fXpy0WDu3LmTnZ3d0dExc+bMhIQEc3Pzn3/+uaSk5NatW/zZvVRzc3NOTs6dO3e8vb17X+awuLj43Llz9vb2ERER5eXlHMd5eHgsX7780qVLn3zySWNjo5ubm6+vL072B82kQUH4iKqrq2Uy2W+//RYVFSV0LaBWnZ2dDg4O33777XCOlIGheXt7R0VF8VP0j2L58uUVFRUnT5589KEAVEr4qdGH1tHRsWrVqpiYGHd3d3rQuVQq7fMGFliQnp7e0dExb948oQvRSh9++OG4cePoHMyBAwcuX748okNyAHSAFgchPTH5vffeu3HjhrGxcUBAQGpqKn+dM2BHampqXFwcnSWGkXJ0dNy+ffv77yYBuYQAAAA/SURBVL/f2trq7u6+Z8+eGTNmjMrIEokEr0fQCrozNQoAAPAQhD99AgAAQEAIQgAAYBqCEAAAmIYgBAAApv0/Odlyr713K0wAAACNelRYdHJka2l0UEtMIHJka2l0IDIwMjIuMDkuMQAAeJx7v2/tPQYg4GWAAEYgZgViFiBuYGRjSACJMUNoJiY2Bg0gzczCzpABopngCriBGhmZGJiYOZiYWBhEGMTdoGaBAWsjK+uBtDQ1NRDHqcN9PwODgz2IDRS3P3vmzBIYm4HhwH4Qe9HJF/YwNWIAJCwUSUO7KBUAAADJelRYdE1PTCByZGtpdCAyMDIyLjA5LjEAAHicfVDBDsIgDL3zFf0BSQt0G8dtLMaYsUSn/+Dd/49Fg7DD1nLoK6+PPhSkuIXr6w3/MEEpADw43nt4WkRUM6QChul8iTCu/ZA74/KI6x0YnExIbpn9usy5QzDCyWhsO9cQoMZvVEUmmkRE3bbeGSv3Le8QrRBRM/mGOE3sKjqIFZE07xFZFEl31DV8+PIUw8bbz+2wxFDcpjTFkwCwZXMB4Mp+JJBr9Vor4fznUqsP+yFXiU
V2W18AAABeelRYdFNNSUxFUyByZGtpdCAyMDIyLjA5LjEAAHicXck7DsAgCADQq3TUBAiofIzpxN4TeXjbtdtLXmaWrPdz7YKN2GMYMCxkcp+jdXihsJhUpol+JxQSpv9BIYW6D7OzEKnAW8mEAAAAjnpUWHRyZGtpdFBLTDEgcmRraXQgMjAyMi4wOS4xAAB4nHu/b+09BiDgZYAARiBmBWIWIG5gZGNIAIkxQ2gmJjYGDSDNzMLOkAGimeAKuIEaGZkYmJg5mJhYGEQYxN2gZoEBayMr64G0NDU1EMepw30/A4ODPYgNFLc/e+bMEhibgeHAfhB70ckX9jA1YgAkLBRJlBdGwQAAAMp6VFh0TU9MMSByZGtpdCAyMDIyLjA5LjEAAHicfVDBDsIgDL3zFf0BSQt0G8dtLMaYsUSn/+Dd/49Fg7DD1nLoK6+PPhSkuIXr6w3/MEEpADw43nt4WkRUM6QChul8iTCu/ZA74/KI6x0YnExIbpn9usy5QzDCyWhsO9cQoMZvVEUmmkRE3bbeGSv3Le8QrRBRM/mGOE3sKjqIFZE07xFZFEl31DV8+PIUw8bbz+2wxFDcpjTFkwCwZXMB4Mp+JJBr9Vor4fznUqsP+yFXiQraJGEAAABfelRYdFNNSUxFUzEgcmRraXQgMjAyMi4wOS4xAAB4nF3JOw7AIAgA0Kt01AQIqHyM6cTeE3l427XbS15mlqz3c+2CjdhjGDAsZHKfo3V4obCYVKaJficUEqb/QSGFug+zsxCpESqffAAAAABJRU5ErkJggg==", + "text/plain": [ + "" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'CCC(=N)C'\n", + "ShowMols(\n", + " [\n", + " mol := Chem.MolFromSmiles(smiles),\n", + " remove_derivatization_groups(mol=mol)\n", + " ],\n", + " legends=['Original molecule', 'Try remove deriv group'])" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAIAAAC7/QjhAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3daUAUV7oG4NPNvm+CbAKyRYyiiAiKikILUZEYFUQIo0lu2l1jNGISIzHRazua2KMXR0NMwtVRAjoazI07IogSRaOAiMgmoIAIssre5/44TIcgICLQYL3PL6q6qvrrhXqrTp1TzaOUEgAAAK7iy7oAAAAAWUIQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1B2CsOHybffUdqav4y848/yHffkWfPZFQTAAC0p78EYXNzM6VU1lX0mE8/JYsXk82b/zLz1CmyeDGprJRRTQAA0B7ZB6FEIlm9erWqqqqcnNzIkSOLiopkXVHPUFYmu3aR5GRZ1wEAAJ2SZRBSSo8ePWpvb7979+6GhgZKaWpqqr29/ZkzZ2RY1cuqribbtpFly9rOnzGDmJuTJUuIRCKLsgAAoGtkFoTnz593cnLy9fW9c+eOoaHhwoULDx8+bGhoWFJS8tZbb/n5+T158kRWtXXRs2fkm2+IpSX57DOyfz/JzPzLo4qK5O9/J1evku+/l1F90Feampp27Njxww8/1NXVyboWAHh5tM+dO3fOycmJPbupqalYLK6rq2MPNTY2isViNTU1QsjgwYPDw8P7vryuqK+n+/dTY2NKCCWEurjQ6Oi/LGBmRv39KaXUw4Pq6NDiYkop3bqVEkILC2VQMPSSioqKvXv3Kioqsu+zhoZGYmKirIvqCTdv0q1b6dKldPVqum9fyzcY4DXVp0EYHx8/ZcoUtsvQ19cXiUTPnj17frGsrCwPDw+22MyZM/Pz8/uyyM41NNDwcDp0aEsE2tvTyMh2FpMG4d27VFGRBgVRiiB8vaSkpAiFQnbQRgiRl5fn8/mEEF1d3VOnTsm6uldQX08XLqSE0EGD6JQp1NmZKipSdXV68KCsKwPoLX0UhAkJCdJsGzRokEgkqqmp6WR5iUQSHh6uq6tLCNHS0hKLxaxbqQw1N9PISGpt3RKBI0bQyEgqkbS/sDQIKaUbNlBC6KVLCMLXQXNzc3R0tEAg4PF47Pvs6ur6448/1tfXp6enu7i4EEJ4PN6qVavYZe+BZ+VKSgj9+msqrf/hQzppEuXz6aVLMq0MoLf0ehAmJiZ6e3uzXYaurm5ISEhFRUUX1y0sLJwzZw5bd9KkSenp6b1aakdYBNratkSgnR0ND6ft5nJVFX3wgNK/BmFNDTU3p46OdMsWBOEAVl5eLhaLLSwspK2gQqEwJSWl9TISiUQsFisoKBBCJk+eXFBQIKtquyk/n8rL03nz2s5//JhqatLJk2VRE0Cv68UgTE5O9vX1ZQfO6urqwcHBT58+7cZ2oqOjjY2NCSEqKioikaipqanHS+2IREKjo+moUS0ROHQo3b+ftvv8NTVULKaGhtTTk9K/BiGl9MSJlpNIFoQlJfTq1T56Cb3k999
/P3DggKyr6CM3b94UCoWqqqosAq2trUUiUVlZWUfLX7p0iX1jBw0aNMCaScPCKCH0xIl2HvL3p3w+LS/v85oAel2vBGFqaqo0AtXU1IKDg0tLS19lg0+fPhUKhWw3NHr06Bs3bvRUqR2RSCTHjx9fsOAqi0BLS/rTT7SxsZ0la2tbIpAt6epKq6raBiGl1Nu7ZYHCQrpgAeXxqFBIq6p6+3X0vJSUFPbhKisrD7wznpfR1NTEWkHZF4/P5wsEgsjIyK4cirHOzwOjmfTu3T/bPNeto4TQ5OR2Ftu4kRJCr1/vy9IA+kYPB+Hdu3eDgoLk5OQIIaqqqqtWrSrsuabAU6dOmZubs44JwcHB0r6mPe7cuXNjx44lhGhqDhkxomn/ftrufqyhge7fT01N2+k483wQ5uZSVVVKCH34kH76KZWXbznFPHu2l15Ez0tNTZ07d670+GbDhg3dO8Xv/4qLi0UikZmZGYtATU1NoVCYlpb2Uhtp00z68OHDXqq2m5qb6blz1Nub8njUzq7lcveyZS3f0eeJxZQQGhvbx2UC9IEeC8KcnByhUCgvL08IUVRUFAqFvfGfX1NTExwczILW2tr64sWLPbv9+Ph4Nzc3tvszMDAQiUS1tbXPL8auGlpZddhxZu9e+ssvbdeKjqYiEa2uppTS27epk1PL6r6+9MmTnn0dPSw7O1soFLK3nX24PXh8068kJSUJhUIVFRX2HbC1tRWLxVWvcOYeGxvLmkn19fVPnz7dg6V2X1kZ3bGDWlq2fP/U1OjixS3fS9azq90Wl+BgSgi9dauPiwXoAz0QhA8ePJBGoIKCQlBQUHZ29qtvthMJCQl2dnas3UkoFFZWVr76Ni9fvuzu7t66X2u7QzvadJwZPrzDjjMv1NhIxWKqpkYJoYMH0/45ZjI3N7cPjm9krr6+PjIy0tXVtXUraHR0tKSjbsEv4/Hjx15eXuzrGhwc3JcXudtKT6erVrV851iLv0j0l6OwQ4coITQiop11Z8+mioq0veNCgIHulYIwPz9/1apVysrKbN/h6+ubmZnZU5V1rqGhQSQSsYHMFhYWr3KsffXqVWm/Vj09vY76tXa948xLycyk7u4t2/T2pv1nzGReXt6qVauUlJSkxzdZWVmyLqrnFRYWikQiExMT9gXQ0tJatWpVTk5Ozz6LRCIRiUTslNrNza2vDyaam2l0NBUIKI/356XsyMh2vr6lpVRFhbq7t52fk0OVlemcOX1TL0Af62YQPn78ODg4mLUgsQi8d+9ez1bWFcnJydKb1Pj6+paUlLzU6rdv3/b19ZX2hg8ODi7voFPcuXPUwaFlH2JuTvfvb7/jTPdIJDQ8nOroUEKolhbdv7/D4Yl9o7i4ODg4uPXxzf3792VZUO9ISkoKCgpi1/AIIcOGDROLxZ0Pb31FFy9eNDIyYs2kZ86c6b0n+tPTp1QspubmLd9dDQ0qFNLU1M5WYaN8liyh0sPB5GQ6YgRVUaEveZUUYKB46SB88uRJcHAw60rO4/G8vb1vyfSyQffuyta6Xysb2tFRb/gzZ7LHjGnZjZiZ0e++a7/jzKvLz/+zZ6lAQKtyXi7Ue0RJSUl/OL7pVXV1dZGRkWzkO3uZ3t7e586d65tnf/z4saenZ180k968SYXClg5ahFBrayoS0Y6HfNCbN+nSpbSmhkokdONGqqBAlZTom29SCwtKCDU1pT19PR6g/3iJIKysrBSJRFpaWmwPIhAI+mAYQxe1viubt7d353dlE4vF0q6P69ev7+g8Mj4+fvLkyWpqBvr6zQYGVCTqi+sjkZFUX5+udbhAVVWpSNQDba9dVFJC16//bNo0to9mN0Pvo6fuKw8fPgwJCRk0aJC0M1RwcPADdgeEPtS6mXTKlCmPHj3qwY03NDQcOXJkX2BgS/7x+dTbm54502EjQ1NTS6spWz4srGV+fj4NC6ObNtEtW2h0NC4NwuutS0FYVVUlEom0tbWlEXi9/w0n6vpd2e7du6e
qqioUCjvaB8XGxk6aNIm92MGDB4eFJbXXb6a3lJTQqqWftOyYnJ3pX+9d0vPKyujGjVRDgxLyRFfX39dXtqf4vSE+Pt7X15d1+SGEODo67t+/v93OUH0mJiaGNZMaGBic7YkxNEVFRSKRaMiQIYQQPo9XO2IEFQo7a8wsKqJfffXnneO1temaNfR1vAwM8EIvCMLq6mqxWGxgYMD2IK6urrH9eyDRo0ePunJXto5u89bFjjN94bffqJkZJYQqKNDgYNobgyarqqhI1HJxkjXIXrvW888iO7W1teHh4fb29uwDVVRU9PX17bNW0BcqLi6eNm0aIUROTi4kJKTbzaRXr14NCAiQ/gLGiBEj9u3bV82GQ7QrKYkKhVRFpeVzt7WlYvGAvLkDQA/pMAjr6ur2799vaGgojcALFy70ZWWvoht3Zet6x5m+U1NDg4Mpn08JoW++2ZO3ZauupmIxHTz4zz6E/fv45mVlZWV9/PHH0jYMIyOjzZs398Oxj01NTSEhIexnK6ZOnfpSFbIhHxMmTOjqkI/6ehoZSV1d/2w1FQhodLSMu2YB9AMdBmFubi47xnRxcYlu82t7A0FZWVkX78omvWfYCzvOyMbly9TOrmXP9eq3ZWM/pWhk1LI3nDCBnj/fQ4X2I//zP/8jbQUNDw/v13c4ozQmJoYdcRoYGHTlhPXRo0cvN+SjsJCKRNTEpOVD19Kiq1bRnh4iAjBwddY0unPnzt9++63PSukNnd+VLS0tLSgoiB2Pq6mprVq1qqioSFaldqa2loaEUEVFSgi1sKDd63nPIlC6N3z+14RfI5WVlUKhsP905nqh4uJidlNT1kzayY+OrVixQnqx08HB4cCBA+3e/KhFXBz186MKCi0f+ujR9PvvqUwvjgL0QzL4hfo+1uaubGFhYbS9e4b1bOe9XtHRbdnq6+kPP9B336UCAfXxoZs20TZ39mG/Jiy9pdbIkZ39lCLISOtmUnd3946aSTdv3iwnJ9f5kI+6urrw8PDRo0eXsdGvcnLU25v2m4ujAP3N6x+EzOXLl4cNG8aOo3V0dNgxtZKS0ooVKwbSPcOkt2VTVaXsJj4PHtDhwymPRydNoh9+SOfNo9raVFHxzzu2/e///hmB9vb0+HFEYH924cIF1kxqYmISFxf3/AKlpaWdfGNzc3PXr1+vp6fHvuo7PDzoxo30tf6REIBXx5UgpJQ2NDSwWz6yBqgBfM+we/doVBSllDY303HjqLr6X7q6lJbSiROpvDxNSqKU0vff//PXhGV4l0vosoKCAjZ6R15evvNm0tb64RARgIGCQ0HIHD9+/Ouvv+7ng0C66tQpSgjdvr3t/Lw8qqDQ8jvjubn04EFE4MDS2NgobSb18PDo5NI1GyIycuTI/jlEBGBA4FFKCQxQa9YQsZhkZBAbm7YPTZ5MkpNJWRnh82VRGfSACxcuBAYGFhcXm5qaHjlyZOLEia0fzcrKCgsLCwsLKysrI4QYGhouXLhwxYoVpqamMqoXYKDCXnIgy84mPB75z+/H/oWFBamoIGVlfV4T9BgPD4+kpKSJEycWFBRMnTp1yZIlrJn0/Pnzfn5+b7zxxvbt28vKytgQkby8PJFIhBQE6AacEQ5k06eT2FhSW9vOQytWkNBQUlBA/jPaDAaopqamzz77bOfOnZRSdXV1AwOD7OxsQoiysvKCBQtWrlzp4OAg6xoBBjZ5WRcAr0BHh9TVkdJS8p9egn8qKCA8HtHRkUVZ0JPk5eX//ve/a2pqbtq0qbq6urq62sjISCgUrlixQnoDcQB4FTgjHMh27CDr15NLl8jkyW0fsrYmiookLU0WZUGvSExMPHr0qKqqakhICBsCCwA9AkE4kD14QKytycyZ5MSJv8w/cYK88w7ZsoV8/rmMKgMAGDDQWWYgMzcnGzaQX34hixaRBw8IIeTZM/Ljj+RvfyNvvEE++kjW9QEADAA4IxzgKCUiEdm6ldTUEDU1UlNDCCHTp5MDB4iRkay
LAwAYABCEr4XqapKQQAoLiYYGGTOGDB0q64IAAAYMBCEAAHAarhECAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACcJvfll1/KugZ4DRUUFFy7di07Ozs7O7u0tFRDQ0NJSellN3L16lU+n6+pqfnq9dy6dau6ulpXV/cVt5OdnZ2Tk2NkZPTqJcFLUVRU1NPTc3JyknUh8BrCGSH0iujo6GnTpvn5+fn5+bm4uOjp6fn7+5eWlr7URry9vSMjI7tXQGZmZuuDvMWLF+/atasb26GULl26tKamhk2Ghoa+99573SsJAPonBGH/lZGRERUV1djYKOtCuq+kpKSsrKyysvLXX3+9cuWKl5dXQ0ND11d/8ODBihUruvfUycnJrUP04sWL3377bTe2U1JSsm/fvrq6Ojb53//931evXu1eSQDQP8nLuoDuqK+vDwkJYX/z+Xw9PT1nZ2dXV1cej/eym9q1a5e5ufmcOXN6qrbQ0FB29vPqmzp58uS6desqKioUFBRefWsypKam5uXldfToUWdn5yNHjixcuJAQUldXd/bs2czMTBMTE09PTx0dnfr6+ujo6FmzZp08eTI/P9/f3//y5csjR460s7NLTEx89uyZu7s72yCl9NixY+PGjTMzM0tLS0tISKioqLCwsPDx8VFUVCSE/Prrr6dPn66srIyKiiKEeHl5JSQk6OnpjRs3Ljc39/r1663LmzlzpqqqalFR0fnz5x8+fGhoaOjt7a2np0cISUxMvHLlCiHkl19+0dDQGDNmTHl5eVFR0cyZM9m6ubm5cXFxpaWldnZ2Hh4e7JM6derUqFGj6uvrT548yePxpk+fbm1t3dGbU19fHxMTk56erqWlNWjQIEKIj49PRkZGcXHxsGHDIiIi1NTU3n//fUJIQUFBXFxcUVGRjY2Np6cna2q+cOGCtra2o6Mj21pCQgIhxNXVNSUlpba21sTE5MyZMw0NDRMmTLC3t2+3gF9//dXJyenu3bvXrl2bPn36yJEjKaVxcXHJyclqampTp04dOnQoe1H29vbV1dW//fabnJzcrFmzhg4deu/evVOnTikpKXl7ew8ZMkT66cTHxycnJyspKY0fP37EiBGEkIqKirNnz7q5uRkYGLDFUlNTHz165OnpSQjJzMyMi4urrq52cHCYNGmStLaCgoLLly/n5+cbGxtTSrv4fQN4aXQAqqioIIRYWVkJBILJkydbWVkRQlxcXPLz8192U+7u7p988smrFLN169YdO3ZIJ729vZcvX/4qG5TauXMnIaSioqJHttbHQkNDCSFNTU2tZ1pZWc2fP59SWlRU9MYbb5iYmAQGBlpbWw8ZMqS0tLS4uJgQMmnSJBMTk1GjRmVmZurq6n777beU0j179igrK0vfioSEBB6Pl5OTExMTo62tPWvWrKCgIGNj4zFjxjQ0NFBK/f39zczM1NTUBAKBQCDIzc0dN27csmXLKKVnzpwR/MfgwYP
V1NTKysrKy8u1tLQ8PDyCgoLs7e11dHRycnIopVu2bGH54ebmJhAITp48+fHHH9vb27MyIiIiFBQUnJ2d/fz8Bg0a5OjoyCq0srJ66623zMzMAgMDHR0dVVRU0tLS2n2XcnJybGxsBg8ePH/+/JkzZ+rp6bm6ulJKv/32WwsLCwsLi5EjR86ZM4dSeurUKRUVFQcHh/nz5xsaGtrZ2T1+/JhS6uTkJBQKpRucNWuWj48PpXTDhg1Dhw41MDCYN2+eh4cHj8fbtWtXuzWYmppOmjRJR0dnzJgx0dHRTU1NPj4+6urq8+fPd3FxUVJSunLlCqXUxsbG3d3dzMzs3Xfftbe319DQ+Oqrr8zNzdmkpqZmbm4upZStrqqqOmfOHIFAwOfzt23bRiltbGw0MDDYvHmz9HmnTJmyZMkSSumBAwfk5OSmTp3q6+uroKDw6aefsgUOHjyopKTk4OAQGBg4efJkHo8XGhra5S8gwEsYwEG4ZcsW6ZzY2Fh9fX17e/v6+vo+LmbWrFmrV6/ujS2/fkEoEAjGjx9PKRUKhaampk+fPqWUlpeX6+rqbtmyhQXhBx98IF1eGoSPHz9WUFA4dOgQm7969epJkyZRSpubm+vq6tjMtLQ0QsiZM2fY5Jo1a+zs7KSbkgah1P3799XV1b/77js2+ezZM/ZHfX29kZHRF198wSaPHj1KCHny5AmblAZhdXW1lpbWhx9+yOY/ePBAQ0Pjs88+o5RaWVm5uLhUVlZSShsaGgwNDaU79zamTZtmY2Mj3fiiRYukQcjn88+fP8/mNzY2Ghsb+/r6Njc3s3dDX19/6dKltNMg5PP5V69eZfODg4MVFRVZdrZhamo6YcKE2tpaNvnTTz/JycndvHmTUiqRSCZPnjx9+nRKqY2NjYuLS1VVFaW0pqZGQ0PD2dlZOqmpqSkSiSil4eHhPB6PZSeldOfOnXw+PyMjg1K6dOnSYcOGsfmFhYVycnKXLl0qLS1VUVH5/PPP2fzdu3crKyuXlJTk5+crKiquXLlSIpGwhxQUFBCE0EsGZNPo89zc3EJDQ/38/KKiogIDAwkhEokkOjr6+vXrPB5vypQpAoGAEHL37t2UlJSZM2f+8MMPJSUl//Vf/3X58mUjI6OpU6dmZWVduHBh9uzZ0qabwsLCkydPvvPOO/r6+jdu3IiPjy8uLjY2Np49ezZrBbp3796lS5fu3LlTU1Pz3XffEUL+9re/nTx5UktLy9PTs7KyMiIionWRgwcPfvvttwkhzc3Nv/zyS1JSEp/Pd3d3l7b4EUIqKiqOHTuWkZGhra19+/btvnr/+gillDVfJyQkWFhYSK/hGRgYXL9+/cMPPySEeHt7P7+ivr6+QCD4+eefAwMDJRLJ0aNHP//8c0IIn89XVFT8448/8vPz6+rqeDzegwcPulJJY2NjYGCgQCBgT0oIUVFRyc7OTk9Pr6mp0dLSeuF2UlNTKyoqlixZwibNzMy8vb3j4+PZpJubm4aGBiFEQUFh6NChjx49areG2NjYb7/9ljXDtqGqqurh4cH+zs7OfvTo0eLFi/l8Pns3fH194+LiOq/Q3NzcxcWF/f3ee+9t3749OTlZus3Wpk2bpqyszP6+cuUK+zhYA7KmpmZiYiJ7aNKkSerq6qw2ExOT8ePHSydNTU0LCwsJIQkJCaNHjx4/fjxbZenSpZ988klCQoKNjU1AQMA///nP5ORke3v7o0ePGhsbT5w4MSYmpra2Vl5env0HPXnypK6uLiUl5dGjRw0NDV9++WU3rncAvKzXJAgJIbNnz1ZQUDh9+nRgYGB9ff2MGTMuX77s6en57NmzrVu3fvnllyEhIXFxcevWrbOzs3v69CkhJDAw8Msvv3R1dZ06daqqquqyZcuqqqrWrl3LNvj999/v2rVr4cKFx48fDwwMdHV11dPTO3bsWHBwcFxc3NixY3Nzc6OiovLy8hobG9m1qPnz52/bts3KysrT07OiooL9bxNC6uvrU1N
TZ8+e/fbbb9fV1b311ltXr1718vKqqqraunXr119/vXHjRkLItWvXvL29KaUTJkxoaGi4deuWjN7L3pKTk+Ps7EwIKSwsVFVVvXHjBps/efJkS0vLztcNCAh4//33S0tL09LSiouL586dSwi5f//+22+/XVpaOmrUKG1tbUJIc3NzVyrZuHFjQUHB//3f/7HJmpoaPz+/2NhYJyenQYMGlZaWSiSSzrfA9vuDBw+WzjE0NLx27drzS7L0el5BQUFjYyO7AteN52o3XFtrHSHszSkrK3vhcxUVFcnLy0s/GmNj47lz5z7/brR5UdLJwsJC6aEkIURVVVVTU5OV6urqOnTo0J9//tne3j4yMtLf35/P5xcVFRFC8vLy2GskhAiFQnV19dzcXG1t7Vcf7gLQFa9PECooKJiamubm5hJCdu/eHRsbGxMT4+bmRghZvXr1jh07WP/D6urqBQsWrFmzps3qRkZGU6ZMiYiIkAZhVFTU3LlzWUeAoqIiNpqtvr7eyspKLBYfOnTIy8vLy8tLR0dnzpw5YrG4zQaHDBmSlJTE/v7oo4/y8/NZ932xWBwfH3/p0qWJEycSQpYvX759+/bly5erqan5+voaGBhcvHhRX1+fEPLNN9+sW7eut96vPvf7779nZ2dv3ryZEGJmZjZmzJh9+/a1XuDx48edrP7OO+8sWbLk+PHjt2/fnjZtGtvbrl69Wk9P79atW6yPjJycXFcqiYuL++abb06ePMk6pxBC/vGPf/z+++9ZWVmGhoaEkClTprxwI6xVID8/38TEhM3Jy8szMzPrSgGMgYEBj8crKSmRzumoSy3bbH5+Put4Qgh58OCBubk5IYTH43Ul+9PT0wkhnfTZkRoyZEhWVtb+/fu78AraX/3SpUvSyfLy8srKSlY/j8fz9/ePiIhYvnx5QkLCP/7xD0KIqakpIWTdunXDhw9vvZ1bt25VVlbW1dWxU1XWxt69kgBe6LUaPqGkpNTU1EQIiYmJsf5DG38AAAgASURBVLS0VFdXv3Hjxo0bN+zs7Gpqav744w+22Pz589tdPSAgICkp6f79+4SQ9PT0lJSUBQsWEEIUFBQ0NTWbmpoePnx4584dtqfoelVnz57dvXv33r17LSwsWG02NjYqKiqstuHDh1dXV9+6dSsuLi4vL++LL75gKfg6aWhoiImJmT9/vpOTE3vzBQJBRESE9G18/PixdKBeR9TU1Hx8fI4cOXLs2DFpp9zCwsJhw4axFGRXB6XL6+npFRcX19fXt9lOeXl5UFDQ6tWrp0+fLp356NEjAwMDdsqVmZmZmZkpfYiFZX5+fpvtjBgxwtDQ8JtvvmFfuZSUlF9//ZX1gewiNTU1FxeXsLAwNjbj3r170jPUNszNzW1sbHbt2sVeTlZWVlRU1LRp0wghpqamrCcOIeTu3bu///7786tXVVWFhITY2NiMHDnyhVUJBII7d+5ER0ezydraWnbS1kXTpk27c+fOyZMn2eS2bdsUFRWlBxYBAQHZ2dnr16+3tbV1cHAghIwdO1ZHR2f79u0szimlOTk5hBA3Nzc+n79371624u7du9n7DNAbXp8zQkppXl6enZ0dISQ3N7ewsNDPz0/6qKWlZWVlZedbmDdv3ooVK37++eeNGzdGREQYGRmxE8qSkpL169dHRETU1dXp6OhUV1d31BP9eU+ePFm0aFFAQEBAQACb01Ft7MxAesj/emChXl1d3dzcPHfu3H379rEBBiEhIUlJSfb29o6OjrW1tXfv3o2KipKOAehIQEDArFmzlJWVZ8+ezeZ88MEH7Gy7vLycUspOL5j58+fv3LnTzs5OTk5OulsnhIhEory8vLNnz44dO5bN2bNnT1BQ0IEDB1xcXDQ0NHJzc1uf2Lm4uIwaNUogEJiYmLRuSFBSUjp06NC8efOsra1NTU1v3rwpEAieb2noXGhoqI+Pj42NjZWVVXp6euv6W+Pz+QcPHvTx8bGysrK0tLx58+bYsWO/+OI
LQsjy5ctnzJgxatQoXV3dvLw8li5Mbm6unZ2doaFhamqqurr6iRMn5OVf/P8+e/bslStXzpkzZ8yYMYqKiikpKWvXrt20aVMXX9Hs2bOXLVvGVq+urs7Ozt63b5/0dY0YMcLe3v5f//rXV199xeaoq6sfOnRowYIFcXFxQ4cOzcrK0tbWvnXrlq2t7bZt24KDgw8fPtzU1KSmpiY98wboca9PEF6/fv3Zs2dsEJKenp6xsfGFCxfaLNN5g4+mpub06dMPHz68cePGqKgof39/1tQmFAoTExNPnTo1ceJEeXn5WbNmSa9nvNAHH3wgLy+/Z88e6Rw9PT0LC4uzZ8+2WZL1Tnz+DGaA8vHxsbW1ZX+rqKgMGzasdZcQdXX1mJiY27dvp6ena2hosFvPNDQ0nDt3btSoUdLFjh071vraoZeX17lz5zQ1NaU3XVuxYoWzs/ONGzcsLS3d3d1TU1OlrZ3W1tYZGRkJCQk6OjrW1tZ79uxhay1atIj1nJKytbXV09NLS0u7cOGCrq6uu7t7eXl5dXU1e1RJSSkxMfHixYuU0okTJ06YMGHevHnsIQ8Pj7y8vOvXrz958mT48OHSg5jw8PDW1/PEYrGKikq775KDg0NaWlpiYmJlZeXEiRODg4PZWfLcuXNHjx7deklnZ+fs7OykpCQ28mTUqFHsEqC7u/vdu3evXLmirq4uEAhKSkpYn2pCiJGR0Z49e4qLi62trR0cHNh58/MOHz7cJmN27969Zs2amzdv8vl8e3t7Njzpp59+at1W8f3337f+QMPCwnR0dNjfoaGha9eu/eOPP9TU1BwdHdu0cPz8888FBQWtA3vGjBnsbSwuLraysho3bhx7aevWrZs3b15SUpK+vv748eNTU1NbX30E6Emy6q76Kp4fPvH06dOxY8eybg6U0uXLl6urqxcXF7dZkV2UevjwoXSOjY3NokWLpJPHjh0jhBw+fJgQcu3aNTbTwMCAdVVn3NzcHB0dpZOGhoaLFy+WTjo4OMybN4/9vXfvXj6fz/ahUosXL9bQ0CgpKWlTW2pqKiGk9ZBENsx/gA6fgG7w8PBYsGBBj2xqw4YNlpaWPbIpgNfeAD4jjI+P3759O6U0Ly/v3//+97Nnz06cOMG6mX388ceHDh3y9PRcv369lpbWnTt3bt++/a9//euF25wxY4a2tvbKlSutra2lt/e1s7M7ffr06dOnm5ubf/zxxytXrrRuGnVycvr3v/89fvz4ysrKZcuWSednZGSsW7fO0dExIyMjIyODEKKgoPDee++tXbv2yJEjnp6e69at09TUTE1NvXPnzsGDB998800vLy/WX3zYsGEnTpzo6IoRvDbEYjEb3qOqqhobG3vx4sVu31sVALptQAahnJyco6PjkydP2KAFXV1doVC4ePFiaQuPpaVlQkJCSEjI+vXr5eTkbG1tAwICJBKJvr6+o6Nj6zuWDR8+nPW+Y5SVlZcvX3769OmgoCDpzAMHDixZsmTu3Lm6urpLly6dOXNmTEyM9NHQ0NCPPvro66+/NjMz8/f3HzZsGLsicv/+fTs7O4lEIh1Eoaqq+t5779nY2LDaPvnkE3l5eVtb23fffVcikfD5/KNHj27atOnQoUPFxcUCgSA6Ovqzzz7rYk9IGIhMTEzCwsK2b9/+9OlTGxubf/7zn2xYyKtTUVHR0tLqkU0BvPZ4FJ2SAQCAw16r4RMAAAAvC0EIAACchiAEAABOQxACAACn/T/aXObrrCvIugAAAJx6VFh0cmRraXRQS0wgcmRraXQgMjAyMi4wOS4xAAB4nHu/b+09BiDgZYAARiBmA2JWIG5gZGNIAIkxQ2gmJjaGDCDNzMzIzqABZnBAaCaYQm6gAYxMDEzMGkzMLAosrAwiDOJBUGPBgO3K6YADi/eH7QNx6lZ822+Q+MkOxNZ6r7W/8cG6fVC2PZBtB1VjD1QDFgfqdQDqBYuLAQB8BCIYnTLl+gAAAOR6VFh0TU9MIHJka2l
0IDIwMjIuMDkuMQAAeJx9UctqAzEMvPsr5gfWSPKrPuSwjySEEi+0m/5D7/1/KifsOoGwkgWSPWI0skG1r+nz9w+byWQMQDsn54wfR0TmippgOJ4vBePSD+vNON/K8o2IoB3qr8h+ma/rDWNE56xE8Z7RkRXKibXN0t1ar1Qk2yyJOOm7T87ljzdAV4FkY0yJpWbOcwjpDdKjYAPu4AJmbMzdDnVU6k3MjpZjmV628NjLMJep7aW6NPFawDWJWuj44+l8kKaGNUIbmjXiM+UzQa3XL9Pc/AMboGO3rkaIHwAAAG16VFh0U01JTEVTIHJka2l0IDIwMjIuMDkuMQAAeJxFjUsOgCAMBa/iUpO26QeohLjqXi/E4dUFsJ28mRcR1/3E1nc00qIpATIpV5cMDYWqOiswJTer54eYSnEX/XeWJGeHNtlCw8SljoPZP/oLklwZ8fp/JwYAAACBelRYdHJka2l0UEtMMSByZGtpdCAyMDIyLjA5LjEAAHice79v7T0GIOBlgABGIGYBYmYgbmBkY0gAiTFDaCYmNoYMkBwzIweDBojBxA3UwMTMwcTEyCDCIG4FNQAMWL4l/93feYx7H4jzoEBy//VLz+ygbHsgGywOVGMPVAMWFwMA3OcZL4rj7fIAAADAelRYdE1PTDEgcmRraXQgMjAyMi4wOS4xAAB4nH1QWw6DIBD85xRzAcnyULufKqZpGiFpbe/Q/94/XdoomjTOQrIsM+wsChm3cH29scIGpQA6WMyMpyMiNSEn6MfzJWKYu36pDOkR5zs8nCgk9sxuTtNSMRhQGc0tN8ahIm1cU5NINH1RtDYzSdfMrmW59zWdbPuH6IS48qoDokfC2vqg8xjDzvNvij7FUKbIYYtVKy58MWRk2+1zW3E+L58nufoA7oZN93HihcEAAABYelRYdFNNSUxFUzEgcmRraXQgMjAyMi4wOS4xAAB4nHN2drb1V6jR0DXUszS3NDPU0TXQMzQ2M9WxBjJMLS2NzS11DPRMTA0sjMx1rOFCuggxmEaoPs0aANxFETxYAqDbAAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'CCC=NOC'\n", + "ShowMols(\n", + " [\n", + " mol := Chem.MolFromSmiles(smiles),\n", + " remove_derivatization_groups(mol=mol)\n", + " ],\n", + " legends=['Derivatized', 'Derivatization group removed'])" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAIAAAC7/QjhAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3dZ1wU1/4G8LNLr1IE6SpFwQLSFETFgqJSolhQCEZNrMQWiZjEXIwmV7021A9JjMk1/MUS0asCNxoLIgoiRVFAEAEREEQEAUHYBfb8XxzvSqwIS5F5vh9fMMPsmd8O7j57Zs6Z5VFKCQAAAFfxO7sAAACAzoQgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJzW+iCsqqqSYB3QBeFPDABc0JogrK+v/+yzz/r3719ZWSnxgqCLqKys7N+//2effVZfX9/ZtQAAtKPWBKG8vHxubm5paemGDRskXhB0Ed99911paemdO3fk5OQ6uxYAgHbEo5S24mEZGRlDhgwhhKSmpg4cOFDSVUEny8rKsrS0FIlEKSkpVlZWnV0OAEA7auU1woEDB3766aeNjY0rV66UbEHQFXzxxRcNDQ0LFixACgJAt9fKHiEhpKKiwszMrKKiIioqys3NTbJlQSeKiory8PBQU1PLzs7W0tLq7HIAANpX60eNamhorFu3jhCycuVKgUAguZKgMzU0NAQEBBBCgoKCkIIAwAVtmke4bNmygQMH5uTkhISESKqgvzlzhnh4kEGDyMSJJDy8XXbxYTl6lEycSAYNIh4e5K+/2mMPu3fvvnPnjrm5ub+/f3u0DwDQ5dC2OXv2LCFEVVW1pKSkjU29LCmJGhrSK1doYyNNSaEmJvT0aQnv4sPy55+0Xz964wZtbKSxsVRPjyYlSXYPpaWlampqhJA///xTsi0DAHRZbb2zzPjx4ydPnlxdXR0UFCSBWG5u+3YSGEicnIiUFLGxIT/8QLZulfAuPiz/+hf54QcyZAiRkiIjR5LAQLJjh2T3sG7dusrKSjc3t0mTJkm2ZQCALksCt1jbtWuXnJzcr7/+mpyc3PbWXsjKIkOGvFi0sSGZmZJsv8VEItHq1avl5OTk5OQWLVrU2NjYKWWQO3fa9YCkpqb++9//lpGR2b59uwSbBQDo4iQQhKampv7+/iKRaOXKlbS1Y1BfQ1aWCIUvFoVCIi8vscZbLDo62traeseOHUKhUCgU/vLLL7a2ttHR0R1fSXsfkJUrVzY1NS1fvrx///4SbBYAoIuTzE23g4KCdHR04uLijh071ta2KCWRkYRSYmVF4uJerI+LI2xOW1QU6ZA+WWFh4Zw5c8aNG3fr1i1DQ8ONGzcGBwcbGxvfunVr3LhxHh4eeXl5HVAGaWwkUVGEkJcPyJUrZMiQF4erbY4ePXrp0iUtLS02EhgAgEMkdbFx7969hBBDQ8Pa2trWt5KURJ2cKCE0LIzm5FBdXRoaSnNz6dGjVF+fpqTQCxcoIdTcnLbnaI7a2tqgoCB5eXlCiKKiYlBQUF1dHfuVQCAIDg5WVVUlhMjKyi5fvryqqqr9KqHR0dTSkhJC//qLJiVRfX0aHk5zc+nvv1NdXXrvHj1wgBJC7ezolSut3smzZ8/69OlDCNm
7d68EawcA+CBILAibmppsbW0JIRs2bGjN4x88oHPmUB6PEkL19Ojx45RSmp5Oly+nbm506VKamEgppefPU2NjSgglhE6dSnNyJFU/IxKJjh49amRkRAjh8XgzZsy4f//+q5sVFxcvXLiQz+cTQnR1dffu3dvU1CTZSmhODp069fkzNTam589TSum1a3TpUurmRlesoBkZlFJ6/DjV06OEUB6PzplDHzxoxa6+++47QsiQIUMaGxsl+hwAAD4AEgtCSumVK1d4PJ6CgsJrw+ONBAIaHExVVSkhVFaWLl9O397HEgpfbC8j8+7tWywpKWn48OGso2xnZxcXF/f27ZOTk52cnNj2tra2V9rQJ/ubmhoaFETl5SkhVEmJBgXR//VHX6+29sX2iorv3v7vioqKlJSUCCExMTFtrbxLaWqihw69WHz8+G/TbxIS6K+/0pMnaXl5x5fW5TQ10UuX6L59NCqKVlc/X3n6NH38+MU2hw5RiX/aA+gaJBmElNIZM2YQQnx9fVu4/d2oKNq37/N+j5cXzc1t6Z6Ki+nChZTPp4RQXV26dy9tQ2/mwYMH4h6enp5ey3t4rAfZu3dvcQ8yPz+/1WVQkYgePUoNDZ/38GbMoC3/SJGbS728nh/Jvn3To6Ja+DgfHx9CyMyZM1tZc5clEFApqReLCQnUzu75z9Om0VGj6LZtdOVKqqtLU1I6pcCuoqGBjh5NJ02i27fTxYupoSG9e5dSSu3saELCi82kpKhA0Fk1ArQrCQdhQUGBoqIij8e7fPny27fMzMycOHGisqysoE8fam7eysny165RB4fn7/729jXNX7ctw675qaioiK/5VYs/EbfYq9cUnz179r6NPEtIoPb2z5+LgwO9du19W6CU0osXqZVVjpGRnJzc6NGjU1NT3755fHw8j8eTl5e/d+9ea3bXlb0pCE+coPb2Lzo3YWHUyakTyus6fvqJTp78YnHbNurlRSmCEDhEwkFIKWXDDu3t7d+0QVlZ2eLFi6WkpAghPXv2jNm/vy2duee9qN69G6SlB5uZvVefLCIiwtjYmJ3bdHd3z215f/R1CgsL/fz8eDweIcTAwCA0NFQkErXkgaw/6mFqSmVkqJ4e3bu3TeegGhtP79/fs2dPQoiUlNTixYvLysretO3QoUMJIevWrWv97rosFoQpKc//hYY+D8Lly+nmzX/bTFr6vU4mS1Btbe2CBQvmzp179erVTimAUkq9ven+/S8Wi4qoujqllNrZ0dDQFwcQQQjdl+SDsKamZuLEidHR0a/+qqGhYe/evew9WlpaeuHChW95j37fvV7dto31yZSUlDZu3Pj2Ptnt27ddXV1ZBFpYWJw5c0YyZVB68eJF8VcXOTs737hx4y0bP3v2bOPGjewSnby8fNa2bbSmRiJlPHnyJDAwUFZWlhCipqa2efNmwevexa5cueLh4VEjoZ12LQIB5fGoj8/zf66uz4PQx4e+NDhWVbV1g4za6Ny5c+LPYdLS0oGBgU+fPu34Mui4cfTkyReLz55RPp8KhdTOjrq6vjiAPB6CELoryQchk5KSsmjRolmzZv3zn/9sbGz08/OLiYmxsLBgL3tXV9fbt29LfKct6ZOVl5cvX76c9Uc1NDSCg4MlPlSyqakpNDRUW1ubEMLn8/38/B4+fPjqZhEREX379hX3R/Py8iRbBn0l7xMTEysrK9evX+/t7b1s2bI7d+7s37//xIkTEt9vl/CmU6NffUW/+urF+vJyKi9Pm5o6ciRIVlaW+CZ26urqOjo67D+toaHh4cOHW3giQQLYU543jwYHv1iZkUENDSnFqVHgkHYJwtLSUh0dnWPHjl2/fv3//u//KKVpaWlZWVlycnJmZmZHjx5tj52KxcTEDPnfrciGDRuW8L8Xs1AoFPdHZWRkJNkffR3WJ5OTkxP3yerr69mvbty4MWrUqPboj77WuXPnBg4cKCsrm5WVNW3atM8///zGjRuRkZF379598ODBo0eP2nX
vneZNQXjrFjUwoMXFz9evXk3nz6eNjdTBgQYG0ve/Qvxe3vS/4tq1aw4ODuy/hL29fXx8fLuWQSml587RQYNoYiK9cIFaWNCKCkopbWqic+bQr7+mFEEIHNIuQRgTE2Nqatr8g+2CBQuePn168eLF156gkzjWJ+vVq5e4T/bNN9+I+6MuLi5paWkdUAal9M6dO+7u7my/pqama9eube/+6GsJBIKLFy9SSo2MjJpPCwkLC4uMjOyAAjrBW0aNhoTQ3r2pmxu1sqIuLrSsjJ49+3wOq6EhPXSItkOf7NX/ky+dJxCJRKGhoTo6OmwEsp+fn+S/0YXJzKQTJz4fljVnDqWUbthAe/emHh50wAA6dSplZ2gRhMAZ7RKE1dXVFhYWY8aMOXr0KEs+fX39x83nJHWIJ0+erFq1SkZGRnwbnf79+0e1eF6BBEVFRTW/gaeMjMyqVauePHnS8ZWsXbtWX19/06ZNxcXFbHHnzp0dX0YHaf5frqGBVla+WGxspDk5f5uBmphIHR3FI5CpRPtkzc9SvP3KcU1NTVBQEOsyKikpNb+rkQQ8eUIDA6msLCWEqqnRzZvp/85S0IYGmp39t0vUlZW0oeHFYoe/fgE6THtdI6ytrQ0LCxs9erSNjU1NTU2nBCGTlZVla2urpaXl5eUlFAo7pQZKqVAo9PLy0tLSsrW1zcrK6qwyKKXx8fFLly5VV1ePiYnp5kH4vkQiGhpKdXSez+P086Nt7pO1bizx3bt32ZRcdiJBAlcTmppoaCjV1qaEUD6f+vnR0tK2tgnQXUg4CPPz85u/y4tEIktLy9OnT3diEMJr/etf//Lx8WFBmJWV1ab7AHQz73tnnzdgs0sVFBRaPbv0woULgwcPZnE4duzYW7dutaIMSp/PLn3e2R09mr5rdikA10gsCGtrazdv3qysrDx8+PDi4mJ28qegoEBHRyc9PR1B2EVER0fX1dUJhcKPP/74q6++YkE4atQoBQWFwMDAVtxMoNu6e5d6erLwODx16snmEwzeRYL3G2IzjrS0tMQzjt5rcFNeXt6ORYueX/7s04eGh7euDIDuTQJBKBKJDh48aGBgwF72Pj4+165dc3NzGzBggJWV1b59+yilfn5+eJPtdE1NTUuXLrW0tDQ3N583b151dfX+/fsPHz7s4+MjPnd38ODBjhu+3/WdPVvh4NBTU5P1yW7evPnOR7x0B9p33mKpJSoqKpYvXy4tLc2mWwQHBzc0v3r3Os3vdpQ9diz9/vvOumkAQNfX1iBMTk4eMWIEe9nb2Ni89mUvFAp37tz5zTfftHFf0EY//vjjm266lpiY6OjoKB6+/84bjnNH8z4ZG+35pj5Ze38nSWZmpnj2obm5+Z9v+CayFn6DCgCItT4I2ctefKe0t8wESE9Pl5aWlpaWTk9Pb/XuoI0yMjKkpaVlZWWLxVPo/u6l4fszZswoKCjo4CK7rJf6ZC/dqUcoFIq/pVJGRqZdv6UyIiLCxMREfB+GnL9/E1lSUlK7fCMKQLfWmiBsxcve39+fnVxqVZEgAewWM/7+/m/fjA3fF9+sTsLD9z9wWVlZkydPFk/F+e9//0spPXXq1FuSqT28+gJ88uRJR3xHJkA39d5B+PYPpG9SUVGhqalJCDl16tT7FwltdfLkSdaVaeHNdO7fv+/n58f+yoaGhqGhoe1d4QfkxIkT4pcAGxRKCBk0aNB59uXJHaW4uPiTTz5hF3elpKTYZxd5efmvvvqqc+5ZCvDBeo8gbH6Jon///m+6RPEmu3btIoSYmJiI7zQGHUMgEPTr148Qsnv37vd6YHR0tKWlJfuLjxkzpiVDRTiC9cmUlJR4PB6fz9+5c+c7R6+0k+TkZHY5UFZWtu3foALATS0KwlYMWntVQ0MDmxS1ZcuW968TWm/Lli2EEAsLi1bcT4DdGKz5UJFSTMT+n5s3b4aGhnbAudC3a2pqOnjwYEREROeWAfDhekcQtnEa00vOnz9PCFFRUXnTeA2
QuNLS0h49ehBC2nJr74qKCvGXOr06VAQA4IP2tiCU2I0tmvH09CSEzJ8/v+1NQUvMnz+fEOLp6dn2pl47VAQA4EPHo5SS17lx44aNjQ271eG2bds++uij1272vnJzcwcOHNjQ0JCQkGBvby+RNuFNbty4YWdnJy0tnZaWxi4Ttt2pU6cCAgJycnIIIdevX7e2tpZIswAAneWNQUgI8fPzGzx48IoVK9i98CVlzZo1W7dudXR0jIuLY2PeoD1QSp2dnS9fvrxmzRp2mVBSBALBrl270tLSDhw4IMFmAQA6xduCsJ08ffq0f//+JSUlhw4dmj17dgfvnTsOHTrk6+urra2dnZ3NLhMCAMCr+B2/SxUVlY0bNxJC1qxZU1tb2/EFcEFdXd3XX39NCNm0aRNSEADgLTohCAkh8+bNs7e3Lyoq2rp1a6cU0O1t2bLl/v371tbWc+fO7exaAAC6tE44NcpcvXrVyclJXl4+MzOTfWENSEpRUZG5uXltbe2lS5dGjRrV2eUAAHRpndMjJIQ4Ojp6e3vX1dWtXbu2s2rorgICAmpra2fPno0UBAB4p07rERJ0XNpHfHz8iBEj0NUGAGihTusREkIMDAwCAgIIIStXrmxqaurESroNkUi0YsUKSmlgYCBSEACgJTqzR0gIqaurs7CwuH///r59+z777LNOrKR72Ldv38KFCw0MDLKyspSUlDq7HACAD0Bn9ggJIQoKCps3b9bS0hJ/nQ20haKiopaW1tatW5GCAAAt1Mk9QkIIpfTp06fsW0ah7aqrq1VUVHDLHgCAFur8IAQAAOhEnXxqFAAAoHMhCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnIQgBAIDTEIQAAMBpCEIAAOA0BCEAAHAaghAAADgNQQgAAJyGIAQAAE5DEAIAAKchCAEAgNMQhAAAwGkIQgAA4DQEIQAAcBqCEAAAOA1BCAAAnIYgBAAATkMQAgAApyEIAQCA0xCEAADAaQhCAADgNAQhAABwGoIQAAA4DUEIAACchiAEAABOQxACAACnSa1fv76za4BuqKioKDExMS8vLy8vr7y8XEVFRU5O7n0buXr1Kp/PV1VVbXs9qampNTU1GhoabWwnLy/v3r17urq6bS8J3ousrKympqa9vX1nFwLdEHqE0C4iIiLGjx8/c+bMmTNnOjg4aGpqzpo1q7y8/L0acXd3P3r0aOsKyMnJaf4hb9GiRTt37mxFO5TSJUuW1NbWssWQkJB58+a1riQA6JoQhF1XdnZ2eHh4Q0NDZxfSemVlZRUVFdXV1VFRUfHx8a6urkKhsOUPv3///ueff966Xd+6dat5iF68eHHHjh2taKesrOznn3+ur69ni//85z+vXr3aupIAoGuS7uwCWkMgEAQFBbGf+Xy+pqbmsGHDnJyceDze+za1c+fO3r17e3l5Saq2kJAQ1vtpe1ORkZEBAQFVVVUyMjJtb60TKSkpubq6Hjt2bNiwYYcPH/7kk08IIfX19WfPns3JydHX158wYYK6urpAIIiIiPDw8IiMjCwsLJw1a9aVK1cGDx5sYWGRkJDw7NmzsWPHsgYppcePHx86dKiRkdHt27fj4uKqqqr69Onj6ekpKytLCImKijpz5kx1dXV4eDghxNXVNS4uTlNTc+jQofn5+UlJSc3Lc3NzU1RUfPjw4fnz5x88eKCjo+Pu7q6pqUkISUhIiI+PJ4ScOnV
KRUXFxsamsrLy4cOHbm5u7LH5+fmxsbHl5eUWFhbjxo1jf6nTp09bWVkJBILIyEgejzdp0iRTU9M3HRyBQBAdHZ2VldWjR4+ePXsSQjw9PbOzs0tLS83NzY8cOaKkpDR//nxCSFFRUWxs7MOHD83MzCZMmMBONV+4cEFNTc3W1pa1FhcXRwhxcnJKS0urq6vT19f/66+/hELh8OHDLS0tX1tAVFSUvb19ZmZmYmLipEmTBg8eTCmNjY29deuWkpLSmDFj+vbty56UpaVlTU3Nn3/+KSUl5eHh0bdv3zt37pw+fVpOTs7d3d3Q0FD817l8+fKtW7fk5OQcHR0HDRpECKmqqjp79qyzs7O2tjbbLD09vbi4eMKECYSQnJyc2NjYmpoaa2vrkSNHimsrKiq6cuVKYWGhnp4epbSF/98A3hv9AFVVVRFCTExMXFxcRo0aZWJiQghxcHAoLCx836bGjh375ZdftqWYH374YevWreJFd3d3f3//tjQotm3bNkJIVVWVRFrrYCEhIYSQxsbG5itNTEy8vb0ppQ8fPuzfv7++vr6vr6+pqamhoWF5eXlpaSkhZOTIkfr6+lZWVjk5ORoaGjt27KCU7tmzR15eXnwo4uLieDzevXv3oqOj1dTUPDw8/Pz89PT0bGxshEIhpXTWrFlGRkZKSkouLi4uLi75+flDhw5dunQppfSvv/5y+Z9evXopKSlVVFRUVlb26NFj3Lhxfn5+lpaW6urq9+7do5R+//33LD+cnZ1dXFwiIyO/+OILS0tLVsaRI0dkZGSGDRs2c+bMnj172trasgpNTEwmTpxoZGTk6+tra2uroKBw+/bt1x6le/fumZmZ9erVy9vb283NTVNT08nJiVK6Y8eOPn369OnTZ/DgwV5eXpTS06dPKygoWFtbe3t76+joWFhYPHr0iFJqb2+/cOFCcYMeHh6enp6U0rVr1/bt21dbW3v69Onjxo3j8Xg7d+58bQ0GBgYjR45UV1e3sbGJiIhobGz09PRUVlb29vZ2cHCQk5OLj4+nlJqZmY0dO9bIyOjjjz+2tLRUUVHZsGFD79692aKqqmp+fj6llD1cUVHRy8vLxcWFz+dv2rSJUtrQ0KCtrf3dd9+J9zt69OjFixdTSn/77TcpKakxY8bMmDFDRkbmq6++YhscOHBATk7O2tra19d31KhRPB4vJCSkxf8BAd7DBxyE33//vXhNTEyMlpaWpaWlQCDo4GI8PDxWrFjRHi13vyB0cXFxdHSklC5cuNDAwODJkyeU0srKSg0Nje+//54F4aeffireXhyEjx49kpGRCQsLY+tXrFgxcuRISmlTU1N9fT1befv2bULIX3/9xRZXrVplYWEhbkochGJ3795VVlb+5Zdf2OKzZ8/YDwKBQFdX99tvv2WLx44dI4Q8fvyYLYqDsKampkePHgsWLGDr79+/r6Ki8vXXX1NKTUxMHBwcqqurKaVCoVBHR0f85v6S8ePHm5mZiRufO3euOAj5fP758+fZ+oaGBj09vRkzZjQ1NbGjoaWltWTJEvrWIOTz+VevXmXrAwMDZWVlWXa+xMDAYPjw4XV1dWzx999/l5KSun79OqVUJBKNGjVq0qRJlFIzMzMHB4enT59SSmtra1VUVIYNGyZeVFVV3bx5M6U0NDSUx+Ox7KSUbtu2jc/nZ2dnU0qXLFlibm7O1peUlEhJSV26dKm8vFxBQeGbb75h63fv3i0vL19WVlZYWCgrK7ts2TKRSMR+JSMjgyCEdvJBnhp9lbOzc0hIyMyZM8PDw319fQkhIpEoIiIiKSmJx+ONHj3axcWFEJKZmZmWlubm5vbvf/+7rKzss88+u3Lliq6u7pgxY3Jzcy9cuDBlyhTxqZuSkpLIyMipU6dqaWmlpKRcvny5tLRUT09vypQp7CzQnTt3Ll26lJGRUVtb+8svvxBC5syZExkZ2aNHjwkTJlRXVx85cqR5kb169froo48IIU1NTadOnUpOTubz+WP
HjhWf8SOEVFVVHT9+PDs7W01N7ebNmx11/DoIpZSdvo6Li+vTp4/4Gp62tnZSUtKCBQsIIe7u7q8+UEtLy8XF5Y8//vD19RWJRMeOHfvmm28IIXw+X1ZW9saNG4WFhfX19Twe7/79+y2ppKGhwdfX18XFhe2UEKKgoJCXl5eVlVVbW9ujR493tpOenl5VVbV48WK2aGRk5O7ufvnyZbbo7OysoqJCCJGRkenbt29xcfFra4iJidmxYwc7DfsSRUXFcePGsZ/z8vKKi4sXLVrE5/PZ0ZgxY0ZsbOzbK+zdu7eDgwP7ed68eVu2bLl165a4zebGjx8vLy/Pfo6Pj2d/DnYCWVVVNSEhgf1q5MiRysrKrDZ9fX1HR0fxooGBQUlJCSEkLi5uyJAhjo6O7CFLliz58ssv4+LizMzMfHx8fvrpp1u3bllaWh47dkxPT2/EiBHR0dF1dXXS0tLsFfT48eP6+vq0tLTi4mKhULh+/fpWXO8AeF/dJAgJIVOmTJGRkTlz5oyvr69AIJg8efKVK1cmTJjw7NmzH374Yf369UFBQbGxsQEBARYWFk+ePCGE+Pr6rl+/3snJacyYMYqKikuXLn369Onq1atZg7/++uvOnTs/+eSTEydO+Pr6Ojk5aWpqHj9+PDAwMDY21s7OLj8/Pzw8vKCgoKGhgV2L8vb23rRpk4mJyYQJE6qqqthrmxAiEAjS09OnTJny0Ucf1dfXT5w48erVq66urk+fPv3hhx82bty4bt06QkhiYqK7uzuldPjw4UKhMDU1tZOOZXu5d+/esGHDCCElJSWKioopKSls/ahRo4yNjd/+WB8fn/nz55eXl9++fbu0tHTatGmEkLt373700Ufl5eVWVlZqamqEkKamppZUsm7duqKiov/+979ssba2dubMmTExMfb29j179iwvLxeJRG9vgb3v9+rVS7xGR0cnMTHx1S1Zer2qqKiooaGBXYFrxb5eG67NNY8QdnAqKireua+HD2tjk90AAAoSSURBVB9KS0uL/zR6enrTpk179Wi89KTEiyUlJeKPkoQQRUVFVVVVVqqTk1Pfvn3/+OMPS0vLo0ePzpo1i8/nP3z4kBBSUFDAniMhZOHChcrKyvn5+Wpqam2f7gLQEt0nCGVkZAwMDPLz8wkhu3fvjomJiY6OdnZ2JoSsWLFi69atbPxhTU3N7NmzV61a9dLDdXV1R48efeTIEXEQhoeHT5s2jQ0EePjwIZvNJhAITExMgoODw8LCXF1dXV1d1dXVvby8goODX2rQ0NAwOTmZ/bxy5crCwkI2fD84OPjy5cuXLl0aMWIEIcTf33/Lli3+/v5KSkozZszQ1ta+ePGilpYWIWT79u0BAQHtdbw63LVr1/Ly8r777jtCiJGRkY2Nzc8//9x8g0ePHr3l4VOnTl28ePGJEydu3rw5fvx49m67YsUKTU3N1NRUNkZGSkqqJZXExsZu3749MjKSDU4hhOzatevatWu5ubk6OjqEkNGjR7+zEXZWoLCwUF9fn60pKCgwMjJqSQGMtrY2j8crKysTr3nTkFrWbGFhIRt4Qgi5f/9+7969CSE8Hq8l2Z+VlUUIecuYHTFDQ8Pc3Ny9e/e24Bm8/uGXLl0SL1ZWVlZXV7P6eTzerFmzjhw54u/vHxcXt2vXLkKIgYEBISQgIGDAgAHN20lNTa2urq6vr2ddVXaOvXUlAbxTt5o+IScn19jYSAiJjo42NjZWVlZOSUlJSUmxsLCora29ceMG28zb2/u1D/fx8UlOTr579y4hJCsrKy0tbfbs2YQQGRkZVVXVxsbGBw8eZGRksHeKlld19uzZ3bt3//jjj3369GG1mZmZKSgosNoGDBhQU1OTmpoaGxtbUFDw7bffshTsToRCYXR0tLe3t729PTv4Li4uR44cER/GR48eiSfqvYmSkpKnp+fhw4ePHz8uHpRbUlJibm7OUpBdHRRvr6mpWVpaKhAIXmqnsrLSz89vxYoVkyZNEq8
sLi7W1tZmXa6cnJycnBzxr1hYFhYWvtTOoEGDdHR0tm/fzv7LpaWlRUVFsTGQLaSkpOTg4LBv3z42N+POnTviHupLevfubWZmtnPnTvZ0cnNzw8PDx48fTwgxMDBgI3EIIZmZmdeuXXv14U+fPg0KCjIzMxs8ePA7q3JxccnIyIiIiGCLdXV1rNPWQuPHj8/IyIiMjGSLmzZtkpWVFX+w8PHxycvLW7NmTb9+/aytrQkhdnZ26urqW7ZsYXFOKb137x4hxNnZmc/n//jjj+yBu3fvZscZoD10nx4hpbSgoMDCwoIQkp+fX1JSMnPmTPFvjY2Nq6ur397C9OnTP//88z/++GPdunVHjhzR1dVlHcqysrI1a9YcOXKkvr5eXV29pqbmTSPRX/X48eO5c+f6+Pj4+PiwNW+qjfUMxB/5uwcW6jU1NU1NTdOmTfv555/ZBIOgoKDk5GRLS0tbW9u6urrMzMzw8HDxHIA38fHx8fDwkJeXnzJlClvz6aefst52ZWUlpZR1Lxhvb+9t27ZZWFhISUmJ39YJIZs3by4oKDh79qydnR1bs2fPHj8/v99++83BwUFFRSU/P795x87BwcHKysrFxUVfX7/5iQQ5ObmwsLDp06ebmpoaGBhcv37dxcXl1TMNbxcSEuLp6WlmZmZiYpKVldW8/ub4fP6BAwc8PT1NTEyMjY2vX79uZ2f37bffEkL8/f0nT55sZWWloaFRUFDA0oXJz8+3sLDQ0dFJT09XVlY+efKktPS7X+9TpkxZtmyZl5eXjY2NrKxsWlra6tWr//GPf7TwGU2ZMmXp0qXs4TU1NXl5eT///LP4eQ0aNMjS0vLgwYMbNmxga5SVlcPCwmbPnh0bG9u3b9/c3Fw1NbXU1NR+/fpt2rQpMDDw0KFDjY2NSkpK4p43gMR1nyBMSkp69uwZm4Skqampp6d34cKFl7Z5+wkfVVXVSZMmHTp0aN26deHh4bNmzWKn2hYuXJiQkHD69OkRI0ZIS0t7eHiIr2e806effiotLb1nzx7xGk1NzT59+pw9e/alLdnoxFd7MB8oT0/Pfv36sZ8VFBTMzc2bDwlRVlaOjo6+efNmVlaWiooKu/WMUCg8d+6clZWVeLPjx483v3bo6up67tw5VVVV8U3XPv/882HDhqWkpBgbG48dOzY9PV18ttPU1DQ7OzsuLk5dXd3U1HTPnj3sUXPnzmUjp8T69eunqal5+/btCxcuaGhojB07trKysqamhv1WTk4uISHh4sWLlNIRI0YMHz58+vTp7Ffjxo0rKChISkp6/PjxgAEDxB9iQkNDm1/PCw4OVlBQeO1Rsra2vn37dkJCQnV19YgRIwIDA1kvedq0aUOGDGm+5bBhw/Ly8pKTk9nMEysrK3YJcOzYsZmZmfHx8crKyi4uLmVlZWxMNSFEV1d3z549paWlpqam1tbWrN/8qkOHDr2UMbt37161atX169f5fL6lpSWbnvT77783P1fx66+/Nv+D7tu3T11dnf0cEhKyevXqGzduKCkp2dravnSG448//igqKmoe2JMnT2aHsbS01MTEZOjQoeypBQQETJ8+PTk5WUtLy9HRMT09vfnVRwBJ6qzhqm3x6vSJJ0+e2NnZsWEOlFJ/f39lZeXS0tKXHsguSj148EC8xszMbO7cueLF48ePE0IOHTpECElMTGQrtbW12VB1xtnZ2dbWVryoo6OzaNEi8aK1tfX06dPZzz/++COfz2fvoWKLFi1SUVEpKyt7qbb09HRCSPMpiWya/wc6fQJaYdy4cbNnz5ZIU2vXrjU2NpZIUwDd3gfcI7x8+fKWLVsopQUFBf/5z3+ePXt28uRJNszsiy++CAsLmzBhwpo1a3r06JGRkXHz5s2DBw++s83JkyerqaktW7bM1NRUfHtfCwuLM2fOnDlzpqmpaf/+/fHx8c1Pjdrb2//nP/9xdHSsrq5eunSpeH12dnZAQICtrW12dnZ2djYhREZGZt68eatXrz5
8+PCECRMCAgJUVVXT09MzMjIOHDgwcOBAV1dXNl7c3Nz85MmTb7piBN1GcHAwm96jqKgYExNz8eLFVt9bFQBa7YMMQikpKVtb28ePH7NJCxoaGgsXLly0aJH4DI+xsXFcXFxQUNCaNWukpKT69evn4+MjEom0tLRsbW2b37FswIABbPQdIy8v7+/vf+bMGT8/P/HK3377bfHixdOmTdPQ0FiyZImbm1t0dLT4tyEhIStXrty4caORkdGsWbPMzc3ZFZG7d+9aWFiIRCLxJApFRcV58+aZmZmx2r788ktpael+/fp9/PHHIpGIz+cfO3bsH//4R1hYWGlpqYuLS0RExNdff93CkZDwIdLX19+3b9+WLVuePHliZmb2008/sWkhbaegoNCjRw+JNAXQ7fEoBiUDAACHdavpEwAAAO8LQQgAAJyGIAQAAE5DEAIAAKf9P1Ji9Tin2F7QAAAAz3pUWHRyZGtpdFBLTCByZGtpdCAyMDIyLjA5LjEAAHice79v7T0GIOBlgABGKJsHiBsY2RgSQGLMfAoKQJoFxoXRHAwgYSYIlwmDhkrj0AyjuYF2MjIBGQyMLAwsrAysbAxs7AzsHAwcnAycXAyc3AycPAwiDOLLoK4DA17NrVMOZD+ZZw/iBGVVHjjql7MXwvY8ELFy7X4Q+6HbsgNvjvhB2WoHRKwN7aHi+2Hqze6xLEUSt4eJA9U7wMSB5jvAxIH2OiDs9XRAsssBZpcYAGqqPNf8ArXxAAABQHpUWHRNT0wgcmRraXQgMjAyMi4wOS4xAAB4nJVT227DIAx9z1f4B4IwBgIPe8ilmqatibR2/Ye+9/81OxEz3bopgzjCzuEIznEakPE+vV5v8DXc1DQA9o8n5wwXstY2R5AFDIfnlxnGcz+Uyrh8zOcTIAE63sPzHtufl2OpIIzQehN9h+gBjfNIPoI1dh261cHpCi2ZlFdky4jQxewqKFz6tydfNpAwk8GNuUVDIXZc/0ntBRkM5ojBCXOy35gLMsACrTMhC4K/M4psegCMQsm3ydlSenjYguwEqeXfKRMDdzFmPuSeM6IVSf+hKIpZe7xi20fYJT2KS7ukP8zTXeNsrTQs86StJNNpw3ACpO0gqVfPJQ1qLHJEtQ85OvUIOZI6gRxZ9UaJWldcX1gJt1VcpdBWofp+9W0kLz8Vr5tPsQexA2k/LwQAAACVelRYdFNNSUxFUyByZGtpdCAyMDIyLjA5LjEAAHicbY4xDoAwCEWv4qhJS6AtbYkjB3BwNB7AO3h4i4uamjDA4/+fr9t67KNObRZVXZ5Th3P0CXIqRI4gJIrsZh+hihGPgFyyhHhDumWeIHIuhhhIMrHpKpqusQAsFatDMF9b5mYIIpg+cfgS/P37mL9SXfW+ZN9xOi/RMzgX2uomZwAAAHl6VFh0cmRraXRQS0wxIHJka2l0IDIwMjIuMDkuMQAAeJx7v2/tPQYg4GWAAEYgZgViFiBuYORgSACJMbKBaSYmdBomzw3Ux8jEwMTMwMzCIMIg7gY1CQxYH7qpHZg1c6YdiPPQbdn+tLRn+2CSSOL2MHGgegeYuBgA+ggZu16bmz8AAADDelRYdE1PTDEgcmRraXQgMjAyMi4wOS4xAAB4nI1QSQ7CMAy85xXzgUbOBs2xbSqEUBMJCn/gzv+FIwhpD0W1Y8m2xqOZCOS4hsvzhV/oIARAf573Hg9DRGJCbtCPp3PEMHd92QzpHucbHCxfcK6R3ZymslFIaLR0vqXjASQzL/GFpG9TgBoDGiW192RaNCSt20AaRtbtNqNl3C5Cxxr3SBxjWHn7uO1TDNVtTl098QBThSsuW/UpLrdkX3Llufw59+INAjhXi+HGZdgAAABQelRYdFNNSUxFUzEgcmRraXQgMjAyMi4wOS4xAAB4nPN3dnb2V6jR0DXSM7W0MLDQMdAz1rHWNdQzsrQ0MNHRNdAzMdWxNoAIo4kia9GsAQBHyA9E7zip7AAAAABJRU5ErkJ
ggg==", + "text/plain": [ + "" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'C[Si](C)(C)OCCCO[Si](C)(C)C'\n", + "ShowMols(\n", + " [\n", + " mol := Chem.MolFromSmiles(smiles),\n", + " remove_derivatization_groups(mol=mol)\n", + " ],\n", + " legends=['Derivatized', 'Derivatization group removed'])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAIAAAC7/QjhAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3deVyM+R8A8M9MJ0UqXY4OQlKREiqhsq6k1IjSWvw2x5J77bpid13LrrKs1S4r13bIUcgdIpKQ5MiRpJRKpXOmms/vj+8YOZaoZmzzeb+8vHqO+T6fmXnm+Tzf43keDiICIYQQIqu40g6AEEIIkSZKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiVCQgghMo0SISGEEJlGiZAQQohMo0RICCFEplEiJIQQItMoERJCCJFplAgJIYTINEqEhBBCZBolQkIIITKNEiEhhBCZRomQEEKITKNESAghRKZRIiSEECLTKBESQgiRaZQICSGEyDRKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiVCQgghMo0SISGEEJlGiZAQQohMo0RICCFEplEiJIQQItMoERJCCJGg0lIoKZF2EK+Rl3YAhBBCZENSEvzvfyAvDwBQXQ1//QVWVtKOCQCAg4jSjoEQQkhTJxBAt27wyy/g6goAcOAAzJ8PqamgqCjtyKhplBBCiATEx4OamigLAoCbG7RsCRcuSDUmEUqEhBBCGl9GBhgavjanQwfIyJBOMK+jPkJCCCGNr3VrKCp6bU5hIbRuDRs2wJMnMGYM9Owppcioj5AQQogE5OaCmRmkpICuLgBATg6Ym0NqKtjbw717AACGhuDqCuPHSz4jyi1btkzCmySEECJDCguhqAh0dUFBAaZPB6EQEhLgm29gzhxwdAQrK1BRgceP4ckTSEiA4GDYvx8KC6FtW1BXl0yAVCMkhBDSaNLTYfhwkJeHuDhQU4MbNyA+HgDA1hYsLF6tVlMDZ8/CP//Avn3w/Llopo3Ni3HjXri7t2vXrlFjpERICCGkcSQmwogRkJsLFhZw9Cjo6X34JTU1cPEi7NwJoaHw4sVmB4fp58/37duXx+N5eXnpsmbVhkaJkBBCSCM4ehRGj4aSEnB2hr17QU3t415eUQFHjvwYG7tq27aKigoAkJOTc3JyGjNmjLu7e6tWrRowUkqEhBBCGtqff8K0aVBdDRMmwJYtoKDwySVVVFScPHly586dBw8eFAgEACAnJzdw4EBfX193d/cWLVrUP1hKhIQQIhN27Nhx+PBhR0dHHx8fVVXVRtqKUAhVi5crrVoGHA78+CMsWtRQJRcWFu7bty8sLOz06dM1NTUAoKamlpOTo6ysXM+SKRESQkjT9/fff0+cOFFOTq6mpkZZWdnZ2ZnH43l4eKioqDTgVvh8GD8elFMS/84ezPltA4wb14CFiz1//vzQoUM7d+5s3rz5wYM
H618gJUJCCGnifvzxx4CAAEQ0NDRs165dfHy8UCgEgJYtW7q5uY0ZM8bZ2VmhHq2XTF4euLrCpUugrg7nDhaa9Wv0ix+qq6vl5RvgtjCUCAkhpMmqrq6ePn36li1b5OTkAgMDp0+fDgBPnjyJjIyMiIiIj49nKUBdXd3FxYXH4w0dOvTTUsuDBzBsGKSlgZERHD4MXbs28BtpVJQICSGkaSotLR09enRMTIyKisqePXtcXV0BoLy8vHnz5myFjIyMAwcOREREXLhwAQA4HG63bhW2toq+vmBnBxxOXTd08SKMHAl5edCrF0RHg45O47yfRkOJkBBCmqDs7GwXF5dr167p6upGR0dbW1uz+T4+PvHx8V5eXuPHj+/6suKWmpoaGhp65Yr80aMBbI6BAXh5wZgxYGn5gQ3t3Qu+vlBZCa6u8M8/8DLJ/pdQIiSEyKiysrKGHSrybnl5sGkTJCeDpiYMGQKeno2+RYCUlJThw4dnZmaampoePnzY8OVjH4RCYefOnR88eMAme/ToMWbMGC8vL/EKqakQEQG7dsHLVUR3AP3qK1FG/O47SEmB6GjgcgEAzMwgKAiGD4fx42HTJmiIDjtpQEIIkT179+7V0dG5fv16426mshJNTHDZMkxNxdhY7N4dN21q3C0iHj9+vGXLlgBgZ2eXl5f3xtKampq4uDh/f38tLS1xIjA1NV29enVWVhZbRyjEuDicPh11dBBA9M/CArdswUmTUFsb//hDVFrz5sjnY2pqY7+nxkWJkBAii8aOHQsA+vr64qN/o9i6FV1cXk2mpmLr1igUNuYGt7Lxnzwer6Ki4j1r8vn8qKio2tcUtm5t4ugo/OMPFGfPmhqMi0N/f9TWRgCcNw8nTcK1a7F9e8zJQXyZCP/rqGmUECKLBALBkCFDYmNjzc3N4+Li1D72BmB1NGsW6OjA99+/mqOhASkp0LZtg28KEZcvX758+XIA8Pf3DwwM5NRtuEtFRcWhQ4dCQ0OLisacPs0DADk56NMHeDzw8YHWrQEABAI4fhw6d4affwZHR8jMhJQU2LULVFSgsBAUFRv83UiWtDMxIYRIR0FBQZcuXQBg6NChVVVVjbKN6dNxzZrX5mhp4aNHGBuLz5414HYqKyt9fHwAQF5e/g9xw+VHKivD8HB0cUFFRVFzqJISurhgSAi+eCFaZ9Ik3L0b+Xw0NcUzZ5pIjZAr7URMCCHSoaGhERMTo6OjExMTM3Xq1EbZhokJXLv2ajIjA/h80NUFDw/Q0wN7ewgKgvz8em6ksLBw8ODBu3fvVlVVjYqKmjx58qeV07w58HgQHQ1Pn0JwMDg6QnU1HDoE48eDnh6Ehr5aU1ERNm2COXOgaTQpUiIkhMguIyOj6OhoFRWVv/76a82aNQ2/gS+/hPPnYc8eqKqC7GyYPBnmzoWCArC1BTk5uHABZs2Cdu3AzQ3++QfKyj5hC+np6ba2tmfPnm3Tps25c+eGDh1a/6g1NODrr+HUKcjNhZAQcHaG8nIwM3ttnQEDoGtXqKio/9Y+A9KukhJCiJRFRUXJyclxOJydO3c2TImlpejmhgcOICLeuoVjx6KFBdrb4+rVWF0tWqewEENC0MUFFRREDZHKyqKGyNLSOm4nISFBR0cHAMzNzR8/ftwwwb9LZqboj5AQTEgQ/f30Kfr5YSM1KktSow2WQYSzZyE5GVq0gIEDwcioUbbyXrGxsd9+++2ePXs6deok+a1/vh4+hDNnoKQEevQAB4ePuHsEIU3X+vXr58yZo6ioePTo0YEDB9arrJwccHGBpCQwMIC0tA+PJMnNhYgICA2F+HjW1PjL4MEpurpeXl7vvwXogQMHfHx8ysvLBw0atHfvXnbJBPkUjZVhfXywf3/csgVXrEA9PYyObqwNvcv9+/fd3NzYG5w6daokN/25i47GNm1w1SrcsgX79cNx46QdECGfC39/fwDQ0NC4c+fOp5eSmooGBgiAHTrgx5bz+DGuXYvW1l07dmS
HL3V1dV9f36ioqLfH8gQGBnK5XACYOHGiQCD49IBJY11HGBuLJiavWgDi4tDQEGtqGmVbrystLQ0ICGCPp1JRUQkICHj/lTSypboaDQwwPl40WVWFnTrhmTNSjUm2xcbiypW4ciWePi2ak5iIx469WiE6GpOTpRKaDKqpqXF3dwcAIyOjHHaV3Mc6dQpbtUIA7NMHc3M/OZJHjx4FBgba2dmJayyampp+fn5xcXFCoVAoFC5YsAAAOBwOe6YEqafGSYQ//IDz5r02R0sLMzIaZVsvCYXC8PDw9u3bs/2Dx+NlZGQgokAgmDp1qru7++7duxs1gA/avXu3u7v71KlTpXb6lp6OOjqvzZkzB3/6STrBkMWL0doad+zAkBC0tMTFixERg4Jw+vRX64wfj1u3SitAGVReXt6nTx8AsLa2Lq1zRx0jDAkRXXbg6YkNdP6dkpKyePFiY2NjcUY0MDDo1q0bACgpKe3Zs6dBtkIaJxHOmYM//PDaHGNjvHatUbaFiIiXL1+2tbVlO0qvXr0uXLjA5p84cYLtNCw7+vr6Pn36tPHC+Df5+fn+/v7ii1s7deoUHh4u+TDw6lXs1Om1OcuX45w5UoiEZGSglhYWFoomCwpQUxMzMykRSl1eXh5LPC4uLtXiZq0PCQwMnNivHwKgv39jtH7dvHkzICCgY8eOLBeqqKicobachtPQibC6GqurccMGHD/+1czycmzWDIuKsBFqQllZ6OeH/ftPA4A2bdps2bKlpqYGEW/fvj1kyBBx4unSpQvrc1ZTU1u7di1fUpeA8vn8tWvXsptWKCgodOnSRTxyZ8iQIbdv35ZMGIiIAgE+f47Nm792rurrixs3ir41Ikn79+OQIa/N+eILPHAAg4Jw2DDct0/0b+BASoSSd/v2bQ0NDQCYPXv2B1cWCAQTJkwAADk5ucTt2xs1MKFQuHHjRgDo0aNHo25I1jRoIjxzBrt3x82b8elT1NbGGzdE85csQU9PFAqxf3/09cUGqpOVl+MPP6CKCgJgmzb5S5YsY00ZhYWFCxYsUFRUBIBWrVqtXr26srISEdPS0ng8HktCxsbGEqiTnThxwtTUlG3R2dn55s2biCgQCLZs2dK6dWsAkJeX9/Pze/uuuA0sPx/9/dHeHoVCdHdHcadCcjJqa2NuLv7+O3btikePNm4YpLaQEPTweG3OqFG4YwcGBaGVFX7/veifuTklQqk4d+6ckpISAAQFBb1ntZKSEnbdnoqKSlRUlAQCKy4u5nK5SkpKNECmATVQIkxPR09P0aUw1taIiFFRaGKCffuimRl+8QU+fYrXr4sa0Fu2xDVr6nlbnqgoNDISbdDFBR8+RESsqakJCQnR1tYGAC6X6+vrm/tWf/XJkyfNXl4X6uTkdEOcrRvU3bt3hw8fzrbSuXPnQ4cOiRcJhUJELCgo8Pf3l5OTY6PUAgMDG+UOT3w+rlmDLVsiACoq4vXrmJ2Nzs5oZoZ9+2LXrsgCs7YWfZSenpie3vBhkLfFxaGZ2WtzTE3xwgVqGv18hIaGcjgcLpe7b9++d66QlZXVo0cPANDV1U1MTJRYYJ07dwaARn9uhiypdyIsK8OAAFRWRgBs3hwDArC8/NXS/PzXEt69e8jjiY65xsb4SXWyq1fRwUFURo8er8Y8xsaetbCwYLlnwIAB79lLqqqq3qiTPWu4m/49f/78jfpo7WbYhw8fGhsbh4SEsHR4/fr1AQMGsJgtLCxiY881VBiIiCdOoKmp6JNydn7tQSl8PubnvzYZGIgtWojypb//qxsLkkZSXY3m5vj336LJv/7CHj2wpoYS4Wflp59+UlVVPXz48NuLrl+/3qZNGwAwNTV99OiRJKMaM2YMAGzbtk2SG23a6pEIhUIMDxddMcPhII+HddwbTp1Cc3PRAdrREetcJ2MtfHJyCICamhgYKOrYevwYfX2xX78tANC+fXtxmnk/VieTl5dvqDpZXeqjc+fOfTtVR0VFdejQAQD69bv
t4oIPHtQnCkREvHMHhw0TfcJduuC7fsbvwLpbuVzW1oxbtkjmihfZdecODhyIpqZoaooDB+Ldu4iImzfj3Lmv1pk8GXfskFaABBHZ4POqqqr9+/f//vvvx48fFwgEoaGh8fHxOjo6AwcOLBSPeJIUdiu4GTNmSHi7TdinJsIrV9DOTnSotbLCuLiPe3lVFW7YgBoaCHCu58wZM7Cg4MMvOnUKAVBBAf39sagIEbGkBBcuFFVHW7asXrdu88deNVh7TI2JiUlMTMzHvZGXTp8+XZf66NvJkl2uVFFRsW7dNtaEqayMCxdiScmnhFFQgDNmYGFPRwRADQ3csOGjb3+UmIi2tq9auV+OvyWNJS8PG7uTmNSbs7Ozl5dXYGDg119//fTp071792ZlZd25c0diw+5qO378OADY2dlJftNN1UcnwqysLP///U+ooYEAqKeH27d/+kMm8/Nx2rRe3cpYDW/jxg8ftFesEN2rgVVH9fVfVUfrc5miuE7Gxkzfv3+/7q99/Pixr68vey2rj37wJWw4D+uKrz2cp3aVTE/v46pkVVW4cSNqaiIAjut2FadNe63x86MIhbh9O+rpIUCBhsaU//2vcZ9cKuPYOQf5jOXn5ysqKta+lGL//v3Z2dnSiqegoIDD4aiqqtZQm00D+YhfYEVFxcqVK9mzjMMcHfG77z6x2vK627dx6NBXzXhHjnz4JW9UWs6fr38UyOfzAwMD2c36FBQU/P39i4uL3/+S2nexad68+cfexeatyuhJNj8x8bXKdl3e3enTaGEhesnAgQ10K5KSEvzuu5mOjgCgqqq6cuVKukdPo6BE+NkTCATt27f38/O7y5qvEZ2cnM43yHHnUxkYGACARK+/atLq+gt8o870oAE6st4oHzt2fDUK9P59zMnBvXtfVYkuX8ZHj+pVZ6qL7OxsPz8/dgc/PT098VWJb2B3sdHX13/jLjafQHzJf79+t52d8eZNVv6b9d1Hj/DRIzx48NULz53DnBxR/yj73Nq3xzpURz/OJ9R3ycehRPhfkJmZOWfOnLZt244cObKsrEzqiZDdCk7qd8tqMj78C2yoXrQPqqzEVatEQxeVlHDrVgTAjRtFSydMwE2bREvr04tWF1euXBHf5c/Kyiru9R7QxMRE8V1srK2tL9S7F00gEPz2207W2KyggHPmvKMHtEUL3LQJATAiQvSqoUNx61ZUUhItXbUKKyvrGci/qmMPKPkUlAj/O/h8vp2dXXBwsNQT4Q8//AAA8964kyX5VO/7BTb4uMq6yM5GPz8cPBiPHUN7ezQwQNYUP2EC7tiB48djw4yr/BBW52PtD6zO9+jRo6ysLHF9sfZdbBpEQcGrMbEaGm+OiWVvf9Ag7NABWZPt0KF47BgOG4Y8HjbmY8hE6jImlnwKSoSfPaFQKL7vqKur659//in1RHjo0CEAcHR0lGIMTcm//gKTk5PV1dXZlXb+/v7Pnz+XZFhVVXjsGI4cievXo5cX4stEKOEnQJaWli5atIj1AiorK4v/WLRo0cfekLeOrl1791WSVVW4Ywf6+eF33+HMmYgvE6GEP5Dnz5+Lz43U1dWT6cEI9UeJ8DO2b9++BQsWpKend+rUadCgQT179hwwYEBpaemwYcMuXrwoxcBycnLYULu6XCpGPuhff4HV1dXm5uaNd++VD2KJsKoKzc3x2DFRIpSKzMxMX19fFRUVFRWVxugffVtYmOj6TADk8UTDP1kiLC1FIyO8elWUCKWC3TfH1NRUAi0ETR8lws/V+vXrWfPPyZMnhUJhVlZW7Wcz8fn8OXPmZIof3C5xenp6ACCBw5Es+EDTqMTieBtLhIgYF4empjhunJQvLGY9ZBLbXHk5rl6NLVpghw6iu2SzRIiIkZHYrx8OGSK1RMhId/doOigRfn6qq6vZQ3rf88C/b7/9FgC6d+/+weHljYTdxDFCPGqA1AMX/h27/7rU2duDjQ0cPCjtOCSrWTNYsABu3YI9e0BZ+bV
Fo0aBigpcuCClyF76THYPQhpWZWWlt7f3hg0blJSUdu3atWzZsneutnDhQjMzs+TkZDc3N4FAINkYAQB69uwJANeuXZP8ppue9yXCz8e6daCoKO0gpKFdO+jd+x3zN26EqiqJR0NIU1dQUODs7BweHq6urn7s2DFvb+9/W1NNTS06OlpHRyc2Nnbq1KmSDJKxtLQEgKtXr0p+000PB182+n1uqqqAzwdVVdFkaSkoKkozHbLH6krx4xIIoLoamjcXTZaUgLIyKChIKxzSMJ727g0AegkJ0g6EwP3794cNG3bv3j0jI6MjR46YmJh88CVJSUn9+/cvKytbsWLFwoULJRCkWEZGhqGhoba2dm5uriS32yR9vonwcyP1REiaJNqvPhPx8fEjR47Mz8+3sbGJiorS0dGp4wsPHz48cuRIoVAYEhIivvuEZGhpaeXn52dmZrZr106S2216/htNo4QQ0nj27t3r5OSUn5/v5uYWGxtb9ywIAMOHD9+0aRMiTpo06dSpU40X5NvY0xCpm7D+KBESQmRaUFCQl5dXZWWlv79/ZGRkc3H3Q51Nnjx51qxZVVVVHh4eN2/ebIwg34mNl6FuwvqjREgIkVE1NTVTp06dNWsWh8MJCgoKCgpiFw5+gl9++WXUqFHFxcUjRoyQWKcdGy9DNcL6o0RICJFFpaWlI0eO/OOPP5SVlffs2cMuHPxkXC53165dffv2ffTo0fDhw8vKyhoqzvd4u0a4devW+Ph46nL+WDRYpq5oUEOTsmED/Pkn1NQAAIwfD99+CxyOVAKR8n6FCGvWwM6dLBSYPBlmzJBOJJKVnZ09YsSIq1evampqHjx4UHyT/XrKz8+3tbW9d+/e8OHDDx48KCcn1yDF/htEVFdXLy4uzsnJ0dHRKSkp0dHRqaioaNeu3ahRo3g8np2dHUdKO/Z/jJQu5P/vKe/SpbxLF2lHQRrCoUPYrZvobu7PnmHPnii9x9lI+We4ezf26oV5eYiIT55g16546JDUgpGUlJQU9gw1Y2PjtLS0hi38/v37WlpaADB58uSGLfmdHBwcAGD//v2ImJeXN3/+fPbWmM6dOy9ZsiQ1NVUCkfynUSKsM7oVVpMxfDju2vVq8sAB7NdPWrFIORHa22NU1KvJkBB0cZFaMBKRlJTEnr/t4ODQSLcJjIuLYzfoX79+fWOUX5u9vT0AmJiYPHnyhM0RCoXnz5+fMWOGrq6uOCOam5uvW7eN7kv6b6iPkMiehw+hS5dXk127woMHor/v3pVKRJImfpsPH0Lty8ZrfxRNlKmpqbm5uaen57FjxxrpNoH29vY7duzgcrlz587dt29fY2xCbPTo0QBw584dfX19e3v7oKCg3NxcOzu7DRs2ZGVlxcXF+fv7a2trp6SkREdrdOwI3brBsmXw8GGjBvUfJO1M/N9BNcImo3dvPH361WRiInbtioh45QpyuRJ64iUiIla9vFGe5J7jkZmJvr7I5eKlS4iIJiaYlPRq6cmT2LevhCKRnpKSEgk8vWjVqlUA0KxZMzZ6pfGsX7/ew8OjWbNmbF+Sl5cfPHjwtm3bCgsL2QoCgeDQoUOzZuW1bCk6jHG52K8fbtqE9FBRho7sdUaJsMnw98fvvns1+dNPOGECIuKOHaiiggCorIyLF2PjPHJS7NSpU+bm5oqKisrKyl26dDl8+HCjbg5LS3HxYlRWRgBUURE9zGX8eFy16tU68+fj7NmNG4YsmTx5MgCMGjVKAtsqLy+Piori8XiKL29EqaSk5OLiEhIS8uLFC7ZORQVGRiKPh82aiY5ncnI4aBBu3YovV5FRdGSvM0qETcbTp9i+PS5dijEx+NNP2LYtpqeLFmVloZ8fcrkIgG3a4JYtWFPT4Nu/d++eq6srO1q1b9++ffv27G9XV9d79+41+OZQKMTwcNEjLjkc5PHw0SPRoocPsU0bXLECY2JwyRLU18daj9wj9VRVVbV27drKykpJbrSwsDAkJMTFxUXh5Z2
IHRwyXFwwJOTVqV15OUZFIY+HioqiA1tDjxn6j6Eje51RImxKcnNxwwacORMDAzErC0tL0dkZjx4VLU1MRFtb0TdubY0XLjTUZktLSwMCApSUlABARUUlICCgoqJCIBAEBgaqqakBgIKCgr+/f1FRUUNtEa9cQTs70XuxssK4ONH8o0fR2RlLS/HJE1y/HmfOxN9+w2fPGmy7RNpycnJ+++03JycX1swBgGpq+OWXeOQICgSidZ4/x61bcdo0qQb6GaAje51RImzCfv5Z9P26uIjOjWtq8O+/UU9P1KMyfrzocotPxW7KzAbycTgcX1/fp0+f1l4hPz/f39+fXXmmqakZGBhYXV1dny1idjaOHy+q3erp4d9/i2q3aWno4iJ6vz//XK9NkP+CzEwMDEQ7O+RwRF+7ujr6+mJUFLK+6fBw7NbtVX/huHGiHmTZQUf2OqNE2IQJBBgYiGpqCIAKCujvj6xOVlqKAQGorIwczlQrK1aB+4TiExIS+vbty9qpevXq9Z7RE0lJSf369WNr9uzZ89y5c5/0bgSBgYFfW1khlyt6O+wp6uztKCmJugkDAlCyrXZEuu7dw59+QjMz0cGMnSDNmoV//ok6OjhunGg1JyeMjZVmnJJHR/Y6o0TY5OXno78/yskhAGpqYmAgsjrZ/ft3Xt5vpUOHDvv27at7kU+ePPH19WV392jbtm1ISEhdxitGRUUZGhqyLbq4uKSLuzDrYN++fR06dGCvvTNjBt6/j4hYU4MhIaijI6rg+vri6/VRIlNSUzEgALt0QQC0s8O//sKpU9HSEk+dQqRESN6HEqGMuHoVHRxEX7elJZ49y2afPn3awsKCJZiBAwcmJye/v5jy8vLVq1erqqqyMfQLFix48TEj88rKyt54eUlJyftfcvv27SFDhrAITUxMjhw5IlqQkIB9+ojekY0NXrxY9zBI05aYiHFx+NdfOH06xsWhiQlWVlIiJO/x4sVrQ4wFgtealSoqUGKXgpHGJhRiaCjq67Nhlufnzs3IyEDEmpqakJAQbW1tAOByub6+vrn/ch1Wfap0tdWxQllQUODv7y8vLw8AGhoagYGB7MLEjIyM83PnirqG9PUxNBQb//o58p/DEiEijh+PK1dSIiTvlJiI3bujtTVaWWH37piYiIgYFCTad5jx43HrVmkFSBpFeTmuXs03MWmlrFy7SldQUDBjxgyWdebNm/fGi65evSru5LO0tDz7skJZHwkJCX369GFl2tjYXHyrSjdv3jx2JfWMGTPYbcPEFUpVRcXKLl1wwQL8UIWSyCxxIszNRX19NDOjREjewOejsfGr+zHu24fGxsjnUyKUEVmPH48dO5bVydq3b79nzx5WJ0tNTeXxeLVvVtnwwz5rYZXRfxt0WlBQwOPx2L2VhULhnj172LWJHA5n7NixWY8fN1QYpEkSJ0JE3LQJASgRkjecPo3W1q/N6dkTY2MpEcqUy5cvv2fYZ+NeCFjL25chvnGxdoMMOiWy5vJlPHBA9HdNDS5fLrGbDH4uKBF+yPbtyOO9NiEi458AACAASURBVMfTE7dvx6AgVFdHExPRPzU1SoRN279dCHjixAlTU1OWe5ydnSXwyJt79+7xeDy2RWNj4/DwcETMy8trvPookRExMThpEsbESDsOiaNE+CHR0ejk9NocR0c8fJhqhLKpuLj422+/ZbdzVFVVZaM6AaBr165HxTemkYijR4927dqVbV0ciaKi4rffflvMrhok5CP9+CMC4Ny50o5D4ugxTB/SqxckJ0NOjmjy6VO4cQOsraUaE5Gali1brlmzhnUQlpaWlpaWKigorF69+vr164MHD5ZkJIMHD75x48aWLVsUFRVZJM7OzteuXVuzZg173h4hH6tnTwCAq1elHYfEyUs7gM+ejg4sXAj9+8O0aYAIv/8OS5aAtra0wyLSxBokt23blpub6+Hh0blzZ6mEIS8v7+fnN2DAgMjISB0dnYkTJ0olDNJkWFkBAFy
9CojA4Ug7Ggni4MtnZJP3SU6GixcBAGxtgV1VfesWvHgBLwe1Q1wc6OpCp05Si5AQQuqtbVvIzob796FjR2mHIkGUCAkhhIiMGAGHDkF4OLwcjyUTqI+QEEKICOsmvHZN2nFIFiVCQgghIpaWALI3XoYSISGEEBGrnjixW8Jkub+kHYhEUR8hIYSQWrS0ID8fMjOhXTtphyIhVCMkhBBSi+w1j1IiJIQQUovsXVdPiZAQQkgtrEYoSyNHKRESQgipRfZqhDRYhhBCSC2IoK4OxcWQkwM6OtKORhKoRkgIIaQWDgd69ACQodZRSoSEEEJeZ2kJysqQnS3tOCSEmkYJIYS87sULUFCA/Hxo0wbk5KQdTaOjGiEhhJBaBAKYPRvMzeHrr8HYGObPh6ZeX6LnERJCCKnlt98gNxdu3QJFRSgpAScnCA2FsWOlHVYjohohIYSQWnbtgm+/BUVFAIAWLWD2bNi9W9oxNS5KhIQQQmrJzAQDg1eTRkaQkSG9aCSBEiEhhJBatLTg+fNXk/n5oK0Nz57B0KGwfTsUFUkvssZCo0YJIYTUMm0aqKvDihWiya++gk6dQE0NZswAAFBSgkGDgMcDd3do0UKKYTYgSoSEEEJqyckBe3twdITu3eHCBbh9G86fh6oqiIqCiAg4dgyqqgAAlJXB2Rl4PPDwABUVaQddL5QICSGEvK6sDE6fhvR06NwZHB1FA2eY3FyIiIDQUIiPF11WoaYGbm4p48Z1HTBAXv4/eSUCJUJCCCEf78kTiIyEiAiIj+crKuoqK3O4XBcXFx6PN3To0P9WRqRESAghpB7u3bt35Ij7n3+mpqayGW3atOHxeF5eXn369OFwONKNri4oERJCCGkAqampERERoaGhd+/eZXPat2/v7u7O4/Hs7e2lG9v7USIkhBDSkK5cuRIaGhoeHp6ZmcnmBAUF+fv7Szeq96BESAghpFEkJSXt2LEjIiLi7NmznTp1knY4/4oSISGEkEYkFAq53M/65i2UCAkhhMi0zzpLE0IIIY2NEiEhhBCZRomQEEKITKNESAghRKZRIiSEECLTKBESQgiRaZQICSGEyDRKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiVCQgghMo0SISGEEJlGiZAQQohMo0RICCFEplEiJIQQItMoERJCCJFplAgJIYTINEqEhBBCZBolQkIIITKNEiEhhBCZRomQEEKITKNESAghRKZRIiSEECLTKBESQgiRaZQICSGEyDRKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiXCd0DEiIiIW7duSTuQhnT16tWoqChpR0EIAEBxcXFERMSTJ0+kHch/T3h4+M6dO6UdRUM6cODA1q1bpRsDBxEltrHjx4+fPn3a0dHxiy++qD0/ISFh//791tbWnp6edSmnqqpq//79Fy5cePHihYaGxoABA4YPH87lipJ6YmJiZGQkj8ezsrKq/apffvlFIBB89913HA7n/eXX1NTIy8svX7586dKldX93ly5dio6ODggIUFRUrPur3qOoqGjt2rVeXl4WFhb1L+2bb76JiIh49uxZ/YtqDCkpKbt372Z/KykpaWtrf/HFF506dfrYcsrLy1esWOHu7m5tbd1QsS1btqxfv35OTk4NUtqGDRv09PR4PF6DlPYflZKSYmFhERERUcef/Gdi4cKF70zerq6unp6ex44d27dvn4ODg4+PT+2l8fHxISEhlpaWU6ZMEc/MyMiIjo5+8OBB8+bNLSwshg0b1qJFi7rEMHTo0MLCwkuXLtU97Lt3765YsWLp0qXGxsZ1f9V7FBUV+fv7+/n52dvb1780T0/Pu3fvpqSk1L+oTybRGmF4ePiaNWuWLVv2xvx169atWbNm27ZtdSkkPT29e/fuPj4
+N27cKCsru3jx4siRI/v27Zubm8tWsLCwOH78OI/He/HihfhVwcHB8+bNMzAw+GAW/GRpaWl79uyprq7+5BIQsU+fPvfu3WOTZWVle/bsycjIaKAAP2t3795ds2ZNbGxsUlLSyZMnly9fbmJiMnnyZIFA8FHl8Pn8PXv2PHjwoD7BTJw48cCBA+LJsLCwGzdufHJpS5cu3bhxo3jyyJEj8fHx9QmPSEtNTU11dXV1dXV6evrOnTufPXvGJoVCIQAcOXIkODh4yZIlb9QuNm7cGBwcvH37dvGcCxcudO3adfHixcnJyWfOnJkwYUJgYGDjhV1ZWZmenl5ZWVmfQoyNjS9cuMD+rqmpSU9PLy0tbYjoPg8oQU5OTqqqqhwOJz09XTyzpKSkefPmqqqqJiYmHyyhqqqqe/fuurq6ycnJ4pmxsbEtWrRwdHQUz0lLS1NVVfX19WWT9+7dU1VVHTduXB3jZMls+fLldVy/obDq2vXr1xuj8GnTpmlpaTVGyQ0iIiICAC5fvswmq6urAwMDuVzujBkzJB+MoaHhn3/+2VClOTk5ff/99w1VWtPATiwiIiKkHcgn2rt3LwAkJCTUnunq6qqpqQkAly5dEs8sKytTVVXV1NTU1tYWz7SysurUqVNBQQGbLC0tff78eR03PWTIkN69e9f7HXyciooKDodz5syZxijcw8PDzMysMUquO3lJJt309HQej3fgwIGwsLAFCxawmQcPHuTz+VOmTNm2bRsiimtsOTk5YWFhT58+bd26tZeXV/v27QEgOjo6OTl5z549tVsLBwwYsHTp0vnz51+4cMHOzg4AOnXqtG7duilTpgwZMoTH4/n4+Kirq2/YsOH94Z09e/bs2bNVVVXt2rV7Y9HVq1cPHz5cVlbWpUuXsWPHKisrV1dXb9u2zc3N7dq1a2fPnrW3t9fX1z937pyfn5+8vHx4eHjLli2HDBkiLiE6OlpRUXHw4MFlZWWHDx++ffs2AJibm48cOVJOTg4AoqKirl27BgCRkZEJCQkGBga2trY7d+784osvjI2Nb9++HRcXVzskS0vLXr16AUBZWVlYWFhaWpqKioqbm5u5ubl4nadPn+7bty8zM1NLS+vu3bsf811JmZyc3MyZM2/evLl58+YFCxa0bdsWAF68eBEWFvbgwQNVVVUPD4+uXbsCwOnTp+Xl5Y2MjHbs2AEAM2fO3LFjx8CBA7t27Xrp0qWbN29OnDhR3Gx+48aNy5cvT5w4kb0wKSmptLRUX1/fy8urZcuWAHDx4sWUlJSnT5+eO3dOKBQ2b9583LhxwcHB7NPOzMyMiYmpHWenTp0GDhwIAJcuXYqPj8/Pz2/btq2Hh4euri4ApKamXrhw4c6dOwoKCsHBwQAwceLEyMhITU1NZ2dnVsLjx4/37t377NkzTU1NDw+PDh06AEBubu7BgwcnTpx4+fLl48ePKygoDBs2zNLS8p2f1Y0bNx48eODk5LRt27bCwsKpU6fq6urW1NRERUVduXKFw+EMHDiQteveuXPn2rVro0eP3rdv37Vr19TV1ceOHduuXbvU1NT9+/cLhcKBAwf269dPXHJeXl54eHhmZmaLFi1cXV3ZrlVTU7Nt27bu3bvb2NiI1wwJCTE1NWU75KVLl44dO1ZRUWFubj569GgFBQW2Tk1NzeHDhxMTE+Xk5MTfSFOSnp4+dOjQixcvhoWF9e7dm808dOhQeXn5jBkzVq1aVVZWpqKigog3btzw9fXV0NBg66ioqKioqLyn5Li4uAMHDmRnZ7dt2/bhw4fq6uriRcnJybt27Xr8+HG7du2++uor9h0FBgZaW1uXl5fv2bPHyMjoyy+/DA4Onjp1qr6+/o4dO3Jzc+fPny8uITw8PDs7e9asWc+ePQsLC0tNTeXz+SYmJl9//TWLMCQkhJ2hBgcHx8TEGBkZeXt7r1y5ksfj9ezZ89KlS7WbTwDAyclp0KBBAPD48eOtW7feu3evVatWPB6P/VKYS5c
u7d27NysrS09P77M4NEks5VZXVysoKKxaterLL7+0tLQUzx8xYoSTkxM7ij19+pTNvHDhQsuWLQ0MDHg8noGBgZqa2p07dxBx2rRpAPDixYs3Ck9LS4O36nCjRo1SU1Pz9fWVk5OLi4t7T2wCgcDDw4PD4djZ2Xl4eLDuJXFpq1ev5nA4vXv3HjVqlIqKSq9evaqrq8vLywHA0dFRXV3dwsLixx9//O233wCgpKQEEf38/DQ0NPh8PiuBnRWuW7cOEe3t7Tt27Dhq1KgRI0YoKSkNHjxYKBQi4uzZs1l279Onj7Oz85IlS9LT0wFgz549iBgZGWn1koGBAQAEBQUh4uPHj42MjDQ0NEaNGtW9e3c5Obno6Gi20YMHDzZv3rxdu3ajRo1ycnJSV1f/D9UImePHjwPAjh07EPH+/fvt2rXT0tJi548KCgonT55ERB8fHwsLC21tbQsLCwsLi7y8PAAIDg5GxOjoaAA4e/asuEBXV9cBAwYg4pIlSzQ0NIYPH87j8XR0dPT09PLy8hBx8+bNrNuja9euzs7Oo0ePRkRlZWXW3nXhwgXxt8C6W+bPn4+I27dvV1FRGTx4sJeXl6GhoaqqampqKiIePHjQ2dmZy+UaGho6Ozs7OztXVlZ269bN29ubxXP48GFlZeUOHTp4eHh06tRJUVFx7969iMjaTj08PNq3bz969Ohu3brJy8vXfiO1rV27VktLy9zcvEuXLh06dMjKyqqsrBw4cKCiouKIESPY0YftzMHBwc2aNXN0dLSwsBg9erSOjo6Wltbq1avZp9q7d28OhxMSEsKKTUxM1NDQ0NPTGzVqlIWFBZfL3bBhA1vUp08fOzs7cQCsd+f48eOI+P333wOAvb29m5ubsrKyuJ2moKDAxsZGUVHR2dnZ3d3dxMQEmlyNsEWLFkuWLJk/f76enl51dTWbOWrUKAcHB7Yrsr0CEfX19bW1tZOSkuqyrUWLFgGAg4PDnDlzvL29NTU1xTXCvXv3ysvL29razp4929LSslmzZjdu3EBECwsLc3NzXV1dFxeX6dOnnzlzBgAuXLiAiL/99huHw3n48CErQSgU6uvrz507FxHHjBljY2MzY8aMmTNntm3b1tDQsLS0FBE3btzo4uICACNGjPDz81u7dm1WVhYAbN++HRGPHTvGe6l79+4AwHahpKQkNTU1ExOT2bNnOzk5cTicyMhIttEVK1ZwOBxbW9s5c+aMGzdOW1tb6jVCySVC1tcVGhrKxi7eunULEQsLC5WUlDZv3syqOxcvXkREoVDYuXPnnj17lpeXI2JRUZG2tvb48eMRcfjw4Zqamm8XzufzuVzupEmTas8sKCho06YNALCj2HusXr0aAHbv3s0mazeN3r59m8vlzps3jy06ceIEAISHh7NEaGdnx/YVRKydCNmed/jwYbYoPDycy+VmZmYiYn5+vni7LP1fuXKFTbJfl7hptHYiFBMIBDY2NlZWVgKBABE9PT1bt2795MkTFratra2FhQUi5ubmqqioODs7i8P7bzWNMqy7lH0RQ4cObdOmzbNnzxCxqqqqR48effv2RUQfHx9FRUX2I0fE2olQIBC0bt166tSpbFFxcbGysvKWLVsQ8cWLF+LTlIyMDA6Hw05TEPHx48cAULtpVJwIxYRC4YgRI4yMjIqLixGxoqJC/DmXlJRoaGiIN4qIioqKtZtGxYmwoqJCS0vLwcGhoqICEfl8/uDBg9XU1IqLi1kiHDt2bGVlJSJWVlbq6uqK0+cb1q5dK37LzJo1a7hc7rlz59jk9OnTVVRUCgoKWK30p59+YvOTk5MBoGfPnmyfFAqFNjY2NjY2bCnLrKzJTigUTpw4UV5e/sGDB/jyYCru4Fi6dKm2tnZVVRUbwSEun+3Px44dQ8Qvv/xSSUlJ3GbY9JpG8/PzAWDr1q0JCQkAwFoRX7x40axZsw0bNty8ebP2AeGff/5RUFDgcrkDBgzYsWMHO9C90/nz5wFAfPzBWk2
jfD5fS0tryJAh7Ey6rKxMR0eH9QdZWFgYGRmJm1trJ8Jnz56xCknt8tkhqKqqSrwVtgeKU9epU6fEbwoRaydCsZKSkk6dOonP7O3s7IyNjcvKythSW1tbVv+5evUqh8OZNm0aWw0/j6ZRyTVQsMO6kZHRF198oaamFh4eDgD79++vqqoaOXKkoaGheJ3MzMy0tDRnZ+dbt24lJSXdv3+/e/fuLFPW1NSIW1pqk5eXl5OTe2OgSm5ublFREQDgh0bG7tq1q2/fvt7e3m8vOnPmjFAo7NevX1JSUlJSUqtWrZo3by5upXRxcXlnm4aDg4OhoWFYWBibDA8Pd3BwYC2urBehtLT0zp07SkpK4nddR0uWLElNTd2zZw/7HE6fPm1nZ5eTk5OUlHT9+nVLS8sbN24UFxcfOHCgrKxs1apV729y+cyxz6empkYoFJ45c8bBweHx48dJSUnJyck9e/ZMSEhgQ2n09fVtbW3ffrmCgoKnp2dERATbMQ4cOFBTU+Ph4QEALVq0UFRU5PP5jx49ysvLa9Wq1cOHD+se2KZNm2JiYnbv3s0aVJWVlVVUVKqqqp48eXL37t02bdrUpbSUlJS8vLxZs2YpKysDgKKi4rx584qLixMTE9kKLHmwz8HExCQzM/M9pY0ZM0b89+nTpzt27Ni8eXO203bt2rWsrIw1vAPA+PHj2R9mZmZcLnfo0KFsn+RwOBYWFmwrz549S0lJmTx5MmuF43A43333XXV1dWxsLAB4eXnJycmxcxcAiIiIGDNmjLy8fGxsLIfD6du3L9tumzZt5OTkzp8/z+fzIyIixo0bJ24wbHrEx7devXqJf/sHDhzg8/mjRo0yNDRkpw5s5TFjxty+fXvRokWPHz/+8ssvjY2N/23MZGRkpJycXEBAwNuL0tLS8vLyOnbsuHfv3oiIiMOHD7dv3/7KlStsaY8ePWq3oIppaWk5OzvXPjQZGxuzAfby8vJCofDOnTtnz55lYw/ZSWEdffPNN0VFRdu3b+dwONXV1ZcvX+7atevhw4cjIiIiIiLYwA6BQLBv3z5EXL58eeONW/wEkusjfPToEQAYGRkpKSm5uLiEhoYGBASEhYX1799fT09PKBQqKSmxddj/O3fuZKddDDsi6OjoPHv2TCAQvHGJQnZ2dlVVFeuYYQQCgY+Pj46OzvDhw1etWjVo0CAHB4d/iy09Pf2NEc+1w+ZwOLNnzxbPYR0w73+zHA5n9OjRmzdv/uOPPxAxJibm119/ZYuOHj363XffJScnKykpsTdV94GRcXFx69atCw4O7ty5MwC8ePHi+fPn58+fHz16tHidDh065Ofnp6enczgcMzOzOpb8eWKtCHp6erm5uRUVFSdPnrx8+bJ4qaGhITsNfw9vb+8//vjj1KlTgwcPDg8PHzJkCDvo379/f9asWTExMRwOp2XLlsXFxVVVVXWM6tatW99+++3ixYv79u3L5uTn58+cOXPv3r1VVVWtWrUqKSnR1tb+YDnssMgauhkjIyM2v1u3bm+sLC8vX/cIHz169PTp0zf2itqDqBkul/tGXx07FMLL3yA7PWVqH8rFB9P58+cnJyffvn2bDfl+9OgRl8v9+uuvxa8yMDAQCATs66vde930iBMhh8Px8PAICQnZsGFDWFiYra0t6+HW0tJinyrTsWPHH3744Ycffjhw4ICvr+8333xz7ty5t4t98uRJmzZtVFVV317ETllOnjxZ+1IKHR2dD4bq7e3t6+t769YtExOTyMhI8fcVHR09c+bM7OxsPT09NqfuY+AjIyN37tx58OBBdhDOycmpqqpKTk7Ozs4Wr2NpaVlYWPjkyRMNDY3WrVvXsWTJkFwiTE9PV1FR0dLSAgBPT8/du3efOnXq1KlTQUFBAMDlcvX19dmOwg5V69ev9/LyeqOQPn36hISEJCYmskExYhcvXgQA8YEJANjQZFZhSkxM9PHxSU5OFvdOv0FdXf3fxhZraGgg4vXr19+4yqeiouL
979fb2/vnn3+OiYkRCASsDxIA7t+/7+bmNnbs2CNHjrRp0+bu3buss6QuioqKxo0b5+rqysZ6AICqqqqysvLXX3+9atWqN1Zu1aoVIvL5fFbb+I9iTTr9+vXT0NDgcrkzZ85cvHjxR5Vgb2/PTs979+594sQJ8RD2YcOGtWzZ8vr162ZmZhwOhx2q6oLP5/v4+JiZmS1cuFA8c9y4cXfu3ImNje3du7ecnNx7TrlqY/t5cXGxeA5rwKj/MUJTU7N9+/asGb+2P//8s+4liONhWCOwODYfHx9fX9+0tLTw8PAOHTqwqp6mpqacnFxaWhob/yVWUlICAPUcvv+Ze/Tokby8PGv18fDw+OWXXyIjI0+cOLFmzRq2gpGR0Tvbftzc3Nzc3Pbt2/fOYrW1tdmnJyZu32IpZ9WqVe7u7h8Vqru7u6qqanh4uKOjY1ZWFmtLyM3N5fF43t7eGzZsUFVVLSkpYa0ddfHkyRM/P79vvvlmxIgRbI6WlpacnNykSZPevhRbW1u7rKxMKBSKT8I+2GInAZJrGs3IyBCfYA4ePLhFixb/+9//hEKh+FsU7ygdO3ZUVVU9dOjQ24WMGTNGXV39u+++q/2jKi4uXrZsmaGh4bBhw9icc+fO/frrr0uWLOnfv7+8vPzu3buLi4v9/Pz+LTYTE5Nz586JT39qX+bFun8PHz78se+3e/fuZmZmYWFhtSsily5d4vP5CxcuZJ2Xb+ziLGn92/Fi6tSpNTU1tY9lXC7XzMzsyJEj7Cy+NvGISjZZU1PzUVfgfg6Sk5PXrl07ZMiQbt26KSkpdenS5RO+BQ6HM2bMmMjISNaYzH6oWVlZ9+7dmzx5srm5OYfD4fP5tSvl7Fvg8/nvLHDhwoX379/fvXu3uIkeEc+dOzdmzBhbW1s5OTlEfOP6KmVl5Xd+p2ZmZnJycrXfVHR0NIfDYbtcfXTv3j0hIYF1l34aAwODVq1avREbAPTo0YNNuru7q6iohIWFRUREjB07ljVzWVhYCASCtxNwixYt2rZtK94b4fWfWNPw6NGjdu3aycvLA0CfPn3at28/Y8aMqqoqdgYMryfC2od+RLx///6/nYqZmZkVFRWJ+2Ly8/PFjSJdunRp0aJFaGjox4aqoqIyYsSI0NDQ8PBwa2vrLl26AMDNmzf5fP6ECRNY7fPp06dvvAQA3nnhoFAo/PLLL3V0dH7++WfxTCUlJXNz8717977deGZmZsbn89k4OAAoLi4WX54oRRKtEbKWHwBo1qzZ8OHDQ0NDHRwcxNVwQ0ND1gOhrKy8cOHCRYsWtWzZcujQoWVlZbGxsXZ2dr6+vq1atfr7779Hjx5tY2Pj5+enq6ubkZGxefPmZ8+excTEsGNTUVGRr69vz5492WgrAOjYsWNQUNDEiRO3b9/+1VdfvR3b7NmzXVxcPD09J06cmJ2d/dNPP7EdGgAGDRrUr1+/KVOmZGdnm5qaPnny5ODBgxs3bqxL25e3tzcbOPDXX3+xOSYmJhwOJzAwkN0QYOXKlbXX7969u5KS0k8//eTt7d2iRYvaDZs7duwIDQ0dN25cZGQkm2NgYDB48OCAgICRI0e6u7uPHz+ey+VeunRJSUlp+fLlQ4YMMTU1nTJlSn5+vq6u7l9//ZWamlr3Uzxp2bFjx+nTpwUCwe3bt/ft22dsbCy+99KyZcvGjBkzevRob29voVAYHx+vqanJxii+n4+Pz+rVq5cuXerq6sp+5Nra2pqamjt37jQxMcnOzl6/fn3tapmWlpahoeEff/yhqalZUVExYcIE8aJTp06tX79+2LBhsbGxbF9t3br1qFGjTExM2ADR8vLy4ODgmzdv1r4IoVevXmFhYZaWlsXFxd988414vo6OzpQpU4KCghQUFOzs7K5cubJy5cqvvvrKyMgoJyenPh/j3Llzd+/e/cUXX8yfP79ly5apqakpKSm7du2qewny8vKLFi2aP3/+lClThg8fnpaWtnz5ckdHx/79+7MVVFR
UXF1dg4KCCgoKxN2T7u7ulpaWvr6+AQEBxsbGGRkZUVFRISEhrVu3nj179rx586ZOneri4nLz5k1xPanJqH18Y62jgYGBffv2Zdd9AYChoSE7+r948aJ79+6urq7W1tZcLvfAgQOXLl16+zYjjI+Pz7p16zw8PObMmaOsrPzHH3+IFzVv3nzFihX+/v7V1dXOzs5lZWUnTpzw8PB4zxl/7WL/+eefJ0+eLF++nM3p1q2biorK8uXLv/7667S0tODgYNZxI17aunXrpUuX3rlzh3314kWBgYGxsbHjxo0TX5/WuXNnd3f3n3/+efjw4f379/fy8mI/2LZt2/7666+enp6rV6/29vaeM2dOy5Yt//zzz3876ZQoiQ3LcXV1FQ9VQsQjR45YWVmJx2oj4tatW9k4QCY4OLhXr15aWlqmpqaTJk26du2aeNG1a9e8vLzatm3bvHlzfX39CRMmpKWliZeuWLHCxsZGaqrq0gAAA0FJREFUPD5YbOrUqf379/+3C1fZgUxPT69Xr15RUVE+Pj7icYMvXryYM2dO586ddXR0rKysli1b9uzZs8rKSisrq7///ltcQmhoaI8ePWoPAEtPT7eysurduzcbSsps3rzZwMBAWVnZ3t4+KSlp8ODBMTEx4qWRkZHW1tadO3desmRJdnZ2jx49jh49iogBAQFWr5s9ezZ7ydGjRwcOHKijo9OpU6dRo0axQXqI+PTp00mTJnXt2pWdnIaGhjo7O7/zvX8OTp48KX5rNjY2I0eO3LRpExtOKRYVFdWvXz9tbe3OnTuPHj369OnTiLh48WI3NzfxOoWFhT169GAd8mJubm5WVlZsfD9z7tw5S0tLZWXlLl267N69e9GiRT/++KN46ZUrVxwdHdlVDUKhsE+fPr///jsibt68+Y1vgY3kTE1NdXBwUFZWNjQ0/PXXXzdt2jRlyhRxaQ8ePHB1de3QoYOzs3NRUZGnp+fChQvZourq6rVr1/bo0UNDQ8PCwmLFihVs5N6NGzesrKzi4+PFhUydOnXixInv/Oh27txpZWUlHp7H3Lx508PDo02bNvr6+s7Ozn///XdNTc3+/futrKxyc3PFq9nY2GzatEk8uXLlykGDBoknt27damNj07p1axMTk/nz54tHxjInTpywsrKq/eEjYkFBwfTp0zt27Kirq2tjY7Ny5crCwkK26Pfff7ezs9PT0xswYEBCQoKdnR27AOa/KD4+fsSIEbUPO/PmzRNfXoKIV69e5fF4YWFh4jkxMTE8Ho/P5xcVFa1evbpv375aWlqqqqo9e/YMDg4WD6F8W05Ozrx58wYOHDhgwICtW7eGh4eL9x9EjIqKGjlypJWV1dChQ3/88Uc2gHzevHlr164Vr3Pz5s0RI0bcvn1bPEcgEHh7e/N4PLY+c+bMGScnJ2NjYzc3t9TU1O+//z4qKkq89MqVK56env369Vu5cmVBQcGIESNOnTqFiBs2bOC9TrzphIQEHx8fa2trJyenefPmsSsFEDEvL2/BggWOjo4ODg6///77wYMH58yZU/cPvzFI9F6jhBBCyOemCd7fgRBCCKk7SoSEEEJkGiVCQgghMo0SISGEEJn2f9LEyEI8lnPbAAAAtXpUWHRyZGtpdFBLTCByZGtpdCAyMDIyLjA5LjEAAHice79v7T0GIOBlgABGIOYCYk4gbmBkY0gAiTFzMGgAaWYmdgjNzMaQAaahCpgZOSAKGeESMJUwnTCTuIE2MDIpMDFrMDGzMLCwMrCwMbCxazCxcyhwcDKIMIg3QR0BBlwP3ZYdmDVTch+IE5RVCWLbg9gP3dTg4kA1+2HiIAATBwFj48v7oWrskfQ6IJnpgBBfBhcXAwC5pyuKulnfWQAAAQp6VFh0TU9MIHJka2l0IDIwMjIuMDkuMQAAeJyNUkFuxCAMvPOK+UCQDQvBhx42yXZVVUukNu0feu//VaNV1s1ho0As2WQY22Mc2vqY3n9+8Vhhcg6gnU9E8B2JyN3
QHAyX61vFuJyH9WScv+ryCSaIXtG9hZ6X+baeMEZ0ybNkTgEdeU5KrVc80d1ZkQEzuuiL9Mwn/Z/pCTCiogs+SaE+71KeWnL2QYRi2aNMCnyc7jFmLdKA7PMzYK+MRzIXbeZQL6KZj6ijQxlxSPBLnTajug9vmOtkw2s72IQ0QLQ5aNA0fr2+BJOc1ZIJy2rZ5GthbyIFtbJlKIopEOubm22q/l9ji9e3qb77A9oDjei/gQhpAAAAcXpUWHRTTUlMRVMgcmRraXQgMjAyMi4wOS4xAAB4nF2NQQrAIAwEv9Kjgi6JrdZQPOWuH/LxDZQW7CVMdkhWR2+qbnhtfeg2XcxgKZxDJNi84o4qJ3MgFNsSslSqn2UkEToeS29swCgGi/7dLo/XVj9vdX8e8lB+vk4AAACIelRYdHJka2l0UEtMMSByZGtpdCAyMDIyLjA5LjEAAHice79v7T0GIOBlgABGIGYDYlYgbmBkY0gA0syMHGCaEcjPAPFBAhogBhOGADfQBCZmDiYmBgYWVg4mFgYGEQbxIKjBYMDGgAEO2IPIh27L9jMwOOyHsNUOANlqUHF7JHEHILUMxBYDAK2aEwQhr270AAAAznpUWHRNT0wxIHJka2l0IDIwMjIuMDkuMQAAeJyNUVEKwyAM/fcUuYAS02rrZ1vLGKMKW7c77H/3Z3HD2sJalhiI4fmSFwUku/rL8wWLkRcCAA+Ocw4eFSKKCVIC/Xg6Bxjmrs+VId7DfAMLhl+wb5HdHKdc0TAAKvzYryTjCGKpamX2cBXzSa3IOaxakKgas4OsmVGSMq7Fxh61Nkz5F6NlxoVQHjCOwW+W8F1LH4Mva0lORT2xsrqI1CmKEuIhbZlXp1g3W1One/4rzsUbuBthm1EqyJUAAABYelRYdFNNSUxFUzEgcmRraXQgMjAyMi4wOS4xAAB4nPO3dXbW8Nd0tvVXqNHQNdIztbQwsNAx0LHWNdQzsrQ0MNHRNdAzN9WxNgAJGugY6gHZaFIIXZo1AK59EJiYGOg0AAABFHpUWHRyZGtpdFBLTDIgcmRraXQgMjAyMi4wOS4xAAB4nHu/b+09BiDgZYAARiDmg/IbGNkYEoA0MyMHgwKQZuJTAFEsbAoGIFFMmiEDRDMzsjNogBkcEJqJaAXcjEwMTMwMTCwMTKwgp7CxazCxcyhwcDKwMTBwcWswcfMo8PAycDEwiDCIb4I6Fwz4/v1j3P/GVcsexNGd9G1/Xs9kMNt0i/OBB/w6diD29pjGAxYRy8Div9exHGhYUbYfxF7J33bgrUAgmF21Vt4+U/cyWM22LUwOHw5PAbOvyHs7GKZzOoDYU79McnidcgUsniGRtO/lQj+w3oX+M+1zdy0Fs4V59tnriBseALETz1s6vPAOALPFAP+AQEuIjOWqAAABm3pUWHRNT0wyIHJka2l0IDIwMjIuMDkuMQAAeJx9U8tu3DAMvPsr+AMr8CVRPOSwjzQI0niBZpt/yL3/j5KbbuVFhMg+SPSQImfGC+T6dXr5+AP/F5+WBQC/ed0d3gURl1fIDRwen55XOF72h1vkeP69Xt6AFEgiJ5577P5yfr1FCI6ww1LRzA2wtFaJPTZ4XSOV4Qw7Ks5NpAEVqmwsE6DA2wfspGC1bhoAah3vSsL7/ueD3vCaHWhB5dp6FGb32mxSuCaQC5pQx2zam1Sc9do+S1J3JU5kpxaDTZAWyBiaxZTi7qbep0N1WIGzR46rY/jGHNGvOA+WpJBJpx4JpM2n9ShUAi01vtZks1WXaUH6JxAzmrfrLGhU+wzK0WT05mYqqRW7WuUZUqJNKtr1iuRizVBmTFKqw8VRuHOqylXDABPk43q6M9an1Q7n9TSsRuEiGYai8IoO2+SxDlfksQ3tM5eGwBzSdTj+eHrgoSVFyIdkFHLQRhpKyEYBTnaDtbsqGcogyYZXSspIN/zRFbgdfjtqnm9/ZOyXv7rRvFPuiJiBAAAA1npUWHRTTUlMRVMyIHJ
ka2l0IDIwMjIuMDkuMQAAeJwljktqBDEMBa+SZTe4hf6yMLPyfnoxy5AD5A5z+EgTMNg8Patq38/H3sd+PO993t+v359jn3X21/tQsGTxQeCWImMJUMjkwUDqyWMxoDLPapAzVwPBWUKp/2hWdV0IhhEZo2ZuxPnJmDHSR70mBtkcq3ZkhI6LgFPDOtGpnTCEB2rzEqV4lwCbegGrneyirWAcDRRAi9k88okfnranxejVWfeqlRhCswXSxfC/RTPLvaXIS3ac7z8I1Drprs5uMQAAAABJRU5ErkJggg==", + "text/plain": [ + "" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'CON=CC(O)C=NOC'\n", + "ShowMols(\n", + " [\n", + " der := Chem.MolFromSmiles(smiles),\n", + " orig := remove_derivatization_groups(mol=der),\n", + " add_derivatization_groups(mol=orig)\n", + " ],\n", + " legends=['MeOX derivatized','Derivatization removed','TMS derivatized'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Read the input file\n", + "\n", + "The file is parsed line by line, errors are reported and ignored otherwise. \n", + "\n", + "The result is `mol[]`, a list of pairs (_original SMILES_, _RDKit molecule_)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "#smi_file='NIST_Si_100.txt'\n", + "#smi_file='NIST_Si_all.txt'\n", + "#smi_file='NIST_SMILES.txt'\n", + "smi_file='NIST_195_200.txt'\n", + "with open(smi_file) as f:\n", + " mols = list(filter(lambda p: p[1], [ (smi.rstrip(), Chem.MolFromSmiles(smi)) for smi in f ]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Essential statistics\n", + "\n", + "Count occurrences of (one-),di-,tri-methylsilane, TMS attached to -O, -N, -S, and methoximine. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# total 4\n", + "# with SiMe: 1\n", + "# with SiMe2: 1\n", + "# with SiMe3: 1\n", + "# with ONSSi: 1\n", + "# with MeOX: 3\n" + ] + } + ], + "source": [ + "SiMe1=Chem.MolFromSmarts('[Si][CH3]')\n", + "SiMe2=Chem.MolFromSmarts('[Si]([CH3])[CH3]')\n", + "SiMe3=Chem.MolFromSmarts('[Si]([CH3])([CH3])[CH3]')\n", + "ONSSi=Chem.MolFromSmarts('[O,N,S][Si]([CH3])([CH3])[CH3]')\n", + "\n", + "print('# total',len(mols))\n", + "with_sime1 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe1),mols))\n", + "print(\"# with SiMe:\", len(with_sime1))\n", + "with_sime2 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe2),mols))\n", + "print(\"# with SiMe2:\", len(with_sime2))\n", + "with_sime3 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe3),mols))\n", + "print(\"# with SiMe3:\", len(with_sime3))\n", + "with_onssi = list(filter(lambda m: m[1].HasSubstructMatch(ONSSi),mols))\n", + "print(\"# with ONSSi:\", len(with_onssi))\n", + "\n", + "MeOX=Chem.MolFromSmarts('C=NO[CH3]')\n", + "with_meox = list(filter(lambda m: m[1].HasSubstructMatch(MeOX),mols))\n", + "print(\"# with MeOX:\", len(with_meox))\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Inspect whatever from the sorted categories" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAZhUlEQVR4nO3deVwTZ/4H8G/CKQIiHnjhwRFAxQOoF6hYdVVErW1p3SrWdZX+vAARBNQW16rFWguurpV2dUvb11qPVSuLQovSeqwWtVo8MCiCIsghyA1JSJ7fH2NRKVY0yUxCPu+/bDLM84Hqh2cmz8yIGGMEAAAvSyx0AAAA/YYaBQBQC2oUAEAtqFEAALWgRgEA1GIsdAAAHaNQUGEh1dWRnR3Z2gqdBvQAZqMAv7l3j+bNo06dqG9f6t+fOnWiwYNpzx6hY4Guw2wUgIiIcnJozBgqKaE5c2jyZLKyohs3aMcOeucdunKFNm4UOh/oLhGW3wMQEfn40P/+R999R9OmPX6xpoZefZUuXKATJ8jXV7BsoNtwUA9AlJFBZ87Qm28+1aFEZGlJn31GjFFcnEDJQA+gRgGI0tOJiF57rYW3PD2pb1/66SdSqXgOBfoCNQqtxRjbu3dvY2Oj0EG0IDeXiMjJqeV3nZ2pspLKy/lMBHoENQqt9e9//3vWrFkDBw48duyY0Fk0oaLi8Z/r64mI2rVreUsLCyKiujrtZwK9hBqF1urUqZOjo6NUKvXz85s5c2ZOTo7QiV7Ww4cUFUV9+1Je3qNXOnQgerpYm21PRDY2fGQDPYQahdaaPHlyVlZWfHy8tbX14cOH3dzcQkJCqqqqhM71IpRK+uwzcnamTZuopoZOnHj0ev/+RESZmS1/ydWrZG9P1tb85QT9wgBeUGFhYVBQkJGRERF17tw5Pj6+sbFR6FCtcOIEGzSIETEiNm4cu3z58Vu5uczIiI0YwVSq5l916BAjYosW8ZkU9AtqFF7ShQsXfHx8uF/GHh4ep06dEjrRs929ywIDHxWovT1LTGxhm3nzGBH74IOnmlQqZb16sfbtWU4Ob2FB72D5Pbw8xtiBAwciIiLu3LlDRP7+/tu3b+/Tp4/QuZ5QW0ubN9OmTdTQQO3bU3g4RUWRuXkLW1ZVkZ8fnTlDgwfTlClkaUlZWXTwIKlUtGcPzZzJe3TQH0L3OOi92tra2NhYS0tLIrKwsIiMjKyurhY6FGMqFdu3j/XuzYiYSMQCAtidOy1vWVzMKisZY0wmY9u3M29v1rEjMzNj/fqxBQvYjRt8pgZ9hBoFzcjPzw8MDBSJRETUq1evxMRE1e/PM/ImI4ONGvXoKN7Li5050/JmcjmLj2cdOrAVK/jNB20KahQ06dy5c8OHD+cOdIYPH37u3Dm+ExQUsKAgJhYzItajB0tIYEply1smJTGJ5FHVzpzZwodLAK2DGgUNUyqViYmJdnZ2RCQWiwMDA4uKivgYWCZj8fHMyooRMVNTFhzMqqpa3lIqZVOnPipQiYT99798xIO2CzUKWlFdXR0TE2NmZkZElpaWMTExDQ0N2hvu8IEDKheXR8345pssN7fl7R4+ZJGRzNSUETEbGxYby2Qy7aUCA4EaBS3Kzs4OCAjgjvGdnZ337dun8SEuXbo0duxYIjrl68vc3FhKSsvbKZUsMZF17cqImFjMAgNZcbHGw4BhQo2C1qWlpQ0cOJAr0/Hjx1+5ckUjuy0pKWm6CqBLly5fffEFe9ZVACdOsMGDH81VfX2fWngPoDbUKPBBoVAkJCR07tyZiIyNjYOCgkpLS196b3K5PD4+3sbGhohMTEz+aG+tWXgPoB7UKPCnrKwsODiYmz/a2trGx8crFIoX3ckPP/wwYMAAbm47YcKEq1evtrxdTQ2LiWHm5oyIWViwmBhWX6/uNwDQEtQo8O369euTJk3ietDV1fXYsWOt/EKpVOrv7990pjUpKanl7Vq/8B5AE1CjIIwjR444ODhwnejv75/zhxetP3z4MDIykvvc38bGJjY29lmf+589ezZ79uxHR/HDhzP+F66C4UGNgmBkMhl32z0iMjU1DQ4OruQuynxC61ehFhQUBAUFicXigV2
6KCWSP1p4D6BRqFEQWFP9EVH37t0TEhKUv9Xfjz/+OHjwYG7GOnbs2EuXLrW4h/r6+vXr17dv356IzM3NV61aVVtTw+N3AIYONQo64fz5897e3lxjenl5HTx4sJVX6L/QyQEAbUCNgq5QqVT79u3r3bt30+3HLCwsPvzww7q6uha3f/KjKjc3t5RnLbwH0DI8RAR0hUgkCggIyMrKarqKVCaTpaWlyWSyZluWl5eHhIS4u7unpqZyC6euXLnSVKkAPMNtm0EX5ebmzps37+TJk0Tk4eFx8eJF7nWFQvGvf/1r9erVDx48MDEx+ctf/rJhwwZuVT+AUFCjoLvi4uLCwsKIiPtbmpaWFhoaeu3aNSKaMGFCXFxc0zWmAAJCjYJO4z5lys7OXr169f79+4nI2dl5w4YNTXc8ARAcahR0GlejZmZmMpnM0tJyxYoV0dHR3JlTAB2BGgWdxtWoWCyePXv25s2buXX4ADoFNQo6javRX375ZejQoUJnAWgZahR0Glej+FsKugzrRgEA1IIaBQBQC2oUAEAtqFEAALWgRgEA1IIaBQBQC2oUAEAtqFEAALWgRgEA1IIaBQBQC2oUAEAtqFEAALUYCx0A4I+MGLFF6AgAz4E7PIFOE4mIiPCXFHQZDuoBANSCGgUAUAtqFABALahRAAC1oEYBANSCGgUAUAtqFABALahRAAC1oEYNQlVVVWRkZFZWltBBDEVWVlZkZGRVVZXQQYAPuBjUIKxbt27Lli2ZmZnHjh0TOotBCAsLS0lJUSqVn3zyidBZQOtwMahBKC8vl0gkZWVlSUlJ/v7+Qsd5Afp4MWhSUtL06dM7duyYnZ3duXNnoeOA1uGg3iDY2tquWbOGiEJDQ2UymdBx2jK5XB4eHk5EMTEx6FADgRo1FEuXLh0wYEBOTs727duFztKW/f3vf8/OznZ1dV28eLHQWYAnOKg3IGlpaRMnTrSyssrOzu7WrZvQcVpFvw7qS0pKJBJJZWXl0aNHp0yZInQc4AlmowZkwoQJU6dOra6u/uCDD4TO0jatXr26srLS398fHWpQMBs1LLdu3Ro4cKBCofj555+9vLyEjvN8ejQbvXTp0iuvvGJkZJSZmeni4iJ0HOAPZqOGxcnJaenSpSqVKiQkBL9BNSs0NFSpVAYHB6NDDQ1mowanurpaIpEUFRXt3bv3rbfeEjrOc3z1FRHR3LlC53ievXv3zpo1q2vXrtnZ2R06dBA6DvAKNWqIPv/88/fee8/e3v7GjRsWFhZCx2kuP58UCurZk8zMnnr93j1ijOztBYr1bPX19f3798/Ly/v8888XLlwodBzgGw7qDdGCBQu8vLzy8/N18xobX19ydKRVq5q/PmkS+foKkOe5Pv7447y8vCFDhsyfP1/oLCAAzEYN1JkzZ0aPHm1ubp6VldWnTx+h4zzF0ZFu3yYjIzp/noYOffz6gAHU0EA5OcIla8m9e/dcXV1ra2t//PHHsWPHCh0HBIDZqIHy9vYOCAior69f9ftZnw5wdqaOHen//o9UKqGjPM/KlStra2vffvttdKjBQo0ark8++cTCwmLPnj2nTp0SOktz1tb0/vuUkUEJCUJH+UNnz5799ttv27VrFxsbK3QWEAxq1HDZ29uvWLGCMRYaGqrSvVnf4sXk7k5RUVRYKHSUZ1CpVKGhoYyxiIiIvn37Ch0HBIMaNWjR0dG9e/f+5ZdfEhMThc7SnLExxcVRVRWFhwsd5Rm+/PLLjIyMnj17rly5UugsICTUqEFr167dxo0biSg6OloH7zE8fjzNmkV79lBamtBRfqe6upq7adbHH3/cvn17oeOAkFCjhu6dd97x8fEpLi7+6KOPhM7Sgrg46tCBgoNJoRA6ytPWr19///79kSNH/vnPfxY6CwgMNWroRCLR1q1bxWLxp59+evPmTaHjNNetG61dS1lZtHOn0FGecPv2be6HFh8fL+Iu+wcDhhoF8vDwmDt3rlwuF/AcX2UlRUTQN9+
08NbSpTRoEK1bR01nHb75hiIiqLKSz4BPCQsLk8lk77777rBhwwQLAbqDATBWVFRkbW1NRCkpKTwPrVSyL75gdnaMiPXsyeRy5uDAPD2f2ub0aSYSMSLm4MDkctazJyNidnbsiy+YUslzXnb8+HEisrKyKiws5Hts0EmYjQIRkZ2dXXR0NBGFhYU1NjbyNm5GBnl708KFVFxMw4bRgQNkYtLCZt7eNG/eoz+bmFBSEo0ZQ8XFtHAheXnRyZO85SWlUhkaGkpEq1ev7t69O38Dgy4TusdBV8hkMmdnZyLavn07D8Pl57PAwEdzzJ49WWIiU6kevbV0KYuMbL59SQl7+222dOnjV44cYf36MSJGxPz92e3bPKRm27ZtIyIHB4eGhgY+xgN9gBqFxw4ePEhEHTt2fPDggfZGqa1lsbHM0pIRMQsLFhnJqqtfcld1dSw2lllZMSLWrh2LjGRVVRrN+rTy8nLuKXWHDx/W4jCgb1Cj8JQ//elPRLRs2TIt7f/IEda37+MpZG6uBvZ5794zJ7aatXTpUiJ69dVXtbJ30FuoUXjKtWvXjI2NuSdhaHbPFy+y0aMfFaiHBzt5UrO7ZxkZbOTIR/t/5RX2v/9peP/Xrl0zMTHRxk8G9B1qFJpbsmSJZudc9+/fDw+/KhYzItatG9u1S1sfryuVbNcu1q0bI2JiMQsPv3r//n1N7XzSpElEtPTJs7MAjDHUKPxeeXl5p06diOi7775Tc1dyuTw+Pt7a2trFZYi5eWNwMKuo0EjGP1JTw2JimIWF0s3Nq3379jExMfX19Wru8/Dhw9xZ49LSUo2EhLYENQot2Lp1KxE5Ojqq83n04cOHnZycuAUh06dPv3mzXIMJn+vmzfLp06dzozs5OanzoZBMJpNIJES0bds2DSaENgM1Ci1QKBTu7u5EtGnTppf48hs3bvj5+XEV5uLikpycrPGErXT8+HHuG+FOU/z6668vsRPuXqJubm5yuVzjCaENQI1Cy1JTU4loyJAhyhc5kVleXh4ZGWlqasodAsfGxspkMu2FbA2FQpGQkNClSxciMjY2DgoKKikpaf2Xq1SqkSNHElFqaqr2QoJeQ43CM+3du7f1B/VPtpVYLA4MDHyhttK28vLy4OBgY2Pjl+h3hUKh/mliaMPwSDt4jrKysgsXLty9e7ddu3YDBw4cMmQIEeXk5Hz77bfTpk0bNGgQEZ04cWL58uWZmZlENG7cuPj4eO51XSOVSpcvX37s2DEicnFx+fTTT5tOPhARY+zSpUvZ2dmVlZW9evXy9va2sbEhot27dzc0NCxevFiw3KDjhO5x0GmfffaZubn5k39hxowZwxhLTk4mot27d9+6dWvGjBncW46OjocOHRI68vMdOnTI0dGRyzxjxoxbt24xxgoLC5vdrsnU1PT7779njHl4ePTr10/o1KC7UKPwTBcuXBCJRI6Ojt9///2DBw/y8/P/85//JCYmMsYaGhoKCwvr6uoOHTpERJpaV8SbppVY9NuVnVOnTiWiiIiI3Nzc8vLyixcvrlu3jlveVFpaqlMnKEDX4KAenik2NjY6OvrQoUOvvfbaH2y2efPmOXPm6OPtju7fv//NN99EREQ0NjZaWVn179//4sWLQocC/YMb5cEzlZaWElGL981LTU21sLD4+uuviSgiIkIfO5SIunfvHhERQUQ1NTUNDQ3PukOgj4/PgAED+I0G+gQ1Cs/EnStctmzZkSNHlErlk28plcr6+no+70yqVTY2NhKJJDMzc8GCBffv32/2bn19fX19vSDBQC+gRuGZAgIC5s6dW1RUNGPGjN69e69ater27dtCh9KW3bt3d+7cedeuXfb29n5+fgcPHlTo2lP0QFehRuGZxGJxYmLixYsXQ0JClErlRx995Orq+uWXXwqdSyu8vb1zcnJ27do1evTo1NTUN954w8fHRwcfOg06CDUKz+Hh4REfH19QUJCYmGhmZrZo0aKSkhKhQ2mFtbX1/Pnz09P
T8/LyJk6cmJGR8eGHHwodCvQAahRaxcjIaO7cuUuWLGloaDh37pzQcbTL3t7+wIEDRkZGP/30k9BZQA+gRuEpcrl8y5YtSUlJLb4rFouJSKVS8RtKANx32uyDtaSkpC1btsjlcoFCgY5CjcJjycnJ7u7u4eHhISEhcrk8JSUlOTm5qTWuXLnyz3/+08zMbMSIEcLm1LjGxsbNmzffuXOH+0+ZTBYVFaVUKn19fZu2kcvlISEh4eHh7u7u3EVcAI8Ivf4fdIJUKuUu4yEiiUSSlJTEGJs2bRoRWVlZubu7u7i4iMViU1PTnTt3sicuBhU6uGacPXtWLBaLRKJ+/foNHTq0Y8eOROTp6VlWVsaeuBg0LS1t4MCB3E9p/PjxV65cETo46ARcxWToKioqYmNj4+Li5HK5jY1NVFTU8uXLuTvdKZXKEydOnD59Ojc3l4gkEklgYGCfPn2IqLCwMD09feTIkQ4ODgJ/AxpSUFBw+PBhqVRaVFTUo0cPHx+f119/nTu0T0lJkcvl3E2gGxsbd+/evXr16gcPHhgbG8+fP3/Dhg3c40LBcAnd4yAYpVKZmJjYtWtX+u3WdsXFxa3/8vT09FOnTmkvHp8KCgpUL/I00bKysqbb7tna2sbHxysUCu3FAx2HGjVQ6enpgwcP5n6V+vr6Xr58+YW+nLsjSf/+/dvADeHlcrmbm9uwYcPu3r37Ql+YlZU1efJk7mfo6up67NgxLSUEHYcaNTh3794NDAzk/vHb29tzd2x6UU2PJ9q6davGE/IsLi6O1Hjw1JEjR5rObPj7++fk5Gg8Ieg41KgBqa2tjYmJ4e4famFhoeat7Y4cOUL6/7DMsrIy7jGo3KdqL0cmkzXdds/U1DQ4OLiyslKDIUHHoUYNgkql2rdvX+/evYlIJBIFBATcuXNH/d1yh7SLFy9Wf1dCWbRoERGNHz9e/V0VFhYGBQVxn0p17949ISHhhR5jBfoLNdr2nT9/ftSoUdxRp5eX1+nTpzW15+vXr5uYmBgZGb3cEzcFd/XqVWNjY2NjYw0uXTp//ry3tzf30/b09NTgTxt0Fmq0LSsoKGiaH/Xo0UMb86Pg4GAiGjdunGZ3y4+JEycSUUhIiGZ3+/u5f15enmaHAJ2CGm2buLN1VlZW2j5bV15ezq2aPHjwoDb2rz0HDhzglis9ePBAG/v//Znouro6bQwEgkONtkE8f3b8j3/8g4gcHBz06FlMMpnMycmJiHbs2KHVgfLz8wMDA0UiERH16tUrMTHxhRaogl5AjbYp169fnzRpElegbm5uKSkpPAza2NjIPU5548aNPAynERs2bODWvfKzbP7JVbpjx469dOkSD4MCb1CjbQR3XY2RkVHTdTWNjY28jX78+HEisrS0LCgo4G3Ql1ZUVMQtTkpNTeVt0N9fM1ZUVMTb6KBVqFG9J5fLExISuBOUJiYmQUFBgizknDlzJhG9++67/A/9oubOnUtEr7/+Ov9DP3z4MDIy0szMjIhsbGxiY2Nfbs0/6BTUqH774Ycfmh5aOWHCBAHvOZSTk2Nubi4SiX7++WehMrTGhQsXuFtVZWdnC5VBKpX6+/tz/9ecnZ3VWfkPugA1qq+ys7MDAgKa/inu27dP6EQsKiqKiEaMGKGzn6KoVCpuUeeqVauEztL8V+DVq1eFTgQvCTWql6KiokxMTIioQ4cO3P3YhU7EGGPV1dU9evQgoq+//lroLC376quviMjOzk5HLtbknjXQoUMH7oRMVFSU0IngZeDu93pJqVQqlcrAwECpVBoWFsZVquAsLS25T8CjoqJqamqEjtNcXV3dmjVriGjTpk3cR0yCMzExCQsLu337dnBwsEqlMoSns7RJuG2zXqqqqrp9+/aQIUOEDtIcY2zEiBEZGRlr1qzRtcdqrlmzZsOGDZ6enhkZGdyVXTrl8uXLDg4OOtLv8EJQo6Bh586dGzVqlJmZ2fXr1/v16yd0nEfu3r3r5uZWX19/8uRJHx8foeN
Am6Jzv5NB340YMWL27NkNDQ0rV64UOstjYWFhdXV1c+bMQYeCxmE2CppXUFDg4uJSW1ubnp7+5MM1hXL69OkxY8a0a9cuKyuLu2MIgAZhNgqa17Nnz8jISCIKDQ1t9qh3/qlUqtDQUMZYdHQ0OhS0AbNR0IqGhgY3N7e8vLydO3e+9957AibZuXPnokWL7O3tb9y4YWFhIWASaKtQo6At+/fvf+utt2xtbW/evGlraytIhoqKColEUlpaun///jfffFOQDNDm4aAetCUgIMDX17e8vHz9+vVCZfjb3/5WWlrq4+PzxhtvCJUB2jzMRkGLfv31V09PT5FIdPny5aYLH3lz48aNQYMGKZXKjIwMT09PnkcHw4HZKGjR4MGD//rXvzY2Ni5fvpz/0cPCwhQKxcKFC9GhoFWYjYJ2lZaWSiSSioqK5ORkPz8/3sZNTk729/e3traWSqXdunXjbVwwQJiNgnZ16dLl/fffJ6KQkBCZTMbPoAqFYsWKFUS0du1adChoG2oUtG7ZsmUuLi63bt3asWMHPyNu27ZNKpU6OTktWbKEnxHBkOGgHvhw9OjRqVOn8nOI3XQa4ejRo1OmTNHqWACE2Sjww8/Pb8qUKVVVVWvXrtX2WGvWrKmoqJg4cSI6FPiB2SjwhFt+ZGRklJeXZ2dnp6VRiouL+/btq1QqMzMzXV1dtTQKwJNQo8Cf3bt3e3t7u7i4aHUUqVR65syZ+fPna3UUgCaoUQAAteDcKACAWlCjAABqQY0CAKgFNQoAoBbUKACAWv4fLtN+WpE5rVoAAADHelRYdHJka2l0UEtMIHJka2l0IDIwMjIuMDkuMQAAeJx7v2/tPQYg4GWAAEYg5gFibiBuYGRjSACJMbMxaABpZhYOCM3EDqGZ+RQUgDQLQhkqTUCaG2gbI5MGEyOzAjMLAwsrSCULOwMzBwMHJwMHFwMHN4MIg/gsqKvAgOeh27L9DAwSDhBuhzoDwwV7EAsobo8Q/6AKJOwQ4gr7QezDvTz2D92+gdmFyswOh3t1oHrVgPo27Ieq34+kfj+S+gNI6g/A1IsBAGcDLeyCibsUAAABJXpUWHRNT0wgcmRraXQgMjAyMi4wOS4xAAB4nJVT226DMAx9z1f4B4hscwl52AOXapq2grSy/kPf+/+aHUSdqmJiMZFsc3I4sY0DXd/j5+0Oj8WjcwD4xxNjhGuJiO4M6kB/ev+YYFi6fssM88+0XIAYiOSM2DO2W+bzliEYoCDPMWLZAvsy1MIN6DEtO8oCfGTJN7yDK2GGI3wVTFnW0x5fDZebERb4+mW4dl9v1YZvks66WuHkI1fqvRIHAbLHFSi0YQfXJlwdWwyNEu7eKKrS4h9SCbX4h8RS6tMRucQr8oDg0zQ+zcI6Hf08jTYdamwzoFZaq+UdVNZRkl1b40h2Y33RMFj1NWytyHo2WilJN2aVSgnK6pESnN8lV67x9k+I734BSF+kvEXIWEUAAACLelRYdFNNSUxFUyByZGtpdCAyMDIyLjA5LjEAAHicbYwxCoAwDEWv4thCE9KYWoM4ddfBUTyAd/DwprqICn/4/P94pbhx9pNbl31zxVuKf/TmcBCRVUkCY5tTGChE7NjKZyeMzx3o5giTKLV9MJOy2DMwklTEiCz1siVpTxdz2eBtgR8NfD3wEvnjBCvyLmw5t8JHAAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with_sime2[0][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "[08:23:48] Molecule does not have explicit Hs. Consider calling AddHs()\n", + "[08:23:48] Molecule does not have explicit Hs. Consider calling AddHs()\n" + ] + }, + { + "data": { + "application/3dmoljs_load.v0": "
\n

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n jupyter labextension install jupyterlab_3dmol

\n
\n", + "text/html": [ + "
\n", + "

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n", + " jupyter labextension install jupyterlab_3dmol

\n", + "
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "draw3d(with_sime2[0][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[08:23:49] Molecule does not have explicit Hs. Consider calling AddHs()\n", + "[08:23:49] Molecule does not have explicit Hs. Consider calling AddHs()\n" + ] + }, + { + "data": { + "application/3dmoljs_load.v0": "
\n

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n jupyter labextension install jupyterlab_3dmol

\n
\n", + "text/html": [ + "
\n", + "

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n", + " jupyter labextension install jupyterlab_3dmol

\n", + "
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "draw3d(with_onssi[0][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAASLklEQVR4nO3de1TUZf4H8PcMtxAYIRADxUIgxONdVi2N44U0Ny1LcV3XTFMH199G+zun01Dtb/Wsnt1hyz3UabczbKnVbltgauS6bpibmWkBZaumCZgiN+M+MILcnt8fM0IgTMDA92G+836d/kjnkflwsjef5/J9RiOEABER9ZdWdgFERM6NMUpE5BDGKBGRQxijREQOYYwSETnEXXYBRDTAzpw5o9VqIyMjvby8ZNfiEtiNEqnNs88+O2HChMOHD8suxFUwRonUJi8vD0BkZKTsQlyFhsfvidSktbV12LBhLS0t9fX13t7esstxCexGiVTlypUrTU1No0ePZoYqhjFKpCr5+fngjF5ZjFEiVWGMKo8xSqQqjFHlMUaJVIUxqjzGKJGqMEaVxwNPROrR1tbm4+Nz48aNuro6Hx8f2eW4CnajROpx9erVxsbG0NBQZqiSGKNE6sHnl6RgjBKph3VhNCoqSnYhroUxSqQeBQUFACIiImQX4loYo0TqwW16KRijROrBtVEpeOCJSCWEED4+Pg0NDWaz2c/PT3Y5LoTdKJFKFBUVNTQ0hISEMEMVxhglUgkujMrCGCVSCcaoLIxRIpWwxihPOymPMUqkEjx7LwtjlEglOKmXhQeeiNRACOHn52exWGpqaoYPHy67HNfCbpRIDUpLSy0WS3BwMDNUeYxRIjXgjF4ixiiRGlgfA+X+khSMUSI14N1OEjFGlVNZWblz5862tjbZhZAKcVIvEWNUOY899tjTTz+9dOnS6upq2bWQ2vBuJ4kYo8p55plnRo4ceejQoSlTpmRnZ8suh1TFOqlnjErBGFXO3Llzc3JyZs6cWVhYGBcX9/rrr8uuiFSirKysrq4uKCgoICBAdi2uiDGqqNGjRx87dkyv1zc2Nm7cuDExMbGpqUl2UeT0uDAqF2NUaV5eXiaT6Y033vD29k5LS7v33nuvXLkiuyhyboxRuRijcqxdu/bEiRPh4eG5ubmxsbFHjhyRXZE0zc2orkZ1NXrqy62vtrYqW5ZT4d1OcjFGpZk6dWp2dvbChQsrKioeeOCBlJQU17zf4OhR3H47br8dq1d382prq+3Vb79VvDLnwbud5GKMyhQYGHj48GGj0SiESE5OXrZsWW1treyipHnvPXzwgewinBMn9XIxRiXTaDQGg+H999/39/fPzMycMWPGuXPnZBclgbs7ACQl4fp12aU4IcaoXIzRIWHJkiXZ2dkTJ068ePHirFmzMjIyZFektPBwzJ+Py5exfbvsUpxNeXl5bW2tv79/YGCg7FpcFGN0qIiMjPz888/XrVtXX1+/cuXKxMTE5uZm2UUpymiEVoudO+GS7Xj/cWFUOsboEOLt7b17926TyeTp6ZmWlhYfH19WVia7KOX85CdYuxbNzdi8GS652dZPfAxUOsbokKPX6z/66KOQkJBPPvkkNjb25MmTsitSzh/+AJ0On36KXbtkl+I8rI+BshuViDE6FM2ZMyc3N3fOnDnFxcULFiwo3b1bdkUKueMObNsGAM88g/JyycU4Cx4alY4xOkSFhIQcPXo0KSnp/2bMCHniCaxZ4yJ72E8+iUmTUFUFg0F2KU6C2/TSMUaHLg8Pj5deeunZxET4+ODvf8fs2bh0SXZRg87dH
X/5CzQa7NmDL76QXY0z4BaTdIzRIe/nP0dODmJicPo0pk7FgQOyCxoY336LY8e6f2n2bKxbByHw1FPca/oRlZWVVVVVOp1uxIgRsmtxXYxRZzBuHD7/HMuXw2zGo48iORlOe4V+UxMyMnD//YiJwcaNPabkCy8gKAinTuHdd5Wtz9mwFR0KGKNOws8PGRm2o5UpKVi6FM52hX5+PgwGhIVh5UocOQIfH8yfD4ul+8GBgdixAwCef17JGp0PF0aHAsao89BoYDDgyBEEB+PQIUyZAme4Qr+1FUeOYOVKjBuHP/4R33+PmBgYjSgshMkEX98e/+CmTZg1C10uEbx2jdP8ThijQwFj1NnMnYucHMycicJCxMUN5QOWxcVISUF4OO6/HxkZ8PBAQgKysvDNNzAY8KPXtGu1eOUVuLl1/E51NWbPxrJlcOH7W7pijA4FjFEnFBaGY8ewaRMaG7FhAxITe7yqU4a2Nlv7eeedSE7G1asYNw5GI4qKkJ6O+Pg+fKnp05GY2PHLb75BRQUyM3HPPbhwYcALd0qM0aFA45p3XKrEm29i82Y0NCA2Fnv34s475ZZTUoK33sKrr9pm4l5eeOgh6PVYsAAaTY9/qqwMx4/D1xeLF3fzam0tPvwQABYtgk6HvDwsX44zZ+Dri127kJAwON+J8xgxYkRFRUVpaekdd9whuxbXxRh1cl9+iRUr8N13CArCP/7Rt2ZvoLS2ZmfV/O7Pgf/6l+2O+pgYJCZi7dofn7n3Q0MDtmzBnj3QaPDkk3jxRXh4DPy7OIWampqAgABfX1+z2ayx85OKBhkn9U5u2jRkZ2PhQlRU4IEHkJKi6BZMaSlSUhAZOfJ3/3PwINzcbKuf587hqacGJUMBeHtj926YTHB3x8svIz4ernR/SyftM3pmqFyMUecXGIhDh7B1K4RAcrISWzBtbTh8GI88gjFjkJyMy5fDzOde/lNLcbFt9VOB/6n1ehw9ipAQfPIJYmPhSve3dLDeS+v2w204koExqgpubti2De+/D39/ZGZixozBurPz2jWkpCAqCosX48ABaLXW/lNz5r9P/q97UNCgvGdP5szB6dOYPx/FxYiLQ0qKou8+FOTk5ADw9PSUXYir49qouly8iEcfxblz8PNDdjaiozu9WlSEjz9GSQkaGhAYiGnTMGOG7eM77BMCH32EtDQcOADrZdKjR2PDBmzZguDgQflGeq2lBb/5jS1Df/ELpKVh2DC5FSlk7969a9eubWhoePzxx/fs2SO7HNcmSGWuXxfr1omHHxZtbR2/eeGCePBBAXT9JzRUvPaava9WVSVMJjF+vG28m5uIjxfp6aKlZbC/jz55+23h4yMAMWWKKCiQXc1gamxsTE9Pv/PmqQwPD49Tp07JLsrVMUZVqrGx499PnRIBAQIQ/v7il78Ur70m/vY38fvfi2nTbOH4q1918xVycoReL7y9bWNGjRIGg7hyRbHvoIvnnhMnTtgbcP68iIkRgNDpxP79SpWloLNnzyYlJfn7+1sDVKPRTJ48+dChQ7LrIsao6lVXi7AwAYg5c0R5eaeX2trEiy8KjUYAYvfujvEmk5gwwZaeWq2t/WxuVrz0DgcOCEB4eopXXrE3zGwWy5cLQGg0wmAQra1K1TeYrO1nfHx8+3b89OnTX3311fIu/zVJHsao2m3fLgAxYkTXDG2n19uazaYmIYQwGGwBGhYmtm0TRUVKFtuT5mZhMNgCf/VqUV/f48i2NmE0Cjc3AYif/lRUVSlY5UC7ePGiwWAIurlzp9Pp9Hr9V199Jbsu6ooxqnYREQIQzz/f44ArV4RWKwDxz38KIURBgXjwQZGZOdRWP4UQ+/cLnU4AYvJkkZ9vb+R//iOCgwUgxowRX3yhVH0D5MaNG7e2nyaTqa6uTnZp1D3GqKqVlNhay+PH7Q2bNEkA4rnnlCqr/y5csG136XRi3z57IwsLxcyZAhC33SZef12p+hyTl5dnMBiCbx5+8PPz0+v1u
bm5suuiH8EYVbXjx20x+v339oatWiUA8bOfKVWWQ8xmsWJFrxZAGxrEE0/YRv72twdu3LihYJl90NLSkpWVtWTJkvb2c/z48UajscqplyRcCWNU1Q4etMWo/QTZvFkAYtEipcpyVFubSE0V7u4CEIsXi8pKe4PT0sSCBacA3HPPPUVDY6m3XVFRkdFoDAsLs6bnbbfdlpCQkJWVJbsu6hvGqKr9+9+2GL1+3d6wDRsEIJYuVaqsgfHxx2LkSNtmmP0F0Nzc3PDwcABBQUFDIaRaW1uzsrISEhLan+McN26c0WistP8DgYYqxqiqZWfbYtT+ec+HHhKAePxxhaoaOFevilmzbAugf/2rvZEVFRULFy4E4ObmZjQa2374bIKCiouLjUbjmDFjrOnp5eXF9lMFGKOqVl9vm/pmZtobZj1YunOnUmUNpMZGkZRk+2Hx2GP22u6WlpatW7dqtVoADz/8cE1NjWJFtref7jcfvY2OjjYajTz7qQ6MUbWbPftHOs2TJ20hdPq0clUNtDffFMOGCUBMny6++87eyA8++MD6IFB0dPTZs2cHu7CSkhKj0XjXXXd1aT9ltcM0GBijavfWWwIQHh7dp2RLi5g7VwDi3nsVr2yAffmlCA8XgAgKEh9+aG/kxYsXJ06cCMDX1zc9PX0wimlvPz1u3ikdFRVlNBq/t39kgpwTY1TtWltFXJwAREiIOHas00uVlWLlSlvIquJwYkWFWLRIAOK++17bunWrnY7v+vXr69atsz6ZnpSU1GR9gmsglJaWGo1G644WAE9PT7afqscYdQFlZWLqVNv5ydhYsXmz+PWvxSOPCD8/AQgvLzE4HZkULS3ihRfOWw9gLl++3Gw22xlsMpms3WJcXFxpaakj73tr+xkZGWk0Gq9du+bIlyWnwBh1DRaL2LFDhIZ2uiXPy0ssWyYGf31QeQcPHgwICABw9913nzlzxs7I48ePh4SEABg1atRnn33Wj/eqqqpKTU2NiIiwpqebm1t8fHx6enrL0HualgYJY9TFnD8vjhwRmZni5El7N3w4v7y8vEmTJlkXQN999107I69duzZv3jzr/k9qamrv3yInJ0ev13t7e1sDdNSoUQaDobCw0OHayckwRkm1Ghoa1q9fb804vV5vZwG0ubnZYDBYR65Zs8Zisdj5stXV1SaTacKECdbxWq2W7aeLY4ySyplMJuunFd133332F0DffvttHx8fAFOmTCno7g59a/s57OanlISGhhoMhsuXLw9a7eQcGKOkfp9++mloaKh13n3C7h3658+fj4mJsV7uuf/mHfo1NTUmk8m6RPDD9rNZ6lXWNHTwI+3IJZSXl69atero0aPu7u47duxon8Lfymw2r1+/ft++fRqNZs2aNQAyMjIaGxsBjBo1asOGDRs3bmy/TIQI4Efakcv44QLo6tWr63veYWtra9u+fbtWq7XePN/efg7g8VJSE3aj5FreeeedTZs21dfXT548+b333ms/qHSrhISEvXv3zps3b9euXe1PcxLdSiu7ACJFrVq1KicnZ/z48V9//fW0adP279/f08iWlhYAW7ZsYYaSfYxRcjnR0dEnT55ctmyZ2WxesWJFSkpKt8Py8vIAREZGKlsdOR/GKLkinU63b9++1NRUNze3boNSCHHp0iUAdmb9RFZcGyWXdunSpbFjx976+1evXh0zZkxISEhJSYnyVZFzYTdKLq3bDAWQn58PzuipdxijRN1gjFLvMUaJumGNUS6MUm8wRom6YY3RqKgo2YWQE2CMEnWDk3rqPe7UE3UlhPDz87NYLDU1NcOHD5ddDg117EaJuiotLbVYLMHBwcxQ6g3GKFFXnNFTnzBGibqyPgbK/SXqJcYoUVcFBQXgaSfqNcYoUVec1FOfMEaJuuLdTtQnPPBE1JVOp6urq6uqqrJ+2D2RfexGiTopKyurq6sLCgpihlIvMUaJOuHCKPUVY5SoE8Yo9RVjlKgT3u1EfcUYJeqEdztRX
zFGiTrhpJ76igeeiDrx9/evra2tqKgIDAyUXQs5B3ajRB3Ky8tra2v9/f2ZodR7jFGiDlwYpX5gjBJ14GOg1A+MUaIO1rud2I1SnzBGiTrw0Cj1A2OUqANPO1E/MEaJOnCLifqB50aJbCorK4OCgnQ6XW1trexayJmwGyWyYStK/cMYJbLhwij1D2OUyIYxSv3DGCWyYYxS/zBGiWwYo9Q/jFEiG8Yo9Q8PPBEBQE1NTUBAgK+vr9ls1mg0ssshZ8JulAj4QSvKDKW+YowSAbzbiRzAGCUCePaeHMAYJQJuXpHHu52oHxijRAC36ckBjFEigGuj5AAeeCJCXV2dTqcbNmxYfX09d+qpr9iNEnW0osxQ6gfGKBEXRskhjFEixig5hDFKxE+yI4cwRol49p4cwhgl4qSeHMIDT+TqLBaLn5+fl5eXxWLRatlYUJ/xLw25uvz8fCFEREQEM5T6h39vyNXx+SVyEGOUXB33l8hBXBslV1dfX5+Xlzd8+PCxY8fKroWcEmOUiMghnNQTETmEMUpE5BDGKBGRQxijREQOYYwSETnk/wFF9Q7vAtXCxQAAALJ6VFh0cmRraXRQS0wgcmRraXQgMjAyMi4wOS4xAAB4nHu/b+09BiDgZYAARiBmB2I2IG5gZGNIAIkxQ2gmJjYGDSDNzMIOoZk5IDQTqjpGZm6gOYxMDEzMGkzMLAosrAxAFSIM4nFQC8CAPXS9oMMD+0P2IE6ILYvDui4WWxB7xSkd+xx79v0g9r/pjPuV75+yA7GDH37aL5GksQ/M5nQ9UDHNCazXg/Wd3YK7DAdAbDEAACcflHqHbo8AAAELelRYdE1PTCByZGtpdCAyMDIyLjA5LjEAAHicfVFLTsQwDN33FL5AI38SO1mwmLbDCKFpJRi4A3vuL5yikhkpwkkkO362814GqPa2vH59w5/xMgwA+M8upcCnIOJwherAdL68rDDfTtNxM28f6+0dDNQrfD0iT7ftetwQzMCBTYUVKCSyZBEw4G6tknccqnKUmhaWVDo4cRwGtVjUYMSQOAtxBxhh3fOozMnzUpSlNznBBiOFXCwh1RLSGMU6SPXZowS0rKV2MkXC3AHa/sioMecIo9MiSdzreF6XB7V+9Zu2dWn61cVNJg9AmhoeONX5+fLEjTn5SY0f+dFGgrzE7l9wP6/Gx0+7P/wAUL9tx414EZAAAACGelRYdFNNSUxFUyByZGtpdCAyMDIyLjA5LjEAAHicFY05DsMwDAS/ktIGKII3KQiu2Ccf8uMjtTM72O6++n6+v/68l6BkqAKjc3rCEqQIMSAkFfUqWISRNiNhELqUshxmYVUGYwesrrCOpRDx3eoMUduMsWb6KTnMdB8MRcqKuVcZxFRwv3/C2h3rm7uNYgAAAABJRU5ErkJggg==", + "text/plain": [ + "" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with_meox[0][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "with open('NIST_ONSSiMe3.txt','w') as f:\n", + " for m in with_onssi:\n", + " f.write(m[0]+'\\n')\n", + " \n", + "with open('NIST_SiMe3.txt','w') as f:\n", + " for m in with_sime3:\n", + " f.write(m[0]+'\\n')\n", + " \n", + "with open('NIST_MeOX.txt','w') as f:\n", + " for m 
in with_meox:\n", + " f.write(m[0]+'\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Run the in-silico derivatization\n", + "\n", + "Iterate over the `mol[]` list (read from file above), remove derivatization groups from each entry, and try derivatization several times to leverage from the probabilistic behaviour). Assemble the results.\n", + "\n", + "This can be time consuming, expect about 5,000 entries per minute per core. Memory consumption can also grow to several GB." + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def process_one_mol(mol):\n", + " return (\n", + " mol[0],\n", + " Chem.MolToSmiles(remove_derivatization_groups(mol[1])),\n", + " { Chem.MolToSmiles(add_derivatization_groups(mol[1])) for _ in range(42) }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.99 ms, sys: 23.6 ms, total: 30.5 ms\n", + "Wall time: 45.4 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "with ProcessPoolExecutor(max_workers=cpus) as executor:\n", + " out = executor.map(process_one_mol, mols)\n", + " \n", + "out = list(out)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Write the main outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "with open('derivs_struct.tsv','w') as tsv:\n", + " tsv.write(\"orig\\tderiv. removed\\tderiv. 
added ...\\n\")\n", + " for orig,removed,added in out:\n", + " tsv.write(\"\\t\".join([orig,removed,*added]) + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "with open('derivs_flat.txt','w') as flat:\n", + " for orig,removed,added in out:\n", + " for one in { orig, removed, *added }:\n", + " flat.write(one + \"\\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "f8c929fcd037834ef32e0d432f6eb299eb751178a7a29a50d579d391b6611298" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
diff --git a/example/plotting.py b/example/plotting.py
new file mode 100644
index 0000000..94c1884
--- /dev/null
+++ b/example/plotting.py
@@ -0,0 +1,40 @@
+from typing import Optional, Tuple
+
+import py3Dmol
+from rdkit.Chem import AllChem, Mol, MolToMolBlock
+from rdkit.Chem.Draw.IPythonConsole import ShowMols
+
+
+def draw3d(m: Mol, dimensions: Tuple[int, int] = (500, 300), p: Optional[py3Dmol.view] = None):
+    """
+    Show a low-energy MMFF-optimized conformer of a molecule in a py3Dmol viewer.
+
+    Embeds 50 conformers into ``m`` (replacing any existing ones, i.e. ``m`` is modified in place), optimizes
+    them and displays the lowest-energy one whose optimization converged (the first conformer if none did).
+
+    :param m: RDKit molecule object
+    :param dimensions: (width, height) of a newly created viewer; unused when ``p`` is given
+    :param p: existing py3Dmol view to draw into; a new one is created when None
+
+    :return: the result of ``p.show()``
+    :raises ValueError: if no conformer could be generated for the molecule
+    """
+    AllChem.EmbedMultipleConfs(m, clearConfs=True, numConfs=50)
+    opt = AllChem.MMFFOptimizeMoleculeConfs(m)
+    if not opt:
+        # min() below would otherwise fail with an opaque "empty sequence" error
+        raise ValueError("no conformers could be generated for the molecule")
+
+    # opt holds one (not_converged, energy) pair per conformer; 0 means the optimization converged
+    conf = min(range(len(opt)), key=lambda x: opt[x][1] if opt[x][0] == 0 else float("inf"))
+    mb = MolToMolBlock(m, confId=conf)
+
+    if p is None:
+        p = py3Dmol.view(width=dimensions[0], height=dimensions[1])
+
+    p.removeAllModels()
+    p.addModel(mb, 'sdf')
+    p.setStyle({'stick': {}})
+    p.setBackgroundColor('0xeeeeee')
+    p.zoomTo()
+    return p.show()
diff --git a/gc_meox_tms/__init__.py b/gc_meox_tms/__init__.py new file mode 100644 index 0000000..a1f549c --- /dev/null +++ 
b/gc_meox_tms/__init__.py
@@ -0,0 +1,4 @@
+from .derivatization import (add_derivatization_groups, is_derivatized,
+                             process_one_mol, remove_derivatization_groups)
+
+__all__ = ["add_derivatization_groups", "is_derivatized", "process_one_mol", "remove_derivatization_groups"]
diff --git a/gc_meox_tms/__main__.py b/gc_meox_tms/__main__.py
new file mode 100755
index 0000000..efad62c
--- /dev/null
+++ b/gc_meox_tms/__main__.py
@@ -0,0 +1,59 @@
+import argparse
+import sys
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+
+from .derivatization import process_one_mol
+from .utils import read_input_txt, write_flat, write_tab_separated
+
+
+def parse_arguments(argv):
+    """
+    Parse command line arguments.
+
+    :param argv: list of command line arguments without the program name
+
+    :return: argparse namespace with the parsed options
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-n', '--ncpu', type=int, action='store', help='# of cores to use', default=1)
+    parser.add_argument('-r', '--repeat', type=int, action='store',
+                        help='# of repeated attempts to derivatize (may return different results)', default=42)
+    parser.add_argument('-k', '--keep', action='store_true',
+                        help='keep input and stripped derivatization SMILES in output', default=False)
+    parser.add_argument('-f', '--flat', type=str, action='store', help='flat output file, one SMILES per line')
+    parser.add_argument('-t', '--tsv', type=str, action='store',
+                        help='structured output tsv file (original, stripped derivatization, added derivatizations)')
+    parser.add_argument('infiles', nargs='+', type=str, action='store', help='input files')
+
+    return parser.parse_args(argv)
+
+
+def main(argv):
+    """
+    Run the derivatization pipeline: read the input SMILES, process them in parallel and write the outputs.
+
+    :param argv: list of command line arguments without the program name
+
+    :return: exit code (0 on success)
+    """
+    args = parse_arguments(argv)
+    input_molecules = read_input_txt(args.infiles)
+
+    process_one_mol_with_repeats = partial(process_one_mol, repeats=args.repeat)
+    with ProcessPoolExecutor(max_workers=args.ncpu) as executor:
+        # executor.map returns a one-shot iterator; materialize it so that both writers
+        # below see the full results when -f and -t are given together.
+        data = list(executor.map(process_one_mol_with_repeats, input_molecules))
+
+    if args.flat:
+        write_flat(args.flat, data, args.keep)
+    if args.tsv:
+        write_tab_separated(args.tsv, data)
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(argv=sys.argv[1:]))
diff --git a/gc_meox_tms/derivatization.py b/gc_meox_tms/derivatization.py
new file mode 100755
index 0000000..d1c6ced
--- /dev/null
+++ b/gc_meox_tms/derivatization.py
@@ -0,0 +1,138 @@
+import random
+from copy import deepcopy
+from typing import Optional, Tuple
+
+from rdkit import Chem
+from rdkit.Chem import AllChem
+
+tms = '[Si]([CH3])([CH3])[CH3]'
+
+# XXX: ~[O,N,S] would match more than we aim to (-O, -S, -N, =N) but it's unlikely to happen
+tms_match = Chem.MolFromSmarts('*~[O,N,S]' + tms)
+tms_match0 = Chem.MolFromSmarts('[#0]([CH3])([CH3])[CH3]')
+
+meox_match_co = Chem.MolFromSmarts('C([C,c])([C,c])=NO[CH3]')
+meox_match_cho = Chem.MolFromSmarts('[CH]([C,c])=NO[CH3]')
+meox_match0 = Chem.MolFromSmarts('[#0]=NO[CH3]')
+co = Chem.MolFromSmiles('C=O')
+
+
+def is_derivatized(mol: Optional[Chem.Mol] = None, smiles: Optional[str] = None) -> bool:
+    """
+    Return whether a molecule is derivatized by searching for MeOX and TMS substructures within that molecule.
+
+    :param mol: RDKit molecule object; used in preference to smiles when given
+    :param smiles: SMILES string, parsed only when mol is None
+
+    :return: True if derivatized, False otherwise
+    """
+    if mol is None:
+        mol = Chem.MolFromSmiles(smiles)
+    mol = Chem.AddHs(mol)
+    return (mol.HasSubstructMatch(tms_match) or
+            mol.HasSubstructMatch(meox_match_co) or
+            mol.HasSubstructMatch(meox_match_cho))
+
+
+def remove_derivatization_groups(mol: Optional[Chem.Mol] = None, smiles: Optional[str] = None) -> Chem.Mol:
+    """
+    If a molecule is derivatized, remove derivatization substructures and return the original underivatized molecule.
+
+    :param mol: RDKit molecule object; used in preference to smiles when given (it is copied, not modified)
+    :param smiles: SMILES string, parsed only when mol is None
+
+    :return: sanitized RDKit molecule object in underivatized (original) form
+    """
+    if mol is None:
+        em = Chem.MolFromSmiles(smiles)
+    else:
+        em = deepcopy(mol)
+
+    matches = em.GetSubstructMatches(tms_match)
+    for ma in matches:
+        em.GetAtomWithIdx(ma[2]).SetAtomicNum(0)  # ma[2] is the Si atom; mark it as dummy atom #0
+
+    em = AllChem.DeleteSubstructs(em, tms_match0)
+
+    matches = em.GetSubstructMatches(meox_match_co)
+    for ma in matches:
+        em.GetAtomWithIdx(ma[0]).SetAtomicNum(0)  # ma[0] is the oxime carbon (ketone-derived)
+    matches = em.GetSubstructMatches(meox_match_cho)
+    for ma in matches:
+        em.GetAtomWithIdx(ma[0]).SetAtomicNum(0)  # ma[0] is the oxime carbon (aldehyde-derived)
+
+    em = AllChem.ReplaceSubstructs(em, meox_match0, co, replaceAll=True)[0]
+    Chem.SanitizeMol(em)
+    return em
+
+
+# (match pattern, dummy atomic numbers, cumulative probability thresholds checked in order against one random draw)
+_subs = [
+    ('[OH]', [100], [.95]),
+    ('[SH]', [101], [.80]),
+    # matches also imine
+    ('[NH]', [102], [.50]),
+    ('[NH2]', [103, 102], [.25, .5]),
+    ('C([C,c])([C,c])=O', [104], [.90]),
+    ('[CH]=O', [104], [.90]),
+]
+
+# (SMARTS matching the dummy atom, SMILES of the replacement group)
+_repls = [
+    ('[#100]', 'O' + tms),
+    ('[#101]', 'S' + tms),
+    ('[#102]', 'N' + tms),
+    ('[#103]', f'N({tms}){tms}'),
+    ('[#104]=O', 'C=NO[CH3]'),
+]
+
+subs = [(Chem.MolFromSmarts(pat), repls, probs) for pat, repls, probs in _subs]
+repls = [(Chem.MolFromSmarts(pat), Chem.MolFromSmiles(repl)) for pat, repl in _repls]
+
+
+def add_derivatization_groups(mol: Optional[Chem.Mol] = None, smiles: Optional[str] = None) -> Chem.Mol:
+    """
+    Add derivatization substructures to a molecule and return its derivatized form. This function is not deterministic
+    and will return a random derivatized form of the molecule. Run multiple times to get all possible derivatized forms.
+
+    :param mol: RDKit molecule object; used in preference to smiles when given (it is copied, not modified)
+    :param smiles: SMILES string, parsed only when mol is None
+
+    :return: sanitized RDKit molecule object in a derivatized form
+    """
+    if mol is None:
+        mol = Chem.MolFromSmiles(smiles)
+
+    em = deepcopy(mol)
+
+    for pat, reps, probs in subs:
+        matches = em.GetSubstructMatches(pat)
+        for m in matches:
+            r = random.random()  # one draw per match; the first threshold above r decides the dummy atom
+            for repl, prob in zip(reps, probs):
+                if r < prob:
+                    em.GetAtomWithIdx(m[0]).SetAtomicNum(repl)
+                    break
+
+    for pat, repl in repls:
+        em = AllChem.ReplaceSubstructs(em, pat, repl, replaceAll=True)[0]
+
+    Chem.SanitizeMol(em)
+    return em
+
+
+def process_one_mol(mol: Tuple[str, Chem.Mol], repeats: int):
+    """
+    Return derivatized and underivatized forms of one molecule. Since underlying function is not deterministic, this
+    function may or may not return all possible derivatized forms of the molecule depending on the number of repeats.
+
+    :param mol: tuple of (original SMILES string, RDKit molecule object of that molecule)
+    :param repeats: number of repeats to simulate molecule derivatization
+
+    :return: tuple of (input SMILES, SMILES of the underivatized form, set of SMILES of derivatized forms)
+    """
+    return (
+        mol[0],
+        Chem.MolToSmiles(remove_derivatization_groups(mol[1]), kekuleSmiles=True),
+        {Chem.MolToSmiles(add_derivatization_groups(mol[1]), kekuleSmiles=True) for _ in range(repeats)}
+    )
diff --git a/gc_meox_tms/utils.py b/gc_meox_tms/utils.py new file mode 100644 index 0000000..91412e9 --- /dev/null +++ b/gc_meox_tms/utils.py @@ -0,0 +1,47 @@ +import fileinput +from os import PathLike +from typing import List, Tuple + +from rdkit.Chem import Mol, MolFromSmiles + + +def read_input_txt(infiles: PathLike) -> List[Tuple[str, Mol]]: + """ + Read input from txt files with SMILES. + + :param infiles: Path to input file(s) with SMILES. One SMILES per line. 
+ + :return: List of tuples (molecule string from the input file, RDKit molecule object of that molecule) + """ + return [(line.rstrip(), MolFromSmiles(line)) for line in fileinput.input(files=infiles)]
+
+
+def write_tab_separated(tsv_path: PathLike, data) -> None:
+    """
+    Write a header line followed by one tab-separated line per processed molecule.
+
+    :param tsv_path: Path to output file.
+    :param data: Iterable of tuples (original SMILES, underivatized SMILES, set of derivatized SMILES)
+    """
+    with open(tsv_path, "w") as tsv:
+        tsv.write("orig\tderiv. removed\tderiv. added ...\n")
+        for orig, removed, added in data:
+            tsv.write("\t".join([orig, removed, *added]) + "\n")
+
+
+def write_flat(txt_path: PathLike, data, keep: bool = False) -> None:
+    """
+    Write output to a txt file with one SMILES per line; an empty set of derivatives adds no blank line.
+
+    :param txt_path: Path to output file.
+    :param data: Iterable of tuples (original SMILES, underivatized SMILES, set of derivatized SMILES)
+    :param keep: Whether to write the original and underivatized SMILES to the output.
+    """
+    with open(txt_path, "w") as flat:
+        if keep:
+            for orig, removed, added in data:
+                for one in {orig, removed, *added}:
+                    flat.write(one + "\n")
+        else:
+            for orig, removed, added in data:
+                flat.writelines(one + "\n" for one in added)
diff --git a/pyproject.toml b/pyproject.toml index 374b58c..2e0ead7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,42 @@ -[build-system] -requires = [ - "setuptools>=42", - "wheel" -] -build-backend = "setuptools.build_meta" +[build-system] +requires = ["setuptools>=61.2", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "gc-meox-tms" +version = "1.0.1" +authors = [ + {name = "Ales Krenek", email = "ljocha@ics.muni.cz"}, + {name = "Maksym Skoryk", email = "maksym.skoryk@recetox.muni.cz"}, +] +maintainers = [{name = "RECETOX", email = "GalaxyToolsDevelopmentandDeployment@space.muni.cz"}] +description = "In-silico MeOX/TMS derivatization of chemical compounds" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI 
Approved :: MIT License", + "Operating System :: OS Independent", +] +urls = {Homepage = "https://github.com/RECETOX/gc-meox-tms"} +requires-python = ">=3.8" +dependencies = ["rdkit"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.optional-dependencies] +dev = [ + "flake8", + "mock", + "pytest", + "pytest-cov", + "pytest-rerunfailures", +] +eda = [ + "jupyter", + "py3Dmol", +] + +[tool.setuptools] +packages = ["gc_meox_tms"] +include-package-data = false diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..3225598 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +flake8 +isort +mock +pytest +pytest-cov +pytest-rerunfailures diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2924aa1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +rdkit diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 570a139..0000000 --- a/setup.cfg +++ /dev/null @@ -1,27 +0,0 @@ -[metadata] -name = gc-meox-tms -version = 0.0.1 -author = Ales Krenek -author_email = ljocha@ics.muni.cz -description = In-sillico derivatization of compounds to be identified in gass chromatography -long_description = file: README-package.md -long_description_content_type = text/markdown -url = https://github.com/ljocha/gc-derivatization -project_urls = -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: MIT License - Operating System :: OS Independent - -[options] -package_dir = - = src -packages = find: -python_requires = >=3.6 -scripts = - bin/gc-meox-tms.py -install_requires = - rdkit-pypi - -[options.packages.find] -where = src diff --git a/src/gc_meox_tms/__init__.py b/src/gc_meox_tms/__init__.py deleted file mode 100644 index 6f0f1b5..0000000 --- a/src/gc_meox_tms/__init__.py +++ /dev/null @@ -1,98 +0,0 @@ -from rdkit import Chem -from rdkit.Chem import AllChem -from copy import deepcopy -import random - -tms = '[Si]([CH3])([CH3])[CH3]' - -# XXX: 
~[O,N,S] would match more than we aim to (-O, -S, -N, =N) but it's unlikely to happen -tms_match = Chem.MolFromSmarts('*~[O,N,S]' + tms) -tms_match0 = Chem.MolFromSmarts('[#0]([CH3])([CH3])[CH3]') - -meox_match_co = Chem.MolFromSmarts('C([C,c])([C,c])=NO[CH3]') -meox_match_cho = Chem.MolFromSmarts('[CH]([C,c])=NO[CH3]') -meox_match0 = Chem.MolFromSmarts('[#0]=NO[CH3]') -co = Chem.MolFromSmiles('C=O') - -def is_derivatized(mol=None,smiles=None): - if mol is None: - mol = Chem.MolFromSmiles(smiles) - mol = Chem.AddHs(mol) - return mol.HasSubstructMatch(tms_match) or mol.HasSubstructMatch(meox_match_co) or mol.HasSubstructMatch(meox_match_cho) - -def remove_derivatization_groups(mol=None,smiles=None): - if mol is None: - em = Chem.MolFromSmiles(smiles) - else: - em = deepcopy(mol) - - matches = em.GetSubstructMatches(tms_match) - for ma in matches: - em.GetAtomWithIdx(ma[2]).SetAtomicNum(0) - - em = AllChem.DeleteSubstructs(em,tms_match0) - - matches = em.GetSubstructMatches(meox_match_co) - for ma in matches: - em.GetAtomWithIdx(ma[0]).SetAtomicNum(0) - matches = em.GetSubstructMatches(meox_match_cho) - for ma in matches: - em.GetAtomWithIdx(ma[0]).SetAtomicNum(0) - - em, = AllChem.ReplaceSubstructs(em,meox_match0,co,replaceAll=True) - Chem.SanitizeMol(em) - return em - -# (match pattern, dummy atom #, probability) -_subs = [ - ('[OH]', [100], [.95]), - ('[SH]', [101], [.80]), -# matches also imine - ('[NH]', [102], [.50]), - ('[NH2]', [103,102], [.25, .5]), - ('C([C,c])([C,c])=O', [104], [.90]), - ('[CH]=O', [104], [.90]), -] - -# (dummy atom #, replacement) -_repls = [ - ('[#100]', 'O'+tms), - ('[#101]', 'S'+tms), - ('[#102]', 'N'+tms), - ('[#103]', f'N({tms}){tms}'), - ('[#104]=O', 'C=NO[CH3]'), -] - -#repls = list(zip( -# map(lambda n: Chem.MolFromSmarts(f'[#{n}]'),_repls), -# map(Chem.MolFromSmiles,_repls.values()) -#)) - -subs = [ (Chem.MolFromSmarts(pat), repls, probs) for pat,repls,probs in _subs] -repls = [ (Chem.MolFromSmarts(pat), Chem.MolFromSmiles(repl)) 
for pat,repl in _repls] - - - -def add_derivatization_groups(mol=None,smiles=None): - if mol is None: - mol = Chem.MolFromSmiles(smiles) - - em = deepcopy(mol) - - for pat,reps,probs in subs: - matches = em.GetSubstructMatches(pat) -# print(matches) - for m in matches: - r = random.random() - for repl,prob in zip(reps,probs): - if r < prob: - em.GetAtomWithIdx(m[0]).SetAtomicNum(repl) - break - - for pat,repl in repls: -# print(Chem.MolToSmiles(pat),Chem.MolToSmiles(repl),Chem.MolToSmiles(em)) - em, = AllChem.ReplaceSubstructs(em,pat,repl,replaceAll=True) - - Chem.SanitizeMol(em) - return em - diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c3244a6 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="module") +def test_dir(request): + """Return the directory of the currently running test script.""" + return Path(request.fspath).parent diff --git a/tests/data/acidic_protons.txt b/tests/data/acidic_protons.txt new file mode 100644 index 0000000..17d1453 --- /dev/null +++ b/tests/data/acidic_protons.txt @@ -0,0 +1,2 @@ +CC(=O)O +C(C(C(=O)O)N)S diff --git a/tests/data/alcohols.txt b/tests/data/alcohols.txt new file mode 100644 index 0000000..5a2e4f3 --- /dev/null +++ b/tests/data/alcohols.txt @@ -0,0 +1,2 @@ +CCO +CO \ No newline at end of file diff --git a/tests/data/aldehydes.txt b/tests/data/aldehydes.txt new file mode 100644 index 0000000..c87890b --- /dev/null +++ b/tests/data/aldehydes.txt @@ -0,0 +1,3 @@ +CC=O +COC1=C(C=CC(=C1)C=O)O +C(C1C(C(C(C(O1)O)O)O)O)O diff --git a/tests/data/ketones.txt b/tests/data/ketones.txt new file mode 100644 index 0000000..3332c72 --- /dev/null +++ b/tests/data/ketones.txt @@ -0,0 +1,5 @@ +CC(=O)C +CCC(=O)C +CC(=O)CC(=O)C +CC1CCCCCCCCCCCCC(=O)C1 +C1CCC(=O)CC1 diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 
index 0000000..1116eed --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,85 @@ +from inspect import signature +from os.path import exists, join +from random import sample + +import pytest +from mock import ANY, patch + +from gc_meox_tms.__main__ import main + + +@pytest.fixture +def input_data_path(test_dir): + """Return the directory of the currently running test script as string type.""" + return join(test_dir, 'data/aldehydes.txt') + + +@pytest.fixture(params=[['-f'], ['-t'], ['-f', '-t']]) +def output_params(request, tmp_path): + """Return a list of output parameters.""" + args = [] + flat_path, tsv_path = None, None + for flag in request.param: + if flag == '-f': + args.append(flag) + args.append(flat_path := join(tmp_path, 'flat.txt')) + elif flag == '-t': + args.append(flag) + args.append(tsv_path := join(tmp_path, 'tsv.txt')) + yield args, flat_path, tsv_path + + +def test_cli_finishes(input_data_path): + """Test if the main function works.""" + args = [input_data_path] + exit_code = main(args) + + assert exit_code == 0 + + +def test_cli_writes_files(input_data_path, output_params, tmp_path): + args = output_params[0] + args.append(input_data_path) + + flat_path = output_params[1] + tsv_path = output_params[2] + exit_code = main(args) + + assert exit_code == 0 + assert flat_path is None or exists(flat_path) + assert tsv_path is None or exists(tsv_path) + + +@pytest.mark.parametrize('keep', [True, False]) +@patch('gc_meox_tms.__main__.write_flat') +def test_keep_flag(mock, input_data_path, tmp_path, keep): + """Test if the main function works with -k flag.""" + flat_path = join(tmp_path, 'flat.txt') + args = [input_data_path, '-f', flat_path] + if keep: + args.append('-k') + + main(args) + + mock.assert_called_with(flat_path, ANY, keep) + + +@pytest.mark.parametrize('num_workers', sample(range(1, 10), 3)) +@patch('gc_meox_tms.__main__.ProcessPoolExecutor') +def test_ncpu_flag(mock, input_data_path, num_workers): + """Test if the main function works with -n 
flag.""" + args = [input_data_path, '-n', str(num_workers)] + main(args) + + mock.assert_called_with(max_workers=num_workers) + + +@pytest.mark.parametrize('repeats', sample(range(1, 50), 3)) +@patch('gc_meox_tms.__main__.ProcessPoolExecutor.map') +def test_repeats_flag(mock, input_data_path, repeats): + """Test if the main function works with -r flag.""" + args = [input_data_path, '-r', str(repeats)] + main(args) + + called_with_repeats = signature(mock.call_args[0][0]).parameters["repeats"].default + assert called_with_repeats == repeats diff --git a/tests/test_derivatization.py b/tests/test_derivatization.py new file mode 100644 index 0000000..85f250d --- /dev/null +++ b/tests/test_derivatization.py @@ -0,0 +1,135 @@ +import random + +import pytest +from rdkit import Chem + +from gc_meox_tms import (add_derivatization_groups, is_derivatized, + process_one_mol, remove_derivatization_groups) + + +@pytest.fixture(params=[ + ("CC(=O)N([Si](C)(C)C)[Si](C)(C)C", True), + ("C[Si](C)(C)OC1=CC=CC=C1", True), + ("C[Si](C)(C)OC1=CC=C(C=C1)O[Si](C)(C)C", True), + ("C[Si](C)(C)C1=CC=C(C=C1)[Si](C)(C)C", False), + ("CCO[Si](C)(C)C", True), + ("CC(=O)O[Si](C)(C)C", True), + ("CC(=O)O", False), + ("CCCS[Si](C)(C)C", True), + ("CCCS", False), + ("CCC(=NOC)C", True), + ("CC=NOC", True), + ("CCCC(=O)N", False), + ("CCCC(=O)NCC", False), + ("CC(=O)NOC", False), + ("CCC(O)C", False), + ("CCCC#N", False), + ("C[N+]#[C-]", False) +]) +def is_derivatized_data(request): + """Return a tuple of (smiles, boolean indicating if the molecule + is MeOX or TMS derivatized).""" + smiles, _is_derivatized = request.param + return smiles, _is_derivatized + + +@pytest.fixture(params=[ + ("CC(=O)N([Si](C)(C)C)[Si](C)(C)C", "CC(=O)N[Si](C)(C)C", "CC(N)=O"), + ("C[Si](C)(C)OC1=CC=CC=C1", None, "OC1=CC=CC=C1"), + ("C[Si](C)(C)OC1=CC=C(O[Si](C)(C)C)C=C1", "C[Si](C)(C)OC1=CC=C(O)C=C1", + "OC1=CC=C(O)C=C1"), + ("CCO[Si](C)(C)C", None, "CCO"), + ("CC(=O)O[Si](C)(C)C", None, "CC(=O)O"), + ("CCCS[Si](C)(C)C", 
None, "CCCS"), + ("CCC(C)=NOC", None, "CCC(C)=O"), + ("CC=NOC", None, "CC=O") +]) +def derivatization_groups_data(request): + """Return a tuple of (smiles of a derivatized molecule, smiles of this + molecule with different degree of conversion, smiles of the original + non-derivatized molecule).""" + derivatized, alternative, original = request.param + return derivatized, alternative, original + + +def test_is_derivatized_from_smiles(is_derivatized_data): + """Test if the is_derivatized function works with SMILES.""" + smiles, expected = is_derivatized_data + actual = is_derivatized(smiles=smiles) + + assert actual == expected + + +def test_is_derivatized_from_mol(is_derivatized_data): + """Test if the is_derivatized function works with RDKit molecules.""" + smiles, expected = is_derivatized_data + mol = Chem.MolFromSmiles(smiles) + actual = is_derivatized(mol=mol) + + assert actual == expected + + +def test_remove_derivatization_groups_from_smiles(derivatization_groups_data): + """Test if the remove_derivatization_groups function works with SMILES.""" + smiles, _, expected = derivatization_groups_data + actual = remove_derivatization_groups(smiles=smiles) + actual_smiles = Chem.MolToSmiles(actual, kekuleSmiles=True) + + assert actual_smiles == expected + + +def test_remove_derivatization_groups_from_mol(derivatization_groups_data): + """Test if the remove_derivatization_groups function works with RDKit + molecules.""" + smiles, _, expected = derivatization_groups_data + mol = Chem.MolFromSmiles(smiles) + actual = remove_derivatization_groups(mol=mol) + actual_smiles = Chem.MolToSmiles(actual, kekuleSmiles=True) + + assert actual_smiles == expected + + +def test_add_derivatization_groups_from_smiles(derivatization_groups_data): + """Test if the add_derivatization_groups function works with SMILES. 
The + test will run FLAKY_RERUNS times or until success due to + non-deterministic nature of add_derivatization_groups.""" + random.seed(3) + expected, alternative, original = derivatization_groups_data + derivatized = add_derivatization_groups(smiles=original) + derivatized_smiles = Chem.MolToSmiles(derivatized, kekuleSmiles=True) + + assert derivatized_smiles in [expected, alternative] + + +def test_add_derivatization_groups_from_mol(derivatization_groups_data): + """Test if the add_derivatization_groups function works with RDKit + molecules. The test will run FLAKY_RERUNS times or until success + due to non-deterministic nature of add_derivatization_groups.""" + random.seed(3) + expected, alternative, original = derivatization_groups_data + mol = Chem.MolFromSmiles(original) + derivatized = add_derivatization_groups(mol=mol) + derivatized_smiles = Chem.MolToSmiles(derivatized, kekuleSmiles=True) + + assert derivatized_smiles in [expected, alternative] + + +@pytest.mark.parametrize("smiles, expected", [ + ("CC(N)=O", {"CC(N)=O", + "CC(=O)N([Si](C)(C)C)[Si](C)(C)C", + "CC(=O)N[Si](C)(C)C"}), + ("C[Si](C)(C)OC1=CC=C(O)C=C1", {"OC1=CC=C(O)C=C1", + "C[Si](C)(C)OC1=CC=C(O[Si](C)(C)C)C=C1", + "C[Si](C)(C)OC1=CC=C(O)C=C1"}), + ("CCC(C)=O", {"CCC(C)=O", "CCC(C)=NOC"}), + ("CC=NOC", {"CC=O", "CC=NOC"}) +]) +def test_process_one_mol(smiles, expected): + """Test processing one molecule.""" + mol = (smiles, Chem.MolFromSmiles(smiles)) + n = 40 + random.seed(3) + actual = process_one_mol(mol, n) + actual = {actual[0], actual[1], *actual[2]} + + assert actual == expected diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..d251785 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,86 @@ +from concurrent.futures import ProcessPoolExecutor +from functools import partial + +import pytest +from rdkit.Chem import Mol, MolFromSmiles + +from gc_meox_tms import process_one_mol +from gc_meox_tms.utils import read_input_txt, write_flat, 
write_tab_separated + + +@pytest.fixture +def data(): + molecules = [(smiles, MolFromSmiles(smiles)) for smiles in [ + "CCC(=NOC)C", "CCC=NOC", "C=NOC", "CC(=O)N([Si](C)(C)C)[Si](C)(C)C"]] + + process_one_mol_with_repeats = partial(process_one_mol, repeats=1) + with ProcessPoolExecutor(max_workers=2) as executor: + data = executor.map(process_one_mol_with_repeats, molecules) + + yield data + + +@pytest.mark.parametrize("path, smiles", [ + ("data/acidic_protons.txt", ["CC(=O)O", "C(C(C(=O)O)N)S"]), + ("data/alcohols.txt", ["CCO", "CO"]), + ("data/ketones.txt", ["CC(=O)C", "CCC(=O)C", "CC(=O)CC(=O)C", "CC1CCCCCCCCCCCCC(=O)C1", "C1CCC(=O)CC1"]) +]) +def test_reading_input_from_txt(path, test_dir, smiles): + """Test reading input from txt files.""" + molecules = read_input_txt(test_dir / path) + actual_smiles = [mol[0] for mol in molecules] + rdkit_molecules = [mol[1] for mol in molecules] + + assert len(molecules) == len(smiles) + assert actual_smiles == smiles + assert all(isinstance(mol, Mol) for mol in rdkit_molecules) + + +def test_writing_flat_output(data, tmp_path): + """Test writing flat output.""" + flat_path = tmp_path / "flat.txt" + write_flat(flat_path, data, True) + + assert flat_path.exists() + + +def test_writing_flat_content(data, tmp_path): + """Test writing flat output content.""" + flat_path = tmp_path / "flat.txt" + write_flat(flat_path, data, True) + + with open(flat_path, "r") as f: + lines = f.readlines() + + assert len(lines) == 8 + + +def test_writing_flat_content_without_keep(data, tmp_path): + """Test writing flat output content without keep.""" + flat_path = tmp_path / "flat.txt" + write_flat(flat_path, data, False) + + with open(flat_path, "r") as f: + lines = f.readlines() + + assert len(lines) == 4 + + +def test_writing_tsv_output(data, tmp_path): + """Test writing tsv output.""" + tsv_path = tmp_path / "tsv.txt" + write_tab_separated(tsv_path, data) + + assert tsv_path.exists() + + +def test_writing_tsv_content(data, tmp_path): + """Test 
writing tsv output content.""" + tsv_path = tmp_path / "tsv.txt" + write_tab_separated(tsv_path, data) + + with open(tsv_path, "r") as f: + lines = f.readlines() + + assert len(lines) == 5 + assert lines[0] == "orig\tderiv. removed\tderiv. added ...\n"