diff --git a/.github/workflows/anaconda.yaml b/.github/workflows/anaconda.yaml
new file mode 100644
index 0000000..f5d09ae
--- /dev/null
+++ b/.github/workflows/anaconda.yaml
@@ -0,0 +1,54 @@
+name: Anaconda Build
+
+on: ['push', 'pull_request', 'workflow_dispatch']
+
+jobs:
+  anaconda_build:
+    name: Anaconda Build
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Create build environment
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          activate-environment: gc-meox-tms-build
+          auto-update-conda: true
+          environment-file: conda/environment-build.yaml
+          python-version: '3.8'  # quoted so YAML keeps the version a string
+      - name: Show conda config
+        shell: bash -l {0}
+        run: |
+          conda info
+          conda list
+          conda config --show-sources
+          conda config --show
+          conda env list
+      - name: Python info
+        shell: bash -l {0}
+        run: |
+          which python
+          python --version
+      - name: Show environment variables
+        shell: bash -l {0}
+        run: |
+          env | sort
+      - name: Build the conda package
+        shell: bash -l {0}
+        run: |
+          export BUILDDIR=$RUNNER_TEMP/gc-meox-tms/_build
+          if [ "$RUNNER_OS" = "Windows" ]; then export BUILDDIR=$RUNNER_TEMP\\gc-meox-tms\\_build; fi  # RUNNER_OS is the variable the runner sets
+          conda config --set anaconda_upload no
+          conda build --numpy 1.18.1 --no-include-recipe \
+            --channel bioconda --channel conda-forge \
+            --croot ${BUILDDIR} \
+            ./conda
+      - name: Upload package artifact from build
+        uses: actions/upload-artifact@v4  # v2 of this action has been retired
+        with:
+          name: conda-package-artifact
+          path: ${{ runner.temp }}/gc-meox-tms/_build
+          retention-days: 1
diff --git a/.github/workflows/project_automation.yml b/.github/workflows/project_automation.yml
new file mode 100644
index 0000000..93766fb
--- /dev/null
+++ b/.github/workflows/project_automation.yml
@@ -0,0 +1,11 @@
+name: Add issues to project
+
+on:
+  issues:
+    types:
+      - opened
+
+jobs:
+  call-workflow:
+    uses: recetox/galaxytools/.github/workflows/add_issue_to_project.yml@master
+    secrets: inherit
diff --git a/.github/workflows/publish_pypi.yaml
b/.github/workflows/publish_pypi.yaml new file mode 100644 index 0000000..bd77da4 --- /dev/null +++ b/.github/workflows/publish_pypi.yaml @@ -0,0 +1,26 @@ +name: Publish to PyPI + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + python -m build --sdist --wheel + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1.8 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/python-package-conda.yaml b/.github/workflows/python-package-conda.yaml new file mode 100644 index 0000000..fa27e35 --- /dev/null +++ b/.github/workflows/python-package-conda.yaml @@ -0,0 +1,32 @@ +name: Python Package using Conda + +on: ['push', 'pull_request', 'workflow_dispatch'] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Add conda to PATH + run: | + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + conda update conda + conda env update --file conda/environment-dev.yaml --name base + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml diff --git a/.github/workflows/python-package.yaml b/.github/workflows/python-package.yaml new file mode 100644 index 0000000..f28e3da --- /dev/null +++ b/.github/workflows/python-package.yaml @@ -0,0 +1,68 @@ +name: Python Package + +on: ['push', 'pull_request', 'workflow_dispatch'] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-dev.txt + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Test with pytest
+        run: |
+          pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml
+      - uses: actions/upload-artifact@v4  # v3 is retired; keep in step with download-artifact below
+        if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10'
+        with:
+          name: coverage-report
+          path: |
+            coverage.xml
+            xunit-result.xml
+
+  sonar-cloud:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Pull coverage report
+        uses: actions/download-artifact@v4  # must share a major version with the upload step
+        with:
+          name: coverage-report
+          path: ${{ github.workspace }}
+      - name: Adjust coverage source path
+        run: sed -i "s+$PWD/++g" coverage.xml
+      - name: SonarCloud Scan
+        if: env.SONAR_TOKEN != null
+        uses: sonarsource/sonarcloud-github-action@master
+        with:
+          args: >
+            -Dsonar.projectKey=RECETOX_gc-meox-tms
+            -Dsonar.organization=recetox
+            -Dsonar.host.url=https://sonarcloud.io
+            -Dsonar.python.version="3.10"
+            -Dsonar.sources=gc_meox_tms/
+            -Dsonar.tests=tests/
+            -Dsonar.python.coverage.reportPaths=coverage.xml
+            -Dsonar.python.xunit.reportPath=xunit-result.xml
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..027ec31
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,28 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+ +## [1.0.1] + +### Added +* documented functions with docstrings to make use of Python's built-in `help()` function [#23](https://github.com/RECETOX/gc-meox-tms/pull/23) +* added developer documentation [#23](https://github.com/RECETOX/gc-meox-tms/pull/23) +* added `Publish to PyPi` GitHub Actions workflow [#24](https://github.com/RECETOX/gc-meox-tms/pull/24) + +## [1.0.0] + +### Added +* added **Anaconda build**, **Python Package with pip** (inc. **SonarCloud**), and **Python Package with Conda** GH Actions. +* added test coverage for main and IO functionality +* added conda dev environment and conda meta.yaml recipe +* added CHANGELOG.md + +### Changed +* changed package structure +* divided main functionality, IO handling, and CLI into designated modules + +### Fixed +* fixed not working examples in IPython notebook diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index cdd1af7..0000000 --- a/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM ubuntu:20.04 - -USER root -ENV DEBIAN_FRONTEND=noninteractive -ENV TZ=Europe/Prague - -RUN apt update -RUN apt install -y python3-rdkit -RUN apt install -y python3-notebook -RUN apt install -y python3-pil - -RUN apt install -y python3-pip -RUN pip3 install py3dmol - -RUN apt install -y git -RUN mkdir -p /usr/local/share/jupyter/nbextensions && cd /usr/local/share/jupyter/nbextensions && git clone https://github.com/lambdalisue/jupyter-vim-binding vim_binding && jupyter nbextension enable vim_binding/vim_binding --sys-prefix - -COPY dist/*.tar.gz /tmp -RUN pip3 install /tmp/gc-meox-tms*tar.gz - -ENV HOME=/work diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..32ad627 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 RECETOX + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile deleted file mode 100644 index a4925f2..0000000 --- a/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -image=ljocha/gc-derivatization -port=9000 - -package-build: - python3 -m build - -docker-build: - docker build -t ${image} . - -docker-run: - docker run -p ${port}:${port} -u $(shell id -u) -w /work -v ${PWD}:/work ${image} jupyter notebook --ip 0.0.0.0 --port ${port} - -docker-bash: - docker run -ti -p ${port}:${port} -u $(shell id -u) -w /work -v ${PWD}:/work ${image} bash - - - diff --git a/README-package.md b/README-package.md deleted file mode 100644 index 13cf17a..0000000 --- a/README-package.md +++ /dev/null @@ -1,22 +0,0 @@ -# In silico derivatization - -Package to perform in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010): - -* Metoxymation: ketone R(C=O)R' and aldehyde karboxyl groups are substituted with C=NO[CH3] -* Trimethylsilylation: in -OH, -SH, -NH2, -NHR, =NH, the hydrogen is substituted with -SiMe3 - -The substitution needn't happen always, their probability currently hardcoded in the package. 
-Typically, multiple substitution attempts are run on each input molecule, and all distinct results are gathered. - -Known limitation is metoxymation on cycles which should be broken. This is not implemented yet. - -Package provides functions: -* `is_derivatized()` checks whether the molecule contains MeOX or TMS groups that are likely to be result of derivatization -* `remove_derivatization_groups()` removes the suspected groups, reconstructing the original molecule -* `add_derivatization_groups()` does the substitution above - -All the functions can accept either `mol: rdkit.Chem.rdchem.Mol` or `smiles: str` argument. All return `rdkit.Chem.rdchem.Mol`. - -The typical useage is wrapped in the `gc-meox-tms.py` driver script. - -See also https://github.com/ljocha/gc-derivatization for example use in Jupyter notebook. diff --git a/README.md b/README.md index f614506..b089a74 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,137 @@ # In silico derivatization -The main functionality is provided as a python package, see [README-package.md](README-package.md) +## Overview -The example Jupyter notebook reads a list of SMILES (text file, one molecule per line), and performs the derivatisation, also inspecting its results. +This package performs in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010): -The final outputs are two files: +* Methoximation: ketone R(C=O)R' and aldehyde (-HC=O) carbonyl groups +are substituted with -C=NOCH3 +* Trimethylsilylation: the acidic hydrogen in -OH, -SH, -COOH, -NH2, -NHR, =NH is substituted with -Si(CH3)3 +The substitution doesn't always have to happen; its probability is currently hardcoded in the package. +Typically, multiple substitution attempts are run on each input molecule, and all distinct results are gathered. 
-* `derivs_struct.tsv` with columns (all SMILES): - * original - * with derivatization groups stripped - * column #2 derivatized (multiple times) according to the above rules -* `derivs_flat.txt` -- the above with all the smiles flattened, one per line +Known limitation is methoximation on cycles which should be broken. This is not implemented yet. +## Installation + +There are a few ways to install `gc-meox-tms`: + +1. Install in a new `conda` environment (recommended): +```shell +$ conda create -n gc-meox-tms -c bioconda gc-meox-tms +$ conda activate gc-meox-tms +``` + +2. Install from `pip`: +```shell +$ pip install gc-meox-tms +``` + +3. From source by cloning the repository and installing the package with `pip` as follows: +```shell +$ git clone https://github.com/RECETOX/gc-meox-tms.git + +# install the package: +$ python -m pip install ./gc-meox-tms + +# if you want to run examples in the Jupyter notebook, install with this command: +$ python -m pip install "./gc-meox-tms[eda]" +``` + +## Usage + +### Command-Line Tool + +`gc-meox-tms` can be used as a command line tool to produce all MeOX/TMS derivatives of given compounds. To use it via +the command line you will need one or more `txt` files with chemical compounds represented as SMILES +(one SMILES per line). The tool can output results in flat `txt` format (one compound per line) or tab-separated `tsv` +format (all derivatives of a given molecule per line). +```shell +$ python -m gc_meox_tms \ +-f <flat_output.txt> \ +-t <tsv_output.tsv> \ +<input_files> +``` +More parameters can be specified, such as number of cores or repeats. 
For more information run: +```shell +$ python -m gc_meox_tms --help +``` + +### Python Package + +Package provides functions: +* `is_derivatized()` checks whether the molecule contains MeOX or TMS groups that are likely to be the result of derivatization +* `remove_derivatization_groups()` removes the suspected groups, reconstructing the original molecule +* `add_derivatization_groups()` does the substitution above + +```python3 +from gc_meox_tms import add_derivatization_groups, is_derivatized, remove_derivatization_groups +from rdkit.Chem import MolToSmiles + +# Example compounds in SMILES format +compounds = ["CC=O", "CC=NOC", "CCO[Si](C)(C)C"] + +# Check derivatization +[is_derivatized(smiles=smiles) for smiles in compounds] +>>> [False, True, True] + +# Remove derivatization groups from derivatized molecules +underivatized = [remove_derivatization_groups(smiles=smiles) for smiles in compounds[1:]] +print([MolToSmiles(mol) for mol in underivatized]) +>>> ['CC=O', 'CCO'] + +# Convert molecules back to derivatized forms +rederivatized = [add_derivatization_groups(mol=mol) for mol in underivatized] +print([MolToSmiles(mol) for mol in rederivatized]) +>>> ['CC=NOC', 'CCO[Si](C)(C)C'] +``` +Note that your results may differ from those presented, since `add_derivatization_groups` is not deterministic. If you rerun +the function enough times you will get all possible derivatizations. The number of reruns to obtain all possible derivatives +is individual for each compound (depends on possible conversion degrees etc.). + +See also the Jupyter notebook in the `example/` directory for more examples. + +## Developer documentation + +### Installation +Create a virtual environment of your choice (e.g., conda or venv). +The development version can be installed with conda or pip as follows: +```shell +# 1. Fork and clone the repository +$ git clone https://github.com/<your-username>/gc-meox-tms.git +$ cd gc-meox-tms + +# 2a. 
To create a conda env run from the package directory: +$ conda env create -f conda/environment-dev.yaml +$ conda activate gc-meox-tms-dev + +# 2b. Alternatively, install using python venv: +$ python3 -m venv gc-meox-tms-dev +$ source gc-meox-tms-dev/bin/activate +$ pip install -e .[dev] +``` + +### Contributing +Before opening a PR make sure all the tests are passing by running `pytest` from within the package directory: +```shell +$ pytest +``` +It may happen that some tests which are dependent on probabilistic logic may fail. If that occurs, +try rerunning the tests. Usually one rerun is enough. + +We strongly advise you to add new tests for the functionality that you want to contribute. If you want to check whether +your changes are covered with tests, run `$ pytest --cov` and examine the output to see what parts may need better test coverage. + +Run linter, to make sure all is nicely formatted: +```shell +$ flake8 + +# if you use venv, exclude venv directory from linting +$ flake8 --exclude 'gc-meox-tms-dev' +``` + +Lastly make sure the Python imports are in a proper order: +```shell +$ isort gc_meox_tms +``` diff --git a/bin/gc-meox-tms.py b/bin/gc-meox-tms.py deleted file mode 100755 index 2f314f0..0000000 --- a/bin/gc-meox-tms.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import fileinput - -from gc_meox_tms import remove_derivatization_groups,add_derivatization_groups -from concurrent.futures import ProcessPoolExecutor -from rdkit import Chem - -def process_one_mol(n_mol): - mol,n = n_mol - return ( - mol[0], - Chem.MolToSmiles(remove_derivatization_groups(mol[1])), - { Chem.MolToSmiles(add_derivatization_groups(mol[1])) for _ in range(n) } - ) - -def doit(): - p = argparse.ArgumentParser() - p.add_argument('-n','--ncpu',type=int,action='store',help='# of cores to use',default=1) - p.add_argument('-r','--repeat',type=int,action='store',help='# of repeated attempts to derivatize (may return different results)',default=42) - 
p.add_argument('-k','--keep',action='store_true',help='keep input and stripped derivatization SMILES in output',default=True) - p.add_argument('-f','--flat',type=str,action='store',help='flat output file, one SMILES per line') - p.add_argument('-t','--tsv',type=str,action='store',help='structured output tsv file (original, stripped derivatization, added derivatizations') - p.add_argument('infiles',nargs='+',type=str,action='store',help='input files') - - opt = p.parse_args() - - insmi = list(filter(lambda p: p[1], [(line.rstrip(),Chem.MolFromSmiles(line)) for line in fileinput.input(files=opt.infiles)])) - n_mols = list(zip(insmi, [opt.repeat] * len(insmi))) - - with ProcessPoolExecutor(max_workers=opt.ncpu) as executor: - out = executor.map(process_one_mol,n_mols) - - if opt.flat: - with open(opt.flat,"w") as flat: - if opt.keep: - for orig,removed,added in out: - for one in { orig, removed, *added }: - flat.write(one + "\n") - else: - for orig,removed,added in out: - flat.write("\n".join(added) + "\n") - - if opt.tsv: - with open(opt.tsv,"w") as tsv: - tsv.write("orig\tderiv. removed\tderiv. 
added ...\n") - for orig,removed,added in out: - tsv.write("\t".join([orig,removed,*added]) + "\n") - - - -if __name__ == '__main__': - doit() diff --git a/conda/environment-build.yaml b/conda/environment-build.yaml new file mode 100644 index 0000000..070f4bc --- /dev/null +++ b/conda/environment-build.yaml @@ -0,0 +1,8 @@ +name: gc-meox-tms-build +channels: + - conda-forge + - defaults +dependencies: + - conda-build + - conda-verify + - python >=3.8 \ No newline at end of file diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml new file mode 100644 index 0000000..9fc8b89 --- /dev/null +++ b/conda/environment-dev.yaml @@ -0,0 +1,11 @@ +name: gc-meox-tms-dev +channels: + - conda-forge + - defaults +dependencies: + - python >=3.8 + - pip + - rdkit + - pip: + - -r ../requirements.txt + - -r ../requirements-dev.txt diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 0000000..b5595a0 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,41 @@ +{% set name = "gc-meox-tms" %} +{% set version = "1.0.1" %} + +package: + name: {{ name|lower }} + version: {{ version }} + +source: + path: .. + +build: + number: 0 + noarch: python + script: {{ PYTHON }} -m pip install . + +requirements: + host: + - python >=3.8 + - pip + run: + - python >=3.8 + - rdkit + +test: + imports: + - gc_meox_tms + +about: + home: https://github.com/RECETOX/{{ name }} + license: MIT + summary: In-silico MeOX/TMS derivatization of chemical compounds + description: This package performs in-silico methoximation (MeOX) and trimethylsilylation (TMS) of chemical compounds + from SMILES strings or RDKit molecule objects. It also can identify whether a given compound is already derivatized + by MeOX or TMS method. 
+ dev_url: https://github.com/RECETOX/{{ name }} + +extra: + recipe-maintainers: + - RECETOX/conda + - hechth + - maximskorik \ No newline at end of file diff --git a/derivatization.ipynb b/derivatization.ipynb deleted file mode 100644 index e9e0ede..0000000 --- a/derivatization.ipynb +++ /dev/null @@ -1,425 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In-silico derivatization\n", - "\n", - "The notebook reads a list of SMILES (text file, one molecule per line), and performs in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010):\n", - "\n", - "* Metoxymation: ketone R(C=O)R' and aldehyde karboxyl groups are substituted with C=NO[CH3]\n", - "* Trimethylsilylation: in -OH, -SH, -NH2, -NHR, =NH, the hydrogen is substituted with -SiMe3\n", - "\n", - "The probability of all the substitutions can be adjusted, they needn't happen always. Multiple substitution attempts are run on each input molecule, and all distinct results are returned.\n", - "\n", - "Known limitation is metoxymation on cycles which should be broken. 
This is not implemented yet.\n", - "\n", - "The final outputs are two files:\n", - "\n", - "* `derivs_struct.tsv` with columns (all SMILES):\n", - " * original\n", - " * with derivatization groups stripped\n", - " * column #2 derivatized (multiple times) according to the above rules\n", - "* `derivs_flat.txt` -- the above with all the smiles flattened, one per line\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import what we need and setup the environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from rdkit import Chem\n", - "from rdkit.Chem import AllChem\n", - "#from rdkit.Chem.Draw import IPythonConsole\n", - "from copy import deepcopy\n", - "import random\n", - "\n", - "#IPythonConsole.drawOptions.addAtomIndices = True\n", - "#IPythonConsole.molSize = 200,200\n", - "\n", - "random.seed(42)\n", - "\n", - "import multiprocessing\n", - "from concurrent.futures import ProcessPoolExecutor\n", - "cpus = multiprocessing.cpu_count()\n", - "print('# cpus (including HT, typically): ', cpus)\n", - "\n", - "# don't run on HT cores, it just makes congestion\n", - "cpus //= 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import our payload\n", - "\n", - "from gc_meox_tms import is_derivatized, remove_derivatization_groups, add_derivatization_groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utility function for 3D rendering" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import py3Dmol\n", - "\n", - "def draw3d(m,dimensions=(500,300),p=None):\n", - " AllChem.EmbedMultipleConfs(m, clearConfs=True, numConfs=50)\n", - " opt = AllChem.MMFFOptimizeMoleculeConfs(m)\n", - " conf = min(range(len(opt)),key = lambda x: opt[x][1] if opt[x][0] == 0 else float(\"inf\") )\n", - " \n", - " mb = 
Chem.MolToMolBlock(m,confId=conf)\n", - " if p is None:\n", - " p = py3Dmol.view(width=dimensions[0],height=dimensions[1])\n", - " p.removeAllModels()\n", - " p.addModel(mb,'sdf')\n", - " p.setStyle({'stick':{}})\n", - " p.setBackgroundColor('0xeeeeee')\n", - " p.zoomTo()\n", - " return p.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Simple checks on manual inputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for s in ['CCC(=NOC)C', 'CCC=NOC', 'C=NOC', 'CSi(C)(C)C']:\n", - " print(s,is_derivatized(smiles='CCC(=NOC)C'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "remove_derivatization_groups(smiles='CCC(=N)C')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "m=Chem.MolFromSmiles('CCC=NOC')\n", - "remove_derivatization_groups(m)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "remove_derivatization_groups(smiles='C[Si](C)(C)OCCCO[Si](C)(C)C')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "m=remove_derivatization_groups(smiles='CON=CC(O)C=NOC')\n", - "m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "add_derivatization_groups(m)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read the input file\n", - "\n", - "The file is parsed line by line, errors are reported and ignored otherwise. 
\n", - "\n", - "The result is `mol[]`, a list of pairs (_original SMILES_, _RDKit molecule_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#smi_file='NIST_Si_100.txt'\n", - "#smi_file='NIST_Si_all.txt'\n", - "#smi_file='NIST_SMILES.txt'\n", - "smi_file='NIST_195_200.txt'\n", - "with open(smi_file) as f:\n", - " mols = list(filter(lambda p: p[1], [ (smi.rstrip(), Chem.MolFromSmiles(smi)) for smi in f ]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Essential statistics\n", - "\n", - "Count occurrences of (one-),di-,tri-methylsilane, TMS attached to -O, -N, -S, and methoximine. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SiMe1=Chem.MolFromSmarts('[Si][CH3]')\n", - "SiMe2=Chem.MolFromSmarts('[Si]([CH3])[CH3]')\n", - "SiMe3=Chem.MolFromSmarts('[Si]([CH3])([CH3])[CH3]')\n", - "ONSSi=Chem.MolFromSmarts('[O,N,S][Si]([CH3])([CH3])[CH3]')\n", - "\n", - "print('# total',len(mols))\n", - "with_sime1 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe1),mols))\n", - "print(\"# with SiMe:\", len(with_sime1))\n", - "with_sime2 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe2),mols))\n", - "print(\"# with SiMe2:\", len(with_sime2))\n", - "with_sime3 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe3),mols))\n", - "print(\"# with SiMe3:\", len(with_sime3))\n", - "with_onssi = list(filter(lambda m: m[1].HasSubstructMatch(ONSSi),mols))\n", - "print(\"# with ONSSi:\", len(with_onssi))\n", - "\n", - "MeOX=Chem.MolFromSmarts('C=NO[CH3]')\n", - "with_meox = list(filter(lambda m: m[1].HasSubstructMatch(MeOX),mols))\n", - "print(\"# with MeOX:\", len(with_meox))\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inspect whatever from the sorted categories" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "with_sime2[70][1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "draw3d(with_sime2[70][1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "draw3d(with_onssi[52][1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with_meox[4][1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('NIST_ONSSiMe3.txt','w') as f:\n", - " for m in with_onssi:\n", - " f.write(m[0]+'\\n')\n", - " \n", - "with open('NIST_SiMe3.txt','w') as f:\n", - " for m in with_sime3:\n", - " f.write(m[0]+'\\n')\n", - " \n", - "with open('NIST_MeOX.txt','w') as f:\n", - " for m in with_meox:\n", - " f.write(m[0]+'\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#test_smi='CCO[Si](C)(C)C'\n", - "#test_smi='C[Si](C)(C)OCC-N[Si](C)(C)C'\n", - "#test_m = Chem.MolFromSmiles(test_smi)\n", - "test_m = with_onssi[35][1]\n", - "Chem.AddHs(test_m)\n", - "test_m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_n = remove_derivatization_groups(test_m)\n", - "Chem.AddHs(test_n)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_d = add_derivatization_groups(test_n)\n", - "test_d" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "draw3d(test_d)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the in-silico derivatization\n", - "\n", - "Iterate over the `mol[]` list (read from file above), remove derivatization groups from each entry, and try derivatization several times to leverage from the probabilistic behaviour). 
Assemble the results.\n", - "\n", - "This can be time consuming, expect about 5,000 entries per minute per core. Memory consumption can also grow to several GB." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "def process_one_mol(mol):\n", - " return (\n", - " mol[0],\n", - " Chem.MolToSmiles(remove_derivatization_groups(mol[1])),\n", - " { Chem.MolToSmiles(add_derivatization_groups(mol[1])) for _ in range(42) }\n", - " )\n", - " \n", - "with ProcessPoolExecutor(max_workers=cpus) as executor:\n", - " out = executor.map(process_one_mol,mols)\n", - " \n", - "out = list(out)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Write the main outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('derivs_struct.tsv','w') as tsv:\n", - " tsv.write(\"orig\\tderiv. removed\\tderiv. added ...\\n\")\n", - " for orig,removed,added in out:\n", - " tsv.write(\"\\t\".join([orig,removed,*added]) + \"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('derivs_flat.txt','w') as flat:\n", - " for orig,removed,added in out:\n", - " for one in { orig, removed, *added }:\n", - " flat.write(one + \"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/example/NIST_195_200.txt b/example/NIST_195_200.txt new file mode 100644 index 
0000000..4f0e1f4 --- /dev/null +++ b/example/NIST_195_200.txt @@ -0,0 +1,4 @@ +CC(=O)N([Si](C)(C)C)[Si](C)(C)C +CCC(=NOC)C +CCC=NOC +C=NOC \ No newline at end of file diff --git a/example/README.md b/example/README.md new file mode 100644 index 0000000..dec97c3 --- /dev/null +++ b/example/README.md @@ -0,0 +1,13 @@ +# In silico derivatization example + +The example Jupyter notebook `derivatization.ipynb` reads a list of SMILES (text file, one molecule per line), and performs the derivatisation, also inspecting its results. + +The final outputs are two files: + +* `derivs_struct.tsv` with columns (all SMILES): + * original + * with derivatization groups stripped + * column #2 derivatized (multiple times) according to the above rules +* `derivs_flat.txt` -- the above with all the smiles flattened, one per line + + diff --git a/example/derivatization.ipynb b/example/derivatization.ipynb new file mode 100644 index 0000000..2367adc --- /dev/null +++ b/example/derivatization.ipynb @@ -0,0 +1,703 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# In-silico derivatization\n", + "\n", + "The notebook reads a list of SMILES (text file, one molecule per line), and performs in-silico MeOX + TMS derivatization (as described e.g. in https://doi.org/10.1021/acs.analchem.7b01010):\n", + "\n", + "* Methoximation: ketone R(C=O)R' and aldehyde (-HC=O) carbonyl groups are substituted with -C=NOCH3\n", + "* Trimethylsilylation: the acidic hydrogen in -OH, -SH, -COOH, -NH2, -NHR, =NH is substituted with -Si(CH3)3\n", + "\n", + "The probability of all the substitutions can be adjusted, they needn't happen always. Multiple substitution attempts are run on each input molecule, and all distinct results are returned.\n", + "\n", + "Known limitation is methoximation on cycles which should be broken. 
This is not implemented yet.\n", + "\n", + "The final outputs are two files:\n", + "\n", + "* `derivs_struct.tsv` with columns (all SMILES):\n", + " * original\n", + " * with derivatization groups stripped\n", + " * column #2 derivatized (multiple times) according to the above rules\n", + "* `derivs_flat.txt` – the above with all the smiles flattened, one per line\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Import what we need and setup the environment" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# cpus (including HT, typically): 24\n" + ] + } + ], + "source": [ + "from rdkit import Chem\n", + "from rdkit.Chem.Draw.IPythonConsole import ShowMols\n", + "import random\n", + "\n", + "random.seed(42)\n", + "\n", + "import multiprocessing\n", + "from concurrent.futures import ProcessPoolExecutor\n", + "cpus = multiprocessing.cpu_count()\n", + "print('# cpus (including HT, typically): ', cpus)\n", + "\n", + "# don't run on HT cores, it just makes congestion\n", + "cpus //= 2" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# import our payload\n", + "try:\n", + " import py3Dmol\n", + "except ImportError as e:\n", + " !pip install py3dmol\n", + "\n", + "from gc_meox_tms import is_derivatized, remove_derivatization_groups, add_derivatization_groups, process_one_mol\n", + "from plotting import draw3d" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Simple checks on manual inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "CCC(=NOC)C True\n", + "CCC=NOC True\n", + "C=NOC False\n", + "C[Si](C)(C)C False\n" + ] + } + ], + "source": [ + "for s in ['CCC(=NOC)C', 'CCC=NOC', 'C=NOC', 'C[Si](C)(C)C']:\n", + " print(s, is_derivatized(smiles=s))" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'CCC(=N)C'\n", + "ShowMols(\n", + " [\n", + " mol := Chem.MolFromSmiles(smiles),\n", + " remove_derivatization_groups(mol=mol)\n", + " ],\n", + " legends=['Original molecule', 'Try remove deriv group'])" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'CCC=NOC'\n", + "ShowMols(\n", + " [\n", + " mol := Chem.MolFromSmiles(smiles),\n", + " remove_derivatization_groups(mol=mol)\n", + " ],\n", + " legends=['Derivatized', 'Derivatization group removed'])" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'C[Si](C)(C)OCCCO[Si](C)(C)C'\n", + "ShowMols(\n", + " [\n", + " mol := Chem.MolFromSmiles(smiles),\n", + " remove_derivatization_groups(mol=mol)\n", + " ],\n", + " legends=['Derivatized', 'Derivatization group removed'])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAIAAAC7/QjhAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3deVyM+R8A8M9MJ0UqXY4OQlKREiqhsq6k1IjSWvw2x5J77bpid13LrrKs1S4r13bIUcgdIpKQ5MiRpJRKpXOmms/vj+8YOZaoZmzzeb+8vHqO+T6fmXnm+Tzf43keDiICIYQQIqu40g6AEEIIkSZKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiVCQgghMo0SISGEEJlGiZAQQohMo0RICCFEplEiJIQQItMoERJCCJFplAgJIYTINEqEhBBCZBolQkIIITKNEiEhhBCZRomQEEKITKNESAghRKZRIiSEECLTKBESQgiRaZQICSGEyDRKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiVCQgghMo0SISGEEJlGiZAQQohMo0RICCFEplEiJIQQItMoERJCCJGg0lIoKZF2EK+Rl3YAhBBCZENSEvzvfyAvDwBQXQ1//QVWVtKOCQCAg4jSjoEQQkhTJxBAt27wyy/g6goAcOAAzJ8PqamgqCjtyKhplBBCiATEx4OamigLAoCbG7RsCRcuSDUmEUqEhBBCGl9GBhgavjanQwfIyJBOMK+jPkJCCCGNr3VrKCp6bU5hIbRuDRs2wJMnMGYM9Owppcioj5AQQogE5OaCmRmkpICuLgBATg6Ym0NqKtjbw717AACGhuDqCuPHSz4jyi1btkzCmySEECJDCguhqAh0dUFBAaZPB6EQEhLgm29gzhxwdAQrK1BRgceP4ckTSEiA4GDYvx8KC6FtW1BXl0yAVCMkhBDSaNLTYfhwkJeHuDhQU4MbNyA+HgDA1hYsLF6tVlMDZ8/CP//Avn3w/Llopo3Ni3HjXri7t2vXrlFjpERICCGkcSQmwogRkJsLFhZw9Cjo6X34JTU1cPEi7NwJoaHw4sVmB4fp58/37duXx+N5eXnpsmbVhkaJkBBCSCM4ehRGj4aSEnB2hr17QU3t415eUQFHjvwYG7tq27aKigoAkJOTc3JyGjNmjLu7e6tWrRowUkqEhBBCGtqff8K0aVBdDRMmwJYtoKDwySVVVFScPHly586dBw8eFAgEACAnJzdw4EBfX193d/cWLVrUP1hKhIQQIhN27Nhx+PBhR0dHHx8fVVXVRtqKUAhVi5crrVoGHA78+CMsWtRQJRcWFu7bty8sLOz06dM1NTUAoKamlpOTo6ysXM+SKRESQkjT9/fff0+cOFFOTq6mpkZZWdnZ2ZnH43l4eKioqDTgVvh8GD8elFMS/84ezPltA4wb14CFiz1//vzQoUM7d+5s3rz5wYMH618gJUJCCGnifvzxx4CAAEQ0NDRs165dfHy8UCgEgJYtW7q5uY0ZM8bZ2VmhHq2XTF4euLrCpUugrg7nDhaa9Wv0ix+qq6vl5RvgtjCUCAkhpMmqrq6ePn36li1b5OTkAgMDp0+fDgBPnjyJjIyMiIiIj49nKUBdXd3FxYXH4w0dOvTTUsuDBzBsGKSlgZERHD4MXbs28BtpVJQICSGkaSotLR09enRMTIyKisqePXtcXV0BoLy8vHnz5myFjIyMAwcOREREXLhwAQA4HG63bhW2toq+vmBnBxxOXTd08SKMHAl5edCrF0RHg45O47yfRkOJkBBCmqDs7GwXF5dr167p6upGR0dbW1uz+T4+PvHx8V5eXuPHj+/6suKWmpoaGhp65Yr80aMBbI6BAXh5wZgxYGn5gQ3t3Qu+vlBZCa6u8M8/8DLJ/pdQIiSEyKiysrKGHSrybnl5sGkTJCeDpiYMGQKeno2+RYCUlJThw4dnZmaampoePnzY8OVjH4RCYefOnR88eMAme/ToMWbMGC8vL/E
KqakQEQG7dsHLVUR3AP3qK1FG/O47SEmB6GjgcgEAzMwgKAiGD4fx42HTJmiIDjtpQEIIkT179+7V0dG5fv16426mshJNTHDZMkxNxdhY7N4dN21q3C0iHj9+vGXLlgBgZ2eXl5f3xtKampq4uDh/f38tLS1xIjA1NV29enVWVhZbRyjEuDicPh11dBBA9M/CArdswUmTUFsb//hDVFrz5sjnY2pqY7+nxkWJkBAii8aOHQsA+vr64qN/o9i6FV1cXk2mpmLr1igUNuYGt7Lxnzwer6Ki4j1r8vn8qKio2tcUtm5t4ugo/OMPFGfPmhqMi0N/f9TWRgCcNw8nTcK1a7F9e8zJQXyZCP/rqGmUECKLBALBkCFDYmNjzc3N4+Li1D72BmB1NGsW6OjA99+/mqOhASkp0LZtg28KEZcvX758+XIA8Pf3DwwM5NRtuEtFRcWhQ4dCQ0OLisacPs0DADk56NMHeDzw8YHWrQEABAI4fhw6d4affwZHR8jMhJQU2LULVFSgsBAUFRv83UiWtDMxIYRIR0FBQZcuXQBg6NChVVVVjbKN6dNxzZrX5mhp4aNHGBuLz5414HYqKyt9fHwAQF5e/g9xw+VHKivD8HB0cUFFRVFzqJISurhgSAi+eCFaZ9Ik3L0b+Xw0NcUzZ5pIjZAr7URMCCHSoaGhERMTo6OjExMTM3Xq1EbZhokJXLv2ajIjA/h80NUFDw/Q0wN7ewgKgvz8em6ksLBw8ODBu3fvVlVVjYqKmjx58qeV07w58HgQHQ1Pn0JwMDg6QnU1HDoE48eDnh6Ehr5aU1ERNm2COXOgaTQpUiIkhMguIyOj6OhoFRWVv/76a82aNQ2/gS+/hPPnYc8eqKqC7GyYPBnmzoWCArC1BTk5uHABZs2Cdu3AzQ3++QfKyj5hC+np6ba2tmfPnm3Tps25c+eGDh1a/6g1NODrr+HUKcjNhZAQcHaG8nIwM3ttnQEDoGtXqKio/9Y+A9KukhJCiJRFRUXJyclxOJydO3c2TImlpejmhgcOICLeuoVjx6KFBdrb4+rVWF0tWqewEENC0MUFFRREDZHKyqKGyNLSOm4nISFBR0cHAMzNzR8/ftwwwb9LZqboj5AQTEgQ/f30Kfr5YSM1KktSow2WQYSzZyE5GVq0gIEDwcioUbbyXrGxsd9+++2ePXs6deok+a1/vh4+hDNnoKQEevQAB4ePuHsEIU3X+vXr58yZo6ioePTo0YEDB9arrJwccHGBpCQwMIC0tA+PJMnNhYgICA2F+HjW1PjL4MEpurpeXl7vvwXogQMHfHx8ysvLBw0atHfvXnbJBPkUjZVhfXywf3/csgVXrEA9PYyObqwNvcv9+/fd3NzYG5w6daokN/25i47GNm1w1SrcsgX79cNx46QdECGfC39/fwDQ0NC4c+fOp5eSmooGBgiAHTrgx5bz+DGuXYvW1l07dmSHL3V1dV9f36ioqLfH8gQGBnK5XACYOHGiQCD49IBJY11HGBuLJiavWgDi4tDQEGtqGmVbrystLQ0ICGCPp1JRUQkICHj/lTSypboaDQwwPl40WVWFnTrhmTNSjUm2xcbiypW4ciWePi2ak5iIx469WiE6GpOTpRKaDKqpqXF3dwcAIyOjHHaV3Mc6dQpbtUIA7NMHc3M/OZJHjx4FBgba2dmJayyampp+fn5xcXFCoVAoFC5YsAAAOBwOe6YEqafGSYQ//IDz5r02R0sLMzIaZVsvCYXC8PDw9u3bs/2Dx+NlZGQgokAgmDp1qru7++7duxs1gA/avXu3u7v71KlTpXb6lp6OOjqvzZkzB3/6STrBkMWL0doad+zAkBC0tMTFixERg4Jw+vRX64wfj1u3SitAGVReXt6nTx8AsLa2Lq1zRx0jDAkRXXbg6YkNdP6dkpKyePFiY2NjcUY0MDDo1q0bACgpKe3Zs6dBtkIaJxHOmYM//PDaHGNjvHatUbaFiIiXL1+2tbVlO0qvXr0uXLjA5p8
4cYLtNCw7+vr6Pn36tPHC+Df5+fn+/v7ii1s7deoUHh4u+TDw6lXs1Om1OcuX45w5UoiEZGSglhYWFoomCwpQUxMzMykRSl1eXh5LPC4uLtXiZq0PCQwMnNivHwKgv39jtH7dvHkzICCgY8eOLBeqqKicobachtPQibC6GqurccMGHD/+1czycmzWDIuKsBFqQllZ6OeH/ftPA4A2bdps2bKlpqYGEW/fvj1kyBBx4unSpQvrc1ZTU1u7di1fUpeA8vn8tWvXsptWKCgodOnSRTxyZ8iQIbdv35ZMGIiIAgE+f47Nm792rurrixs3ir41Ikn79+OQIa/N+eILPHAAg4Jw2DDct0/0b+BASoSSd/v2bQ0NDQCYPXv2B1cWCAQTJkwAADk5ucTt2xs1MKFQuHHjRgDo0aNHo25I1jRoIjxzBrt3x82b8elT1NbGGzdE85csQU9PFAqxf3/09cUGqpOVl+MPP6CKCgJgmzb5S5YsY00ZhYWFCxYsUFRUBIBWrVqtXr26srISEdPS0ng8HktCxsbGEqiTnThxwtTUlG3R2dn55s2biCgQCLZs2dK6dWsAkJeX9/Pze/uuuA0sPx/9/dHeHoVCdHdHcadCcjJqa2NuLv7+O3btikePNm4YpLaQEPTweG3OqFG4YwcGBaGVFX7/veifuTklQqk4d+6ckpISAAQFBb1ntZKSEnbdnoqKSlRUlAQCKy4u5nK5SkpKNECmATVQIkxPR09P0aUw1taIiFFRaGKCffuimRl+8QU+fYrXr4sa0Fu2xDVr6nlbnqgoNDISbdDFBR8+RESsqakJCQnR1tYGAC6X6+vrm/tWf/XJkyfNXl4X6uTkdEOcrRvU3bt3hw8fzrbSuXPnQ4cOiRcJhUJELCgo8Pf3l5OTY6PUAgMDG+UOT3w+rlmDLVsiACoq4vXrmJ2Nzs5oZoZ9+2LXrsgCs7YWfZSenpie3vBhkLfFxaGZ2WtzTE3xwgVqGv18hIaGcjgcLpe7b9++d66QlZXVo0cPANDV1U1MTJRYYJ07dwaARn9uhiypdyIsK8OAAFRWRgBs3hwDArC8/NXS/PzXEt69e8jjiY65xsb4SXWyq1fRwUFURo8er8Y8xsaetbCwYLlnwIAB79lLqqqq3qiTPWu4m/49f/78jfpo7WbYhw8fGhsbh4SEsHR4/fr1AQMGsJgtLCxiY881VBiIiCdOoKmp6JNydn7tQSl8PubnvzYZGIgtWojypb//qxsLkkZSXY3m5vj336LJv/7CHj2wpoYS4Wflp59+UlVVPXz48NuLrl+/3qZNGwAwNTV99OiRJKMaM2YMAGzbtk2SG23a6pEIhUIMDxddMcPhII+HddwbTp1Cc3PRAdrREetcJ2MtfHJyCICamhgYKOrYevwYfX2xX78tANC+fXtxmnk/VieTl5dvqDpZXeqjc+fOfTtVR0VFdejQAQD69bvt4oIPHtQnCkREvHMHhw0TfcJduuC7fsbvwLpbuVzW1oxbtkjmihfZdecODhyIpqZoaooDB+Ldu4iImzfj3Lmv1pk8GXfskFaABBHZ4POqqqr9+/f//vvvx48fFwgEoaGh8fHxOjo6AwcOLBSPeJIUdiu4GTNmSHi7TdinJsIrV9DOTnSotbLCuLiPe3lVFW7YgBoaCHCu58wZM7Cg4MMvOnUKAVBBAf39sagIEbGkBBcuFFVHW7asXrdu88deNVh7TI2JiUlMTMzHvZGXTp8+XZf66NvJkl2uVFFRsW7dNtaEqayMCxdiScmnhFFQgDNmYGFPRwRADQ3csOGjb3+UmIi2tq9auV+OvyWNJS8PG7uTmNSbs7Ozl5dXYGDg119//fTp071792ZlZd25c0diw+5qO378OADY2dlJftNN1UcnwqysLP///U+ooYEAqKeH27d/+kMm8/Nx2rRe3cpYDW/jxg8ftFesEN2rgVVH9fVfVUfrc5miuE7Gxkzfv3+/7q99/Pixr68vey2rj37wJWw4D+u
Krz2cp3aVTE/v46pkVVW4cSNqaiIAjut2FadNe63x86MIhbh9O+rpIUCBhsaU//2vcZ9cKuPYOQf5jOXn5ysqKta+lGL//v3Z2dnSiqegoIDD4aiqqtZQm00D+YhfYEVFxcqVK9mzjMMcHfG77z6x2vK627dx6NBXzXhHjnz4JW9UWs6fr38UyOfzAwMD2c36FBQU/P39i4uL3/+S2nexad68+cfexeatyuhJNj8x8bXKdl3e3enTaGEhesnAgQ10K5KSEvzuu5mOjgCgqqq6cuVKukdPo6BE+NkTCATt27f38/O7y5qvEZ2cnM43yHHnUxkYGACARK+/atLq+gt8o870oAE6st4oHzt2fDUK9P59zMnBvXtfVYkuX8ZHj+pVZ6qL7OxsPz8/dgc/PT098VWJb2B3sdHX13/jLjafQHzJf79+t52d8eZNVv6b9d1Hj/DRIzx48NULz53DnBxR/yj73Nq3xzpURz/OJ9R3ycehRPhfkJmZOWfOnLZt244cObKsrEzqiZDdCk7qd8tqMj78C2yoXrQPqqzEVatEQxeVlHDrVgTAjRtFSydMwE2bREvr04tWF1euXBHf5c/Kyiru9R7QxMRE8V1srK2tL9S7F00gEPz2207W2KyggHPmvKMHtEUL3LQJATAiQvSqoUNx61ZUUhItXbUKKyvrGci/qmMPKPkUlAj/O/h8vp2dXXBwsNQT4Q8//AAA8964kyX5VO/7BTb4uMq6yM5GPz8cPBiPHUN7ezQwQNYUP2EC7tiB48djw4yr/BBW52PtD6zO9+jRo6ysLHF9sfZdbBpEQcGrMbEaGm+OiWVvf9Ag7NABWZPt0KF47BgOG4Y8HjbmY8hE6jImlnwKSoSfPaFQKL7vqKur659//in1RHjo0CEAcHR0lGIMTcm//gKTk5PV1dXZlXb+/v7Pnz+XZFhVVXjsGI4cievXo5cX4stEKOEnQJaWli5atIj1AiorK4v/WLRo0cfekLeOrl1791WSVVW4Ywf6+eF33+HMmYgvE6GEP5Dnz5+Lz43U1dWT6cEI9UeJ8DO2b9++BQsWpKend+rUadCgQT179hwwYEBpaemwYcMuXrwoxcBycnLYULu6XCpGPuhff4HV1dXm5uaNd++VD2KJsKoKzc3x2DFRIpSKzMxMX19fFRUVFRWVxugffVtYmOj6TADk8UTDP1kiLC1FIyO8elWUCKWC3TfH1NRUAi0ETR8lws/V+vXrWfPPyZMnhUJhVlZW7Wcz8fn8OXPmZIof3C5xenp6ACCBw5Es+EDTqMTieBtLhIgYF4empjhunJQvLGY9ZBLbXHk5rl6NLVpghw6iu2SzRIiIkZHYrx8OGSK1RMhId/doOigRfn6qq6vZQ3rf88C/b7/9FgC6d+/+weHljYTdxDFCPGqA1AMX/h27/7rU2duDjQ0cPCjtOCSrWTNYsABu3YI9e0BZ+bVFo0aBigpcuCClyF76THYPQhpWZWWlt7f3hg0blJSUdu3atWzZsneutnDhQjMzs+TkZDc3N4FAINkYAQB69uwJANeuXZP8ppue9yXCz8e6daCoKO0gpKFdO+jd+x3zN26EqiqJR0NIU1dQUODs7BweHq6urn7s2DFvb+9/W1NNTS06OlpHRyc2Nnbq1KmSDJKxtLQEgKtXr0p+000PB182+n1uqqqAzwdVVdFkaSkoKkozHbLH6krx4xIIoLoamjcXTZaUgLIyKChIKxzSMJ727g0AegkJ0g6EwP3794cNG3bv3j0jI6MjR46YmJh88CVJSUn9+/cvKytbsWLFwoULJRCkWEZGhqGhoba2dm5uriS32yR9vonwcyP1REiaJNqvPhPx8fEjR47Mz8+3sbGJiorS0dGp4wsPHz48cuRIoVAYEhIivvuEZGhpaeXn52dmZrZr106S2216/htNo4QQ0nj27t3r5OSUn5/v5uYWGxtb9ywIAMOHD9+0aRMiTpo06dSpU40X5NvY0xCpm7D+KBESQmR
aUFCQl5dXZWWlv79/ZGRkc3H3Q51Nnjx51qxZVVVVHh4eN2/ebIwg34mNl6FuwvqjREgIkVE1NTVTp06dNWsWh8MJCgoKCgpiFw5+gl9++WXUqFHFxcUjRoyQWKcdGy9DNcL6o0RICJFFpaWlI0eO/OOPP5SVlffs2cMuHPxkXC53165dffv2ffTo0fDhw8vKyhoqzvd4u0a4devW+Ph46nL+WDRYpq5oUEOTsmED/Pkn1NQAAIwfD99+CxyOVAKR8n6FCGvWwM6dLBSYPBlmzJBOJJKVnZ09YsSIq1evampqHjx4UHyT/XrKz8+3tbW9d+/e8OHDDx48KCcn1yDF/htEVFdXLy4uzsnJ0dHRKSkp0dHRqaioaNeu3ahRo3g8np2dHUdKO/Z/jJQu5P/vKe/SpbxLF2lHQRrCoUPYrZvobu7PnmHPnii9x9lI+We4ezf26oV5eYiIT55g16546JDUgpGUlJQU9gw1Y2PjtLS0hi38/v37WlpaADB58uSGLfmdHBwcAGD//v2ImJeXN3/+fPbWmM6dOy9ZsiQ1NVUCkfynUSKsM7oVVpMxfDju2vVq8sAB7NdPWrFIORHa22NU1KvJkBB0cZFaMBKRlJTEnr/t4ODQSLcJjIuLYzfoX79+fWOUX5u9vT0AmJiYPHnyhM0RCoXnz5+fMWOGrq6uOCOam5uvW7eN7kv6b6iPkMiehw+hS5dXk127woMHor/v3pVKRJImfpsPH0Lty8ZrfxRNlKmpqbm5uaen57FjxxrpNoH29vY7duzgcrlz587dt29fY2xCbPTo0QBw584dfX19e3v7oKCg3NxcOzu7DRs2ZGVlxcXF+fv7a2trp6SkREdrdOwI3brBsmXw8GGjBvUfJO1M/N9BNcImo3dvPH361WRiInbtioh45QpyuRJ64iUiIla9vFGe5J7jkZmJvr7I5eKlS4iIJiaYlPRq6cmT2LevhCKRnpKSEgk8vWjVqlUA0KxZMzZ6pfGsX7/ew8OjWbNmbF+Sl5cfPHjwtm3bCgsL2QoCgeDQoUOzZuW1bCk6jHG52K8fbtqE9FBRho7sdUaJsMnw98fvvns1+dNPOGECIuKOHaiiggCorIyLF2PjPHJS7NSpU+bm5oqKisrKyl26dDl8+HCjbg5LS3HxYlRWRgBUURE9zGX8eFy16tU68+fj7NmNG4YsmTx5MgCMGjVKAtsqLy+Piori8XiKL29EqaSk5OLiEhIS8uLFC7ZORQVGRiKPh82aiY5ncnI4aBBu3YovV5FRdGSvM0qETcbTp9i+PS5dijEx+NNP2LYtpqeLFmVloZ8fcrkIgG3a4JYtWFPT4Nu/d++eq6srO1q1b9++ffv27G9XV9d79+41+OZQKMTwcNEjLjkc5PHw0SPRoocPsU0bXLECY2JwyRLU18daj9wj9VRVVbV27drKykpJbrSwsDAkJMTFxUXh5Z2IHRwyXFwwJOTVqV15OUZFIY+HioqiA1tDjxn6j6Eje51RImxKcnNxwwacORMDAzErC0tL0dkZjx4VLU1MRFtb0TdubY0XLjTUZktLSwMCApSUlABARUUlICCgoqJCIBAEBgaqqakBgIKCgr+/f1FRUUNtEa9cQTs70XuxssK4ONH8o0fR2RlLS/HJE1y/HmfOxN9+w2fPGmy7RNpycnJ+++03JycX1swBgGpq+OWXeOQICgSidZ4/x61bcdo0qQb6GaAje51RImzCfv5Z9P26uIjOjWtq8O+/UU9P1KMyfrzocotPxW7KzAbycTgcX1/fp0+f1l4hPz/f39+fXXmmqakZGBhYXV1dny1idjaOHy+q3erp4d9/i2q3aWno4iJ6vz//XK9NkP+CzEwMDEQ7O+RwRF+7ujr6+mJUFLK+6fBw7NbtVX/huHGiHmTZQUf2OqNE2IQJBBgYiGpqCIAKCujvj6xOVlqKAQGorIwczlQrK1aB+4TiExIS+vbty9qpevXq9Z7RE0lJSf369WNr9uzZ89y
5c5/0bgSBgYFfW1khlyt6O+wp6uztKCmJugkDAlCyrXZEuu7dw59+QjMz0cGMnSDNmoV//ok6OjhunGg1JyeMjZVmnJJHR/Y6o0TY5OXno78/yskhAGpqYmAgsjrZ/ft3Xt5vpUOHDvv27at7kU+ePPH19WV392jbtm1ISEhdxitGRUUZGhqyLbq4uKSLuzDrYN++fR06dGCvvTNjBt6/j4hYU4MhIaijI6rg+vri6/VRIlNSUzEgALt0QQC0s8O//sKpU9HSEk+dQqRESN6HEqGMuHoVHRxEX7elJZ49y2afPn3awsKCJZiBAwcmJye/v5jy8vLVq1erqqqyMfQLFix48TEj88rKyt54eUlJyftfcvv27SFDhrAITUxMjhw5IlqQkIB9+ojekY0NXrxY9zBI05aYiHFx+NdfOH06xsWhiQlWVlIiJO/x4sVrQ4wFgtealSoqUGKXgpHGJhRiaCjq67Nhlufnzs3IyEDEmpqakJAQbW1tAOByub6+vrn/ch1Wfap0tdWxQllQUODv7y8vLw8AGhoagYGB7MLEjIyM83PnirqG9PUxNBQb//o58p/DEiEijh+PK1dSIiTvlJiI3bujtTVaWWH37piYiIgYFCTad5jx43HrVmkFSBpFeTmuXs03MWmlrFy7SldQUDBjxgyWdebNm/fGi65evSru5LO0tDz7skJZHwkJCX369GFl2tjYXHyrSjdv3jx2JfWMGTPYbcPEFUpVRcXKLl1wwQL8UIWSyCxxIszNRX19NDOjREjewOejsfGr+zHu24fGxsjnUyKUEVmPH48dO5bVydq3b79nzx5WJ0tNTeXxeLVvVtnwwz5rYZXRfxt0WlBQwOPx2L2VhULhnj172LWJHA5n7NixWY8fN1QYpEkSJ0JE3LQJASgRkjecPo3W1q/N6dkTY2MpEcqUy5cvv2fYZ+NeCFjL25chvnGxdoMMOiWy5vJlPHBA9HdNDS5fLrGbDH4uKBF+yPbtyOO9NiEi458AACAASURBVMfTE7dvx6AgVFdHExPRPzU1SoRN279dCHjixAlTU1OWe5ydnSXwyJt79+7xeDy2RWNj4/DwcETMy8trvPookRExMThpEsbESDsOiaNE+CHR0ejk9NocR0c8fJhqhLKpuLj422+/ZbdzVFVVZaM6AaBr165HxTemkYijR4927dqVbV0ciaKi4rffflvMrhok5CP9+CMC4Ny50o5D4ugxTB/SqxckJ0NOjmjy6VO4cQOsraUaE5Gali1brlmzhnUQlpaWlpaWKigorF69+vr164MHD5ZkJIMHD75x48aWLVsUFRVZJM7OzteuXVuzZg173h4hH6tnTwCAq1elHYfEyUs7gM+ejg4sXAj9+8O0aYAIv/8OS5aAtra0wyLSxBokt23blpub6+Hh0blzZ6mEIS8v7+fnN2DAgMjISB0dnYkTJ0olDNJkWFkBAFy9CojA4Ug7Ggni4MtnZJP3SU6GixcBAGxtgV1VfesWvHgBLwe1Q1wc6OpCp05Si5AQQuqtbVvIzob796FjR2mHIkGUCAkhhIiMGAGHDkF4OLwcjyUTqI+QEEKICOsmvHZN2nFIFiVCQgghIpaWALI3XoYSISGEEBGrnjixW8Jkub+kHYhEUR8hIYSQWrS0ID8fMjOhXTtphyIhVCMkhBBSi+w1j1IiJIQQUovsXVdPiZAQQkgtrEYoSyNHKRESQgipRfZqhDRYhhBCSC2IoK4OxcWQkwM6OtKORhKoRkgIIaQWDgd69ACQodZRSoSEEEJeZ2kJysqQnS3tOCSEmkYJIYS87sULUFCA/Hxo0wbk5KQdTaOjGiEhhJBaBAKYPRvMzeHrr8HYGObPh6ZeX6LnERJCCKnlt98gNxdu3QJFRSgpAScnCA2FsWOlHVYjohohIYSQWnbtgm+/BUVFAIAWLWD2bNi9W9oxNS5KhIQQQmrJzAQDg1eTRkaQkSG9aCSBEiEhhJBatLTg+fNXk/n5oK0Nz57
B0KGwfTsUFUkvssZCo0YJIYTUMm0aqKvDihWiya++gk6dQE0NZswAAFBSgkGDgMcDd3do0UKKYTYgSoSEEEJqyckBe3twdITu3eHCBbh9G86fh6oqiIqCiAg4dgyqqgAAlJXB2Rl4PPDwABUVaQddL5QICSGEvK6sDE6fhvR06NwZHB1FA2eY3FyIiIDQUIiPF11WoaYGbm4p48Z1HTBAXv4/eSUCJUJCCCEf78kTiIyEiAiIj+crKuoqK3O4XBcXFx6PN3To0P9WRqRESAghpB7u3bt35Ij7n3+mpqayGW3atOHxeF5eXn369OFwONKNri4oERJCCGkAqampERERoaGhd+/eZXPat2/v7u7O4/Hs7e2lG9v7USIkhBDSkK5cuRIaGhoeHp6ZmcnmBAUF+fv7Szeq96BESAghpFEkJSXt2LEjIiLi7NmznTp1knY4/4oSISGEkEYkFAq53M/65i2UCAkhhMi0zzpLE0IIIY2NEiEhhBCZRomQEEKITKNESAghRKZRIiSEECLTKBESQgiRaZQICSGEyDRKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiVCQgghMo0SISGEEJlGiZAQQohMo0RICCFEplEiJIQQItMoERJCCJFplAgJIYTINEqEhBBCZBolQkIIITKNEiEhhBCZRomQEEKITKNESAghRKZRIiSEECLTKBESQgiRaZQICSGEyDRKhIQQQmQaJUJCCCEyjRIhIYQQmUaJkBBCiEyjREgIIUSmUSIkhBAi0ygREkIIkWmUCAkhhMg0SoSEEEJkGiXCd0DEiIiIW7duSTuQhnT16tWoqChpR0EIAEBxcXFERMSTJ0+kHch/T3h4+M6dO6UdRUM6cODA1q1bpRsDBxEltrHjx4+fPn3a0dHxiy++qD0/ISFh//791tbWnp6edSmnqqpq//79Fy5cePHihYaGxoABA4YPH87lipJ6YmJiZGQkj8ezsrKq/apffvlFIBB89913HA7n/eXX1NTIy8svX7586dKldX93ly5dio6ODggIUFRUrPur3qOoqGjt2rVeXl4WFhb1L+2bb76JiIh49uxZ/YtqDCkpKbt372Z/KykpaWtrf/HFF506dfrYcsrLy1esWOHu7m5tbd1QsS1btqxfv35OTk4NUtqGDRv09PR4PF6DlPYflZKSYmFhERERUcef/Gdi4cKF70zerq6unp6ex44d27dvn4ODg4+PT+2l8fHxISEhlpaWU6ZMEc/MyMiIjo5+8OBB8+bNLSwshg0b1qJFi7rEMHTo0MLCwkuXLtU97Lt3765YsWLp0qXGxsZ1f9V7FBUV+fv7+/n52dvb1780T0/Pu3fvpqSk1L+oTybRGmF4ePiaNWuWLVv2xvx169atWbNm27ZtdSkkPT29e/fuPj4+N27cKCsru3jx4siRI/v27Zubm8tWsLCwOH78OI/He/HihfhVwcHB8+bNMzAw+GAW/GRpaWl79uyprq7+5BIQsU+fPvfu3WOTZWVle/bsycjIaKAAP2t3795ds2ZNbGxsUlLSyZMnly9fbmJiMnnyZIFA8FHl8Pn8PXv2PHjwoD7BTJw48cCBA+LJsLCwGzdufHJpS5cu3bhxo3jyyJEj8fHx9QmPSEtNTU11dXV1dXV6evrOnTufPXvGJoVCIQAcOXIkODh4yZIlb9QuNm7cGBwcvH37dvGcCxcudO3adfHixcnJyWfOnJkwYUJgYGDjhV1ZWZmenl5ZWVmfQoyNjS9cuMD+rqmpSU9PLy0tbYjoPg8oQU5OTqqqqhwOJz09XTyzpKSkefPmqqqqJiYmHyyhqqqqe/fuurq6ycnJ4pmxsbEtWrRwdHQUz0lLS1NVVfX19WWT9+7dU1VVHTduXB3jZMls+fLldVy/obDq2vXr1xuj8GnTpmlpaTVGyQ0iIiICAC5fvswmq6urAwMDuVzujBk
zJB+MoaHhn3/+2VClOTk5ff/99w1VWtPATiwiIiKkHcgn2rt3LwAkJCTUnunq6qqpqQkAly5dEs8sKytTVVXV1NTU1tYWz7SysurUqVNBQQGbLC0tff78eR03PWTIkN69e9f7HXyciooKDodz5syZxijcw8PDzMysMUquO3lJJt309HQej3fgwIGwsLAFCxawmQcPHuTz+VOmTNm2bRsiimtsOTk5YWFhT58+bd26tZeXV/v27QEgOjo6OTl5z549tVsLBwwYsHTp0vnz51+4cMHOzg4AOnXqtG7duilTpgwZMoTH4/n4+Kirq2/YsOH94Z09e/bs2bNVVVXt2rV7Y9HVq1cPHz5cVlbWpUuXsWPHKisrV1dXb9u2zc3N7dq1a2fPnrW3t9fX1z937pyfn5+8vHx4eHjLli2HDBkiLiE6OlpRUXHw4MFlZWWHDx++ffs2AJibm48cOVJOTg4AoqKirl27BgCRkZEJCQkGBga2trY7d+784osvjI2Nb9++HRcXVzskS0vLXr16AUBZWVlYWFhaWpqKioqbm5u5ubl4nadPn+7bty8zM1NLS+vu3bsf811JmZyc3MyZM2/evLl58+YFCxa0bdsWAF68eBEWFvbgwQNVVVUPD4+uXbsCwOnTp+Xl5Y2MjHbs2AEAM2fO3LFjx8CBA7t27Xrp0qWbN29OnDhR3Gx+48aNy5cvT5w4kb0wKSmptLRUX1/fy8urZcuWAHDx4sWUlJSnT5+eO3dOKBQ2b9583LhxwcHB7NPOzMyMiYmpHWenTp0GDhwIAJcuXYqPj8/Pz2/btq2Hh4euri4ApKamXrhw4c6dOwoKCsHBwQAwceLEyMhITU1NZ2dnVsLjx4/37t377NkzTU1NDw+PDh06AEBubu7BgwcnTpx4+fLl48ePKygoDBs2zNLS8p2f1Y0bNx48eODk5LRt27bCwsKpU6fq6urW1NRERUVduXKFw+EMHDiQteveuXPn2rVro0eP3rdv37Vr19TV1ceOHduuXbvU1NT9+/cLhcKBAwf269dPXHJeXl54eHhmZmaLFi1cXV3ZrlVTU7Nt27bu3bvb2NiI1wwJCTE1NWU75KVLl44dO1ZRUWFubj569GgFBQW2Tk1NzeHDhxMTE+Xk5MTfSFOSnp4+dOjQixcvhoWF9e7dm808dOhQeXn5jBkzVq1aVVZWpqKigog3btzw9fXV0NBg66ioqKioqLyn5Li4uAMHDmRnZ7dt2/bhw4fq6uriRcnJybt27Xr8+HG7du2++uor9h0FBgZaW1uXl5fv2bPHyMjoyy+/DA4Onjp1qr6+/o4dO3Jzc+fPny8uITw8PDs7e9asWc+ePQsLC0tNTeXz+SYmJl9//TWLMCQkhJ2hBgcHx8TEGBkZeXt7r1y5ksfj9ezZ89KlS7WbTwDAyclp0KBBAPD48eOtW7feu3evVatWPB6P/VKYS5cu7d27NysrS09P77M4NEks5VZXVysoKKxaterLL7+0tLQUzx8xYoSTkxM7ij19+pTNvHDhQsuWLQ0MDHg8noGBgZqa2p07dxBx2rRpAPDixYs3Ck9LS4O36nCjRo1SU1Pz9fWVk5OLi4t7T2wCgcDDw4PD4djZ2Xl4eLDuJXFpq1ev5nA4vXv3HjVqlIqKSq9evaqrq8vLywHA0dFRXV3dwsLixx9//O233wCgpKQEEf38/DQ0NPh8PiuBnRWuW7cOEe3t7Tt27Dhq1KgRI0YoKSkNHjxYKBQi4uzZs1l279Onj7Oz85IlS9LT0wFgz549iBgZGWn1koGBAQAEBQUh4uPHj42MjDQ0NEaNGtW9e3c5Obno6Gi20YMHDzZv3rxdu3ajRo1ycnJSV1f/D9UImePHjwPAjh07EPH+/fvt2rXT0tJi548KCgonT55ERB8fHwsLC21tbQsLCwsLi7y8PAAIDg5GxOjoaAA4e/asuEBXV9cBAwYg4pIlSzQ0NIYPH87j8XR0dPT09PLy8hBx8+bNrNu
ja9euzs7Oo0ePRkRlZWXW3nXhwgXxt8C6W+bPn4+I27dvV1FRGTx4sJeXl6GhoaqqampqKiIePHjQ2dmZy+UaGho6Ozs7OztXVlZ269bN29ubxXP48GFlZeUOHTp4eHh06tRJUVFx7969iMjaTj08PNq3bz969Ohu3brJy8vXfiO1rV27VktLy9zcvEuXLh06dMjKyqqsrBw4cKCiouKIESPY0YftzMHBwc2aNXN0dLSwsBg9erSOjo6Wltbq1avZp9q7d28OhxMSEsKKTUxM1NDQ0NPTGzVqlIWFBZfL3bBhA1vUp08fOzs7cQCsd+f48eOI+P333wOAvb29m5ubsrKyuJ2moKDAxsZGUVHR2dnZ3d3dxMQEmlyNsEWLFkuWLJk/f76enl51dTWbOWrUKAcHB7Yrsr0CEfX19bW1tZOSkuqyrUWLFgGAg4PDnDlzvL29NTU1xTXCvXv3ysvL29razp4929LSslmzZjdu3EBECwsLc3NzXV1dFxeX6dOnnzlzBgAuXLiAiL/99huHw3n48CErQSgU6uvrz507FxHHjBljY2MzY8aMmTNntm3b1tDQsLS0FBE3btzo4uICACNGjPDz81u7dm1WVhYAbN++HRGPHTvGe6l79+4AwHahpKQkNTU1ExOT2bNnOzk5cTicyMhIttEVK1ZwOBxbW9s5c+aMGzdOW1tb6jVCySVC1tcVGhrKxi7eunULEQsLC5WUlDZv3syqOxcvXkREoVDYuXPnnj17lpeXI2JRUZG2tvb48eMRcfjw4Zqamm8XzufzuVzupEmTas8sKCho06YNALCj2HusXr0aAHbv3s0mazeN3r59m8vlzps3jy06ceIEAISHh7NEaGdnx/YVRKydCNmed/jwYbYoPDycy+VmZmYiYn5+vni7LP1fuXKFTbJfl7hptHYiFBMIBDY2NlZWVgKBABE9PT1bt2795MkTFratra2FhQUi5ubmqqioODs7i8P7bzWNMqy7lH0RQ4cObdOmzbNnzxCxqqqqR48effv2RUQfHx9FRUX2I0fE2olQIBC0bt166tSpbFFxcbGysvKWLVsQ8cWLF+LTlIyMDA6Hw05TEPHx48cAULtpVJwIxYRC4YgRI4yMjIqLixGxoqJC/DmXlJRoaGiIN4qIioqKtZtGxYmwoqJCS0vLwcGhoqICEfl8/uDBg9XU1IqLi1kiHDt2bGVlJSJWVlbq6uqK0+cb1q5dK37LzJo1a7hc7rlz59jk9OnTVVRUCgoKWK30p59+YvOTk5MBoGfPnmyfFAqFNjY2NjY2bCnLrKzJTigUTpw4UV5e/sGDB/jyYCru4Fi6dKm2tnZVVRUbwSEun+3Px44dQ8Qvv/xSSUlJ3GbY9JpG8/PzAWDr1q0JCQkAwFoRX7x40axZsw0bNty8ebP2AeGff/5RUFDgcrkDBgzYsWMHO9C90/nz5wFAfPzBWk2jfD5fS0tryJAh7Ey6rKxMR0eH9QdZWFgYGRmJm1trJ8Jnz56xCknt8tkhqKqqSrwVtgeKU9epU6fEbwoRaydCsZKSkk6dOonP7O3s7IyNjcvKythSW1tbVv+5evUqh8OZNm0aWw0/j6ZRyTVQsMO6kZHRF198oaamFh4eDgD79++vqqoaOXKkoaGheJ3MzMy0tDRnZ+dbt24lJSXdv3+/e/fuLFPW1NSIW1pqk5eXl5OTe2OgSm5ublFREQDgh0bG7tq1q2/fvt7e3m8vOnPmjFAo7NevX1JSUlJSUqtWrZo3by5upXRxcXlnm4aDg4OhoWFYWBibDA8Pd3BwYC2urBehtLT0zp07SkpK4nddR0uWLElNTd2zZw/7HE6fPm1nZ5eTk5OUlHT9+nVLS8sbN24UFxcfOHCgrKxs1apV729y+cyxz6empkYoFJ45c8bBweHx48dJSUnJyck9e/ZMSEhgQ2n09fVtbW3ffrmCgoKnp2dERATbMQ4cOFBTU+Ph4QEALVq0UFRU5PP5jx49ysvLa9W
q1cOHD+se2KZNm2JiYnbv3s0aVJWVlVVUVKqqqp48eXL37t02bdrUpbSUlJS8vLxZs2YpKysDgKKi4rx584qLixMTE9kKLHmwz8HExCQzM/M9pY0ZM0b89+nTpzt27Ni8eXO203bt2rWsrIw1vAPA+PHj2R9mZmZcLnfo0KFsn+RwOBYWFmwrz549S0lJmTx5MmuF43A43333XXV1dWxsLAB4eXnJycmxcxcAiIiIGDNmjLy8fGxsLIfD6du3L9tumzZt5OTkzp8/z+fzIyIixo0bJ24wbHrEx7devXqJf/sHDhzg8/mjRo0yNDRkpw5s5TFjxty+fXvRokWPHz/+8ssvjY2N/23MZGRkpJycXEBAwNuL0tLS8vLyOnbsuHfv3oiIiMOHD7dv3/7KlStsaY8ePWq3oIppaWk5OzvXPjQZGxuzAfby8vJCofDOnTtnz55lYw/ZSWEdffPNN0VFRdu3b+dwONXV1ZcvX+7atevhw4cjIiIiIiLYwA6BQLBv3z5EXL58eeONW/wEkusjfPToEQAYGRkpKSm5uLiEhoYGBASEhYX1799fT09PKBQqKSmxddj/O3fuZKddDDsi6OjoPHv2TCAQvHGJQnZ2dlVVFeuYYQQCgY+Pj46OzvDhw1etWjVo0CAHB4d/iy09Pf2NEc+1w+ZwOLNnzxbPYR0w73+zHA5n9OjRmzdv/uOPPxAxJibm119/ZYuOHj363XffJScnKykpsTdV94GRcXFx69atCw4O7ty5MwC8ePHi+fPn58+fHz16tHidDh065Ofnp6enczgcMzOzOpb8eWKtCHp6erm5uRUVFSdPnrx8+bJ4qaGhITsNfw9vb+8//vjj1KlTgwcPDg8PHzJkCDvo379/f9asWTExMRwOp2XLlsXFxVVVVXWM6tatW99+++3ixYv79u3L5uTn58+cOXPv3r1VVVWtWrUqKSnR1tb+YDnssMgauhkjIyM2v1u3bm+sLC8vX/cIHz169PTp0zf2itqDqBkul/tGXx07FMLL3yA7PWVqH8rFB9P58+cnJyffvn2bDfl+9OgRl8v9+uuvxa8yMDAQCATs66vde930iBMhh8Px8PAICQnZsGFDWFiYra0t6+HW0tJinyrTsWPHH3744Ycffjhw4ICvr+8333xz7ty5t4t98uRJmzZtVFVV317ETllOnjxZ+1IKHR2dD4bq7e3t6+t769YtExOTyMhI8fcVHR09c+bM7OxsPT09NqfuY+AjIyN37tx58OBBdhDOycmpqqpKTk7Ozs4Wr2NpaVlYWPjkyRMNDY3WrVvXsWTJkFwiTE9PV1FR0dLSAgBPT8/du3efOnXq1KlTQUFBAMDlcvX19dmOwg5V69ev9/LyeqOQPn36hISEJCYmskExYhcvXgQA8YEJANjQZFZhSkxM9PHxSU5OFvdOv0FdXf3fxhZraGgg4vXr19+4yqeiouL979fb2/vnn3+OiYkRCASsDxIA7t+/7+bmNnbs2CNHjrRp0+bu3buss6QuioqKxo0b5+rqysZ6AICqqqqysvLXX3+9atWqN1Zu1aoVIvL5fFbb+I9iTTr9+vXT0NDgcrkzZ85cvHjxR5Vgb2/PTs979+594sQJ8RD2YcOGtWzZ8vr162ZmZhwOhx2q6oLP5/v4+JiZmS1cuFA8c9y4cXfu3ImNje3du7ecnNx7TrlqY/t5cXGxeA5rwKj/MUJTU7N9+/asGb+2P//8s+4liONhWCOwODYfHx9fX9+0tLTw8PAOHTqwqp6mpqacnFxaWhob/yVWUlICAPUcvv+Ze/Tokby8PGv18fDw+OWXXyIjI0+cOLFmzRq2gpGR0Tvbftzc3Nzc3Pbt2/fOYrW1tdmnJyZu32IpZ9WqVe7u7h8Vqru7u6qqanh4uKOjY1ZWFmtLyM3N5fF43t7eGzZsUFVVLSkpYa0ddfHkyRM/P79vvvlmxIgRbI6WlpacnNykSZPevhRbW1u7rKxMKBSKT8I+2GInAZJ
rGs3IyBCfYA4ePLhFixb/+9//hEKh+FsU7ygdO3ZUVVU9dOjQ24WMGTNGXV39u+++q/2jKi4uXrZsmaGh4bBhw9icc+fO/frrr0uWLOnfv7+8vPzu3buLi4v9/Pz+LTYTE5Nz586JT39qX+bFun8PHz78se+3e/fuZmZmYWFhtSsily5d4vP5CxcuZJ2Xb+ziLGn92/Fi6tSpNTU1tY9lXC7XzMzsyJEj7Cy+NvGISjZZU1PzUVfgfg6Sk5PXrl07ZMiQbt26KSkpdenS5RO+BQ6HM2bMmMjISNaYzH6oWVlZ9+7dmzx5srm5OYfD4fP5tSvl7Fvg8/nvLHDhwoX379/fvXu3uIkeEc+dOzdmzBhbW1s5OTlEfOP6KmVl5Xd+p2ZmZnJycrXfVHR0NIfDYbtcfXTv3j0hIYF1l34aAwODVq1avREbAPTo0YNNuru7q6iohIWFRUREjB07ljVzWVhYCASCtxNwixYt2rZtK94b4fWfWNPw6NGjdu3aycvLA0CfPn3at28/Y8aMqqoqdgYMryfC2od+RLx///6/nYqZmZkVFRWJ+2Ly8/PFjSJdunRp0aJFaGjox4aqoqIyYsSI0NDQ8PBwa2vrLl26AMDNmzf5fP6ECRNY7fPp06dvvAQA3nnhoFAo/PLLL3V0dH7++WfxTCUlJXNz8717977deGZmZsbn89k4OAAoLi4WX54oRRKtEbKWHwBo1qzZ8OHDQ0NDHRwcxNVwQ0ND1gOhrKy8cOHCRYsWtWzZcujQoWVlZbGxsXZ2dr6+vq1atfr7779Hjx5tY2Pj5+enq6ubkZGxefPmZ8+excTEsGNTUVGRr69vz5492WgrAOjYsWNQUNDEiRO3b9/+1VdfvR3b7NmzXVxcPD09J06cmJ2d/dNPP7EdGgAGDRrUr1+/KVOmZGdnm5qaPnny5ODBgxs3bqxL25e3tzcbOPDXX3+xOSYmJhwOJzAwkN0QYOXKlbXX7969u5KS0k8//eTt7d2iRYvaDZs7duwIDQ0dN25cZGQkm2NgYDB48OCAgICRI0e6u7uPHz+ey+VeunRJSUlp+fLlQ4YMMTU1nTJlSn5+vq6u7l9//ZWamlr3Uzxp2bFjx+nTpwUCwe3bt/ft22dsbCy+99KyZcvGjBkzevRob29voVAYHx+vqanJxii+n4+Pz+rVq5cuXerq6sp+5Nra2pqamjt37jQxMcnOzl6/fn3tapmWlpahoeEff/yhqalZUVExYcIE8aJTp06tX79+2LBhsbGxbF9t3br1qFGjTExM2ADR8vLy4ODgmzdv1r4IoVevXmFhYZaWlsXFxd988414vo6OzpQpU4KCghQUFOzs7K5cubJy5cqvvvrKyMgoJyenPh/j3Llzd+/e/cUXX8yfP79ly5apqakpKSm7du2qewny8vKLFi2aP3/+lClThg8fnpaWtnz5ckdHx/79+7MVVFRUXF1dg4KCCgoKxN2T7u7ulpaWvr6+AQEBxsbGGRkZUVFRISEhrVu3nj179rx586ZOneri4nLz5k1xPanJqH18Y62jgYGBffv2Zdd9AYChoSE7+r948aJ79+6urq7W1tZcLvfAgQOXLl16+zYjjI+Pz7p16zw8PObMmaOsrPzHH3+IFzVv3nzFihX+/v7V1dXOzs5lZWUnTpzw8PB4zxl/7WL/+eefJ0+eLF++nM3p1q2biorK8uXLv/7667S0tODgYNZxI17aunXrpUuX3rlzh3314kWBgYGxsbHjxo0TX5/WuXNnd3f3n3/+efjw4f379/fy8mI/2LZt2/7666+enp6rV6/29vaeM2dOy5Yt//zzz3876ZQoiQ3LcXV1FQ9VQsQjR45YWVmJx2oj4tatW9k4QCY4OLhXr15aWlqmpqaTJk26du2aeNG1a9e8vLzatm3bvHlzfX39CRMmpKWliZeuWLHCxsZGaqrq0gAAA0FJREFUPD5YbOrUqf379/+3C1fZgUxPT69Xr15RUVE+Pj7icYM
vXryYM2dO586ddXR0rKysli1b9uzZs8rKSisrq7///ltcQmhoaI8ePWoPAEtPT7eysurduzcbSsps3rzZwMBAWVnZ3t4+KSlp8ODBMTEx4qWRkZHW1tadO3desmRJdnZ2jx49jh49iogBAQFWr5s9ezZ7ydGjRwcOHKijo9OpU6dRo0axQXqI+PTp00mTJnXt2pWdnIaGhjo7O7/zvX8OTp48KX5rNjY2I0eO3LRpExtOKRYVFdWvXz9tbe3OnTuPHj369OnTiLh48WI3NzfxOoWFhT169GAd8mJubm5WVlZsfD9z7tw5S0tLZWXlLl267N69e9GiRT/++KN46ZUrVxwdHdlVDUKhsE+fPr///jsibt68+Y1vgY3kTE1NdXBwUFZWNjQ0/PXXXzdt2jRlyhRxaQ8ePHB1de3QoYOzs3NRUZGnp+fChQvZourq6rVr1/bo0UNDQ8PCwmLFihVs5N6NGzesrKzi4+PFhUydOnXixInv/Oh27txpZWUlHp7H3Lx508PDo02bNvr6+s7Ozn///XdNTc3+/futrKxyc3PFq9nY2GzatEk8uXLlykGDBoknt27damNj07p1axMTk/nz54tHxjInTpywsrKq/eEjYkFBwfTp0zt27Kirq2tjY7Ny5crCwkK26Pfff7ezs9PT0xswYEBCQoKdnR27AOa/KD4+fsSIEbUPO/PmzRNfXoKIV69e5fF4YWFh4jkxMTE8Ho/P5xcVFa1evbpv375aWlqqqqo9e/YMDg4WD6F8W05Ozrx58wYOHDhgwICtW7eGh4eL9x9EjIqKGjlypJWV1dChQ3/88Uc2gHzevHlr164Vr3Pz5s0RI0bcvn1bPEcgEHh7e/N4PLY+c+bMGScnJ2NjYzc3t9TU1O+//z4qKkq89MqVK56env369Vu5cmVBQcGIESNOnTqFiBs2bOC9TrzphIQEHx8fa2trJyenefPmsSsFEDEvL2/BggWOjo4ODg6///77wYMH58yZU/cPvzFI9F6jhBBCyOemCd7fgRBCCKk7SoSEEEJkGiVCQgghMo0SISGEEJn2f9LEyEI8lnPbAAAAtXpUWHRyZGtpdFBLTCByZGtpdCAyMDIyLjA5LjEAAHice79v7T0GIOBlgABGIOYCYk4gbmBkY0gAiTFzMGgAaWYmdgjNzMaQAaahCpgZOSAKGeESMJUwnTCTuIE2MDIpMDFrMDGzMLCwMrCwMbCxazCxcyhwcDKIMIg3QR0BBlwP3ZYdmDVTch+IE5RVCWLbg9gP3dTg4kA1+2HiIAATBwFj48v7oWrskfQ6IJnpgBBfBhcXAwC5pyuKulnfWQAAAQp6VFh0TU9MIHJka2l0IDIwMjIuMDkuMQAAeJyNUkFuxCAMvPOK+UCQDQvBhx42yXZVVUukNu0feu//VaNV1s1ho0As2WQY22Mc2vqY3n9+8Vhhcg6gnU9E8B2JyN3QHAyX61vFuJyH9WScv+ryCSaIXtG9hZ6X+baeMEZ0ybNkTgEdeU5KrVc80d1ZkQEzuuiL9Mwn/Z/pCTCiogs+SaE+71KeWnL2QYRi2aNMCnyc7jFmLdKA7PMzYK+MRzIXbeZQL6KZj6ijQxlxSPBLnTajug9vmOtkw2s72IQ0QLQ5aNA0fr2+BJOc1ZIJy2rZ5GthbyIFtbJlKIopEOubm22q/l9ji9e3qb77A9oDjei/gQhpAAAAcXpUWHRTTUlMRVMgcmRraXQgMjAyMi4wOS4xAAB4nF2NQQrAIAwEv9Kjgi6JrdZQPOWuH/LxDZQW7CVMdkhWR2+qbnhtfeg2XcxgKZxDJNi84o4qJ3MgFNsSslSqn2UkEToeS29swCgGi/7dLo/XVj9vdX8e8lB+vk4AAACIelRYdHJka2l0UEtMMSByZGtpdCAyMDIyLjA5LjEAAHice79v7T0GIOBlgABGIGYDYlYgbmBkY0gA0syMHGCaEcjPAPFBAhogBhOGADfQBCZmDiYmBgYWVg4mFgYGEQb
xIKjBYMDGgAEO2IPIh27L9jMwOOyHsNUOANlqUHF7JHEHILUMxBYDAK2aEwQhr270AAAAznpUWHRNT0wxIHJka2l0IDIwMjIuMDkuMQAAeJyNUVEKwyAM/fcUuYAS02rrZ1vLGKMKW7c77H/3Z3HD2sJalhiI4fmSFwUku/rL8wWLkRcCAA+Ocw4eFSKKCVIC/Xg6Bxjmrs+VId7DfAMLhl+wb5HdHKdc0TAAKvzYryTjCGKpamX2cBXzSa3IOaxakKgas4OsmVGSMq7Fxh61Nkz5F6NlxoVQHjCOwW+W8F1LH4Mva0lORT2xsrqI1CmKEuIhbZlXp1g3W1One/4rzsUbuBthm1EqyJUAAABYelRYdFNNSUxFUzEgcmRraXQgMjAyMi4wOS4xAAB4nPO3dXbW8Nd0tvVXqNHQNdIztbQwsNAx0LHWNdQzsrQ0MNHRNdAzN9WxNgAJGugY6gHZaFIIXZo1AK59EJiYGOg0AAABFHpUWHRyZGtpdFBLTDIgcmRraXQgMjAyMi4wOS4xAAB4nHu/b+09BiDgZYAARiDmg/IbGNkYEoA0MyMHgwKQZuJTAFEsbAoGIFFMmiEDRDMzsjNogBkcEJqJaAXcjEwMTMwMTCwMTKwgp7CxazCxcyhwcDKwMTBwcWswcfMo8PAycDEwiDCIb4I6Fwz4/v1j3P/GVcsexNGd9G1/Xs9kMNt0i/OBB/w6diD29pjGAxYRy8Div9exHGhYUbYfxF7J33bgrUAgmF21Vt4+U/cyWM22LUwOHw5PAbOvyHs7GKZzOoDYU79McnidcgUsniGRtO/lQj+w3oX+M+1zdy0Fs4V59tnriBseALETz1s6vPAOALPFAP+AQEuIjOWqAAABm3pUWHRNT0wyIHJka2l0IDIwMjIuMDkuMQAAeJx9U8tu3DAMvPsr+AMr8CVRPOSwjzQI0niBZpt/yL3/j5KbbuVFhMg+SPSQImfGC+T6dXr5+AP/F5+WBQC/ed0d3gURl1fIDRwen55XOF72h1vkeP69Xt6AFEgiJ5577P5yfr1FCI6ww1LRzA2wtFaJPTZ4XSOV4Qw7Ks5NpAEVqmwsE6DA2wfspGC1bhoAah3vSsL7/ueD3vCaHWhB5dp6FGb32mxSuCaQC5pQx2zam1Sc9do+S1J3JU5kpxaDTZAWyBiaxZTi7qbep0N1WIGzR46rY/jGHNGvOA+WpJBJpx4JpM2n9ShUAi01vtZks1WXaUH6JxAzmrfrLGhU+wzK0WT05mYqqRW7WuUZUqJNKtr1iuRizVBmTFKqw8VRuHOqylXDABPk43q6M9an1Q7n9TSsRuEiGYai8IoO2+SxDlfksQ3tM5eGwBzSdTj+eHrgoSVFyIdkFHLQRhpKyEYBTnaDtbsqGcogyYZXSspIN/zRFbgdfjtqnm9/ZOyXv7rRvFPuiJiBAAAA1npUWHRTTUlMRVMyIHJka2l0IDIwMjIuMDkuMQAAeJwljktqBDEMBa+SZTe4hf6yMLPyfnoxy5AD5A5z+EgTMNg8Patq38/H3sd+PO993t+v359jn3X21/tQsGTxQeCWImMJUMjkwUDqyWMxoDLPapAzVwPBWUKp/2hWdV0IhhEZo2ZuxPnJmDHSR70mBtkcq3ZkhI6LgFPDOtGpnTCEB2rzEqV4lwCbegGrneyirWAcDRRAi9k88okfnranxejVWfeqlRhCswXSxfC/RTPLvaXIS3ac7z8I1Drprs5uMQAAAABJRU5ErkJggg==", + "text/plain": [ + "" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles = 'CON=CC(O)C=NOC'\n", + "ShowMols(\n", + " [\n", + " der := Chem.MolFromSmiles(smiles),\n", + " 
 orig := remove_derivatization_groups(mol=der),\n", + " add_derivatization_groups(mol=orig)\n", + " ],\n", + " legends=['MeOX derivatized','Derivatization removed','TMS derivatized'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Read the input file\n", + "\n", + "The file is parsed line by line, errors are reported and otherwise ignored. \n", + "\n", + "The result is `mols`, a list of pairs (_original SMILES_, _RDKit molecule_)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "#smi_file='NIST_Si_100.txt'\n", + "#smi_file='NIST_Si_all.txt'\n", + "#smi_file='NIST_SMILES.txt'\n", + "smi_file='NIST_195_200.txt'\n", + "with open(smi_file) as f:\n", + " mols = list(filter(lambda p: p[1], [ (smi.rstrip(), Chem.MolFromSmiles(smi)) for smi in f ]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Essential statistics\n", + "\n", + "Count occurrences of (one-),di-,tri-methylsilane, TMS attached to -O, -N, -S, and methoxime. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# total 4\n", + "# with SiMe: 1\n", + "# with SiMe2: 1\n", + "# with SiMe3: 1\n", + "# with ONSSi: 1\n", + "# with MeOX: 3\n" + ] + } + ], + "source": [ + "SiMe1=Chem.MolFromSmarts('[Si][CH3]')\n", + "SiMe2=Chem.MolFromSmarts('[Si]([CH3])[CH3]')\n", + "SiMe3=Chem.MolFromSmarts('[Si]([CH3])([CH3])[CH3]')\n", + "ONSSi=Chem.MolFromSmarts('[O,N,S][Si]([CH3])([CH3])[CH3]')\n", + "\n", + "print('# total',len(mols))\n", + "with_sime1 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe1),mols))\n", + "print(\"# with SiMe:\", len(with_sime1))\n", + "with_sime2 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe2),mols))\n", + "print(\"# with SiMe2:\", len(with_sime2))\n", + "with_sime3 = list(filter(lambda m: m[1].HasSubstructMatch(SiMe3),mols))\n", + "print(\"# with SiMe3:\", len(with_sime3))\n", + "with_onssi = list(filter(lambda m: m[1].HasSubstructMatch(ONSSi),mols))\n", + "print(\"# with ONSSi:\", len(with_onssi))\n", + "\n", + "MeOX=Chem.MolFromSmarts('C=NO[CH3]')\n", + "with_meox = list(filter(lambda m: m[1].HasSubstructMatch(MeOX),mols))\n", + "print(\"# with MeOX:\", len(with_meox))\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Inspect whatever from the sorted categories" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with_sime2[0][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "[08:23:48] Molecule does not have explicit Hs. Consider calling AddHs()\n", + "[08:23:48] Molecule does not have explicit Hs. Consider calling AddHs()\n" + ] + }, + { + "data": { + "application/3dmoljs_load.v0": "
\n

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n jupyter labextension install jupyterlab_3dmol

\n
\n", + "text/html": [ + "
\n", + "

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n", + " jupyter labextension install jupyterlab_3dmol

\n", + "
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "draw3d(with_sime2[0][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[08:23:49] Molecule does not have explicit Hs. Consider calling AddHs()\n", + "[08:23:49] Molecule does not have explicit Hs. Consider calling AddHs()\n" + ] + }, + { + "data": { + "application/3dmoljs_load.v0": "
\n

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n jupyter labextension install jupyterlab_3dmol

\n
\n", + "text/html": [ + "
\n", + "

You appear to be running in JupyterLab (or JavaScript failed to load for some other reason). You need to install the 3dmol extension:
\n", + " jupyter labextension install jupyterlab_3dmol

\n", + "
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "draw3d(with_onssi[0][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAASLklEQVR4nO3de1TUZf4H8PcMtxAYIRADxUIgxONdVi2N44U0Ny1LcV3XTFMH199G+zun01Dtb/Wsnt1hyz3UabczbKnVbltgauS6bpibmWkBZaumCZgiN+M+MILcnt8fM0IgTMDA92G+836d/kjnkflwsjef5/J9RiOEABER9ZdWdgFERM6NMUpE5BDGKBGRQxijREQOYYwSETnEXXYBRDTAzpw5o9VqIyMjvby8ZNfiEtiNEqnNs88+O2HChMOHD8suxFUwRonUJi8vD0BkZKTsQlyFhsfvidSktbV12LBhLS0t9fX13t7esstxCexGiVTlypUrTU1No0ePZoYqhjFKpCr5+fngjF5ZjFEiVWGMKo8xSqQqjFHlMUaJVIUxqjzGKJGqMEaVxwNPROrR1tbm4+Nz48aNuro6Hx8f2eW4CnajROpx9erVxsbG0NBQZqiSGKNE6sHnl6RgjBKph3VhNCoqSnYhroUxSqQeBQUFACIiImQX4loYo0TqwW16KRijROrBtVEpeOCJSCWEED4+Pg0NDWaz2c/PT3Y5LoTdKJFKFBUVNTQ0hISEMEMVxhglUgkujMrCGCVSCcaoLIxRIpWwxihPOymPMUqkEjx7LwtjlEglOKmXhQeeiNRACOHn52exWGpqaoYPHy67HNfCbpRIDUpLSy0WS3BwMDNUeYxRIjXgjF4ixiiRGlgfA+X+khSMUSI14N1OEjFGlVNZWblz5862tjbZhZAKcVIvEWNUOY899tjTTz+9dOnS6upq2bWQ2vBuJ4kYo8p55plnRo4ceejQoSlTpmRnZ8suh1TFOqlnjErBGFXO3Llzc3JyZs6cWVhYGBcX9/rrr8uuiFSirKysrq4uKCgoICBAdi2uiDGqqNGjRx87dkyv1zc2Nm7cuDExMbGpqUl2UeT0uDAqF2NUaV5eXiaT6Y033vD29k5LS7v33nuvXLkiuyhyboxRuRijcqxdu/bEiRPh4eG5ubmxsbFHjhyRXZE0zc2orkZ1NXrqy62vtrYqW5ZT4d1OcjFGpZk6dWp2dvbChQsrKioeeOCBlJQU17zf4OhR3H47br8dq1d382prq+3Vb79VvDLnwbud5GKMyhQYGHj48GGj0SiESE5OXrZsWW1treyipHnvPXzwgewinBMn9XIxRiXTaDQGg+H999/39/fPzMycMWPGuXPnZBclgbs7ACQl4fp12aU4IcaoXIzRIWHJkiXZ2dkTJ068ePHirFmzMjIyZFektPBwzJ+Py5exfbvsUpxNeXl5bW2tv79/YGCg7FpcFGN0qIiMjPz888/XrVtXX1+/cuXKxMTE5uZm2UUpymiEVoudO+GS7Xj/cWFUOsboEOLt7b17926TyeTp6ZmWlhYfH19WVia7KOX85CdYuxbNzdi8GS652dZPfAxUOsbokKPX6z/66KOQkJBPPvkkNjb25MmTsitSzh/+AJ0On36KXbtkl+I8rI+BshuViDE6FM2ZMyc3N3fOnDnFxcULFiwo3b1bdkUKueMObNsGAM88g/JyycU4Cx4alY4xOkSFhIQcPXo0KSnp/2bMCHniCaxZ4yJ72E8+iUmTUFUFg0F2KU6C2/TSMUaHLg8Pj5deeunZxET4+ODvf8fs2bh0SXZRg87dH
X/5CzQa7NmDL76QXY0z4BaTdIzRIe/nP0dODmJicPo0pk7FgQOyCxoY336LY8e6f2n2bKxbByHw1FPca/oRlZWVVVVVOp1uxIgRsmtxXYxRZzBuHD7/HMuXw2zGo48iORlOe4V+UxMyMnD//YiJwcaNPabkCy8gKAinTuHdd5Wtz9mwFR0KGKNOws8PGRm2o5UpKVi6FM52hX5+PgwGhIVh5UocOQIfH8yfD4ul+8GBgdixAwCef17JGp0PF0aHAsao89BoYDDgyBEEB+PQIUyZAme4Qr+1FUeOYOVKjBuHP/4R33+PmBgYjSgshMkEX98e/+CmTZg1C10uEbx2jdP8ThijQwFj1NnMnYucHMycicJCxMUN5QOWxcVISUF4OO6/HxkZ8PBAQgKysvDNNzAY8KPXtGu1eOUVuLl1/E51NWbPxrJlcOH7W7pijA4FjFEnFBaGY8ewaRMaG7FhAxITe7yqU4a2Nlv7eeedSE7G1asYNw5GI4qKkJ6O+Pg+fKnp05GY2PHLb75BRQUyM3HPPbhwYcALd0qM0aFA45p3XKrEm29i82Y0NCA2Fnv34s475ZZTUoK33sKrr9pm4l5eeOgh6PVYsAAaTY9/qqwMx4/D1xeLF3fzam0tPvwQABYtgk6HvDwsX44zZ+Dri127kJAwON+J8xgxYkRFRUVpaekdd9whuxbXxRh1cl9+iRUr8N13CArCP/7Rt2ZvoLS2ZmfV/O7Pgf/6l+2O+pgYJCZi7dofn7n3Q0MDtmzBnj3QaPDkk3jxRXh4DPy7OIWampqAgABfX1+z2ayx85OKBhkn9U5u2jRkZ2PhQlRU4IEHkJKi6BZMaSlSUhAZOfJ3/3PwINzcbKuf587hqacGJUMBeHtj926YTHB3x8svIz4ernR/SyftM3pmqFyMUecXGIhDh7B1K4RAcrISWzBtbTh8GI88gjFjkJyMy5fDzOde/lNLcbFt9VOB/6n1ehw9ipAQfPIJYmPhSve3dLDeS+v2w204koExqgpubti2De+/D39/ZGZixozBurPz2jWkpCAqCosX48ABaLXW/lNz5r9P/q97UNCgvGdP5szB6dOYPx/FxYiLQ0qKou8+FOTk5ADw9PSUXYir49qouly8iEcfxblz8PNDdjaiozu9WlSEjz9GSQkaGhAYiGnTMGOG7eM77BMCH32EtDQcOADrZdKjR2PDBmzZguDgQflGeq2lBb/5jS1Df/ELpKVh2DC5FSlk7969a9eubWhoePzxx/fs2SO7HNcmSGWuXxfr1omHHxZtbR2/eeGCePBBAXT9JzRUvPaava9WVSVMJjF+vG28m5uIjxfp6aKlZbC/jz55+23h4yMAMWWKKCiQXc1gamxsTE9Pv/PmqQwPD49Tp07JLsrVMUZVqrGx499PnRIBAQIQ/v7il78Ur70m/vY38fvfi2nTbOH4q1918xVycoReL7y9bWNGjRIGg7hyRbHvoIvnnhMnTtgbcP68iIkRgNDpxP79SpWloLNnzyYlJfn7+1sDVKPRTJ48+dChQ7LrIsao6lVXi7AwAYg5c0R5eaeX2trEiy8KjUYAYvfujvEmk5gwwZaeWq2t/WxuVrz0DgcOCEB4eopXXrE3zGwWy5cLQGg0wmAQra1K1TeYrO1nfHx8+3b89OnTX3311fIu/zVJHsao2m3fLgAxYkTXDG2n19uazaYmIYQwGGwBGhYmtm0TRUVKFtuT5mZhMNgCf/VqUV/f48i2NmE0Cjc3AYif/lRUVSlY5UC7ePGiwWAIurlzp9Pp9Hr9V199Jbsu6ooxqnYREQIQzz/f44ArV4RWKwDxz38KIURBgXjwQZGZOdRWP4UQ+/cLnU4AYvJkkZ9vb+R//iOCgwUgxowRX3yhVH0D5MaNG7e2nyaTqa6uTnZp1D3GqKqVlNhay+PH7Q2bNEkA4rnnlCqr/y5csG136XRi3z57IwsLxcyZAhC33SZef12p+hyTl5dnMBiCbx5+8PPz0+v1u
bm5suuiH8EYVbXjx20x+v339oatWiUA8bOfKVWWQ8xmsWJFrxZAGxrEE0/YRv72twdu3LihYJl90NLSkpWVtWTJkvb2c/z48UajscqplyRcCWNU1Q4etMWo/QTZvFkAYtEipcpyVFubSE0V7u4CEIsXi8pKe4PT0sSCBacA3HPPPUVDY6m3XVFRkdFoDAsLs6bnbbfdlpCQkJWVJbsu6hvGqKr9+9+2GL1+3d6wDRsEIJYuVaqsgfHxx2LkSNtmmP0F0Nzc3PDwcABBQUFDIaRaW1uzsrISEhLan+McN26c0WistP8DgYYqxqiqZWfbYtT+ec+HHhKAePxxhaoaOFevilmzbAugf/2rvZEVFRULFy4E4ObmZjQa2374bIKCiouLjUbjmDFjrOnp5eXF9lMFGKOqVl9vm/pmZtobZj1YunOnUmUNpMZGkZRk+2Hx2GP22u6WlpatW7dqtVoADz/8cE1NjWJFtref7jcfvY2OjjYajTz7qQ6MUbWbPftHOs2TJ20hdPq0clUNtDffFMOGCUBMny6++87eyA8++MD6IFB0dPTZs2cHu7CSkhKj0XjXXXd1aT9ltcM0GBijavfWWwIQHh7dp2RLi5g7VwDi3nsVr2yAffmlCA8XgAgKEh9+aG/kxYsXJ06cCMDX1zc9PX0wimlvPz1u3ikdFRVlNBq/t39kgpwTY1TtWltFXJwAREiIOHas00uVlWLlSlvIquJwYkWFWLRIAOK++17bunWrnY7v+vXr69atsz6ZnpSU1GR9gmsglJaWGo1G644WAE9PT7afqscYdQFlZWLqVNv5ydhYsXmz+PWvxSOPCD8/AQgvLzE4HZkULS3ihRfOWw9gLl++3Gw22xlsMpms3WJcXFxpaakj73tr+xkZGWk0Gq9du+bIlyWnwBh1DRaL2LFDhIZ2uiXPy0ssWyYGf31QeQcPHgwICABw9913nzlzxs7I48ePh4SEABg1atRnn33Wj/eqqqpKTU2NiIiwpqebm1t8fHx6enrL0HualgYJY9TFnD8vjhwRmZni5El7N3w4v7y8vEmTJlkXQN999107I69duzZv3jzr/k9qamrv3yInJ0ev13t7e1sDdNSoUQaDobCw0OHayckwRkm1Ghoa1q9fb804vV5vZwG0ubnZYDBYR65Zs8Zisdj5stXV1SaTacKECdbxWq2W7aeLY4ySyplMJuunFd133332F0DffvttHx8fAFOmTCno7g59a/s57OanlISGhhoMhsuXLw9a7eQcGKOkfp9++mloaKh13n3C7h3658+fj4mJsV7uuf/mHfo1NTUmk8m6RPDD9rNZ6lXWNHTwI+3IJZSXl69atero0aPu7u47duxon8Lfymw2r1+/ft++fRqNZs2aNQAyMjIaGxsBjBo1asOGDRs3bmy/TIQI4Efakcv44QLo6tWr63veYWtra9u+fbtWq7XePN/efg7g8VJSE3aj5FreeeedTZs21dfXT548+b333ms/qHSrhISEvXv3zps3b9euXe1PcxLdSiu7ACJFrVq1KicnZ/z48V9//fW0adP279/f08iWlhYAW7ZsYYaSfYxRcjnR0dEnT55ctmyZ2WxesWJFSkpKt8Py8vIAREZGKlsdOR/GKLkinU63b9++1NRUNze3boNSCHHp0iUAdmb9RFZcGyWXdunSpbFjx976+1evXh0zZkxISEhJSYnyVZFzYTdKLq3bDAWQn58PzuipdxijRN1gjFLvMUaJumGNUS6MUm8wRom6YY3RqKgo2YWQE2CMEnWDk3rqPe7UE3UlhPDz87NYLDU1NcOHD5ddDg117EaJuiotLbVYLMHBwcxQ6g3GKFFXnNFTnzBGibqyPgbK/SXqJcYoUVcFBQXgaSfqNcYoUVec1FOfMEaJuuLdTtQnPPBE1JVOp6urq6uqqrJ+2D2RfexGiTopKyurq6sLCgpihlIvMUaJOuHCKPUVY5SoE8Yo9RVjlKgT3u1EfcUYJeqEdztRX
zFGiTrhpJ76igeeiDrx9/evra2tqKgIDAyUXQs5B3ajRB3Ky8tra2v9/f2ZodR7jFGiDlwYpX5gjBJ14GOg1A+MUaIO1rud2I1SnzBGiTrw0Cj1A2OUqANPO1E/MEaJOnCLifqB50aJbCorK4OCgnQ6XW1trexayJmwGyWyYStK/cMYJbLhwij1D2OUyIYxSv3DGCWyYYxS/zBGiWwYo9Q/jFEiG8Yo9Q8PPBEBQE1NTUBAgK+vr9ls1mg0ssshZ8JulAj4QSvKDKW+YowSAbzbiRzAGCUCePaeHMAYJQJuXpHHu52oHxijRAC36ckBjFEigGuj5AAeeCJCXV2dTqcbNmxYfX09d+qpr9iNEnW0osxQ6gfGKBEXRskhjFEixig5hDFKxE+yI4cwRol49p4cwhgl4qSeHMIDT+TqLBaLn5+fl5eXxWLRatlYUJ/xLw25uvz8fCFEREQEM5T6h39vyNXx+SVyEGOUXB33l8hBXBslV1dfX5+Xlzd8+PCxY8fKroWcEmOUiMghnNQTETmEMUpE5BDGKBGRQxijREQOYYwSETnk/wFF9Q7vAtXCxQAAALJ6VFh0cmRraXRQS0wgcmRraXQgMjAyMi4wOS4xAAB4nHu/b+09BiDgZYAARiBmB2I2IG5gZGNIAIkxQ2gmJjYGDSDNzMIOoZk5IDQTqjpGZm6gOYxMDEzMGkzMLAosrAxAFSIM4nFQC8CAPXS9oMMD+0P2IE6ILYvDui4WWxB7xSkd+xx79v0g9r/pjPuV75+yA7GDH37aL5GksQ/M5nQ9UDHNCazXg/Wd3YK7DAdAbDEAACcflHqHbo8AAAELelRYdE1PTCByZGtpdCAyMDIyLjA5LjEAAHicfVFLTsQwDN33FL5AI38SO1mwmLbDCKFpJRi4A3vuL5yikhkpwkkkO362814GqPa2vH59w5/xMgwA+M8upcCnIOJwherAdL68rDDfTtNxM28f6+0dDNQrfD0iT7ftetwQzMCBTYUVKCSyZBEw4G6tknccqnKUmhaWVDo4cRwGtVjUYMSQOAtxBxhh3fOozMnzUpSlNznBBiOFXCwh1RLSGMU6SPXZowS0rKV2MkXC3AHa/sioMecIo9MiSdzreF6XB7V+9Zu2dWn61cVNJg9AmhoeONX5+fLEjTn5SY0f+dFGgrzE7l9wP6/Gx0+7P/wAUL9tx414EZAAAACGelRYdFNNSUxFUyByZGtpdCAyMDIyLjA5LjEAAHicFY05DsMwDAS/ktIGKII3KQiu2Ccf8uMjtTM72O6++n6+v/68l6BkqAKjc3rCEqQIMSAkFfUqWISRNiNhELqUshxmYVUGYwesrrCOpRDx3eoMUduMsWb6KTnMdB8MRcqKuVcZxFRwv3/C2h3rm7uNYgAAAABJRU5ErkJggg==", + "text/plain": [ + "" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with_meox[0][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "with open('NIST_ONSSiMe3.txt','w') as f:\n", + " for m in with_onssi:\n", + " f.write(m[0]+'\\n')\n", + " \n", + "with open('NIST_SiMe3.txt','w') as f:\n", + " for m in with_sime3:\n", + " f.write(m[0]+'\\n')\n", + " \n", + "with open('NIST_MeOX.txt','w') as f:\n", + " for m 
in with_meox:\n", + " f.write(m[0]+'\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Run the in-silico derivatization\n", + "\n", + "Iterate over the `mol[]` list (read from file above), remove derivatization groups from each entry, and try derivatization several times to leverage from the probabilistic behaviour). Assemble the results.\n", + "\n", + "This can be time consuming, expect about 5,000 entries per minute per core. Memory consumption can also grow to several GB." + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def process_one_mol(mol):\n", + " return (\n", + " mol[0],\n", + " Chem.MolToSmiles(remove_derivatization_groups(mol[1])),\n", + " { Chem.MolToSmiles(add_derivatization_groups(mol[1])) for _ in range(42) }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.99 ms, sys: 23.6 ms, total: 30.5 ms\n", + "Wall time: 45.4 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "with ProcessPoolExecutor(max_workers=cpus) as executor:\n", + " out = executor.map(process_one_mol, mols)\n", + " \n", + "out = list(out)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Write the main outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "with open('derivs_struct.tsv','w') as tsv:\n", + " tsv.write(\"orig\\tderiv. removed\\tderiv. 
added ...\\n\")\n", + " for orig,removed,added in out:\n", + " tsv.write(\"\\t\".join([orig,removed,*added]) + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "with open('derivs_flat.txt','w') as flat:\n", + " for orig,removed,added in out:\n", + " for one in { orig, removed, *added }:\n", + " flat.write(one + \"\\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "f8c929fcd037834ef32e0d432f6eb299eb751178a7a29a50d579d391b6611298" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/example/plotting.py b/example/plotting.py new file mode 100644 index 0000000..94c1884 --- /dev/null +++ b/example/plotting.py @@ -0,0 +1,22 @@ +from typing import Optional + +import py3Dmol +from rdkit.Chem import AllChem, Mol, MolToMolBlock +from rdkit.Chem.Draw.IPythonConsole import ShowMols + + +def draw3d(m: Mol, dimensions: tuple[int, int] = (500, 300), p: Optional[py3Dmol.view] = None): + AllChem.EmbedMultipleConfs(m, clearConfs=True, numConfs=50) + opt = AllChem.MMFFOptimizeMoleculeConfs(m) + conf = min(range(len(opt)), key=lambda x: opt[x][1] if opt[x][0] == 0 else float("inf")) + mb = MolToMolBlock(m, confId=conf) + + if p is None: + p = py3Dmol.view(width=dimensions[0], height=dimensions[1]) + + p.removeAllModels() + p.addModel(mb, 'sdf') + p.setStyle({'stick': {}}) + p.setBackgroundColor('0xeeeeee') + p.zoomTo() + return p.show() diff --git a/gc_meox_tms/__init__.py b/gc_meox_tms/__init__.py new file mode 100644 index 0000000..a1f549c --- /dev/null +++ 
b/gc_meox_tms/__init__.py
@@ -0,0 +1,9 @@
+from .derivatization import (add_derivatization_groups, is_derivatized,
+                             process_one_mol, remove_derivatization_groups)
+
+__all__ = [
+    "add_derivatization_groups",
+    "is_derivatized",
+    "process_one_mol",
+    "remove_derivatization_groups",
+]
diff --git a/gc_meox_tms/__main__.py b/gc_meox_tms/__main__.py
new file mode 100755
index 0000000..efad62c
--- /dev/null
+++ b/gc_meox_tms/__main__.py
@@ -0,0 +1,53 @@
+import argparse
+import sys
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+
+from .derivatization import process_one_mol
+from .utils import read_input_txt, write_flat, write_tab_separated
+
+
+def parse_arguments(argv):
+    """Parse command-line arguments (everything after the program name) into an argparse namespace."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-n', '--ncpu', type=int, action='store', help='# of cores to use', default=1)
+    parser.add_argument('-r', '--repeat', type=int, action='store',
+                        help='# of repeated attempts to derivatize (may return different results)', default=42)
+    parser.add_argument('-k', '--keep', action='store_true',
+                        help='keep input and stripped derivatization SMILES in output', default=False)
+    parser.add_argument('-f', '--flat', type=str, action='store', help='flat output file, one SMILES per line')
+    parser.add_argument('-t', '--tsv', type=str, action='store',
+                        help='structured output tsv file (original, stripped derivatization, added derivatizations')
+    parser.add_argument('infiles', nargs='+', type=str, action='store', help='input files')
+
+    return parser.parse_args(argv)
+
+
+def main(argv):
+    """
+    Run the command-line tool: read the input SMILES, derivatize them in worker processes, write requested outputs.
+
+    :param argv: List of command-line arguments without the program name.
+
+    :return: Process exit code, 0 on success.
+    """
+    args = parse_arguments(argv)
+    input_molecules = read_input_txt(args.infiles)
+
+    process_one_mol_with_repeats = partial(process_one_mol, repeats=args.repeat)
+    with ProcessPoolExecutor(max_workers=args.ncpu) as executor:
+        # executor.map() returns a one-shot iterator; materialize it so that both writers
+        # below see every result when -f and -t are given together.
+        data = list(executor.map(process_one_mol_with_repeats, input_molecules))
+
+    if args.flat:
+        write_flat(args.flat, data, args.keep)
+    if args.tsv:
+        write_tab_separated(args.tsv, data)
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(argv=sys.argv[1:]))
diff --git a/gc_meox_tms/derivatization.py b/gc_meox_tms/derivatization.py
new file mode 100755
index 0000000..d1c6ced
--- /dev/null
+++ b/gc_meox_tms/derivatization.py
@@ -0,0 +1,145 @@
+import random
+from copy import deepcopy
+from typing import Optional, Tuple
+
+from rdkit import Chem
+from rdkit.Chem import AllChem
+
+tms = '[Si]([CH3])([CH3])[CH3]'
+
+# XXX: ~[O,N,S] would match more than we aim to (-O, -S, -N, =N) but it's unlikely to happen
+tms_match = Chem.MolFromSmarts('*~[O,N,S]' + tms)
+tms_match0 = Chem.MolFromSmarts('[#0]([CH3])([CH3])[CH3]')
+
+meox_match_co = Chem.MolFromSmarts('C([C,c])([C,c])=NO[CH3]')
+meox_match_cho = Chem.MolFromSmarts('[CH]([C,c])=NO[CH3]')
+meox_match0 = Chem.MolFromSmarts('[#0]=NO[CH3]')
+co = Chem.MolFromSmiles('C=O')
+
+
+def is_derivatized(mol: Optional[Chem.Mol] = None, smiles: Optional[str] = None) -> bool:
+    """
+    Return whether a molecule is derivatized by searching for MeOX and TMS substructures within that molecule.
+
+    :param mol: RDKit molecule object
+    :param smiles: SMILES string
+
+    :return: True if derivatized, False otherwise
+    """
+    if mol is None:
+        mol = Chem.MolFromSmiles(smiles)
+    # Add explicit hydrogens on both input paths so that mol= and smiles= inputs are classified identically.
+    mol = Chem.AddHs(mol)
+    return (mol.HasSubstructMatch(tms_match) or
+            mol.HasSubstructMatch(meox_match_co) or
+            mol.HasSubstructMatch(meox_match_cho))
+
+
+def remove_derivatization_groups(mol: Optional[Chem.Mol] = None, smiles: Optional[str] = None) -> Chem.Mol:
+    """
+    If a molecule is derivatized, remove derivatization substructures and return the original underivatized molecule.
+
+    :param mol: RDKit molecule object
+    :param smiles: SMILES string
+
+    :return: RDKit molecule object in underivatized (original) form
+    """
+    if mol is None:
+        em = Chem.MolFromSmiles(smiles)
+    else:
+        em = deepcopy(mol)
+
+    # each match follows the tms_match atom order (neighbour, O/N/S, Si, ...); turn the Si into a dummy atom
+    matches = em.GetSubstructMatches(tms_match)
+    for ma in matches:
+        em.GetAtomWithIdx(ma[2]).SetAtomicNum(0)
+
+    # delete the dummy atom together with its three methyls; the O/N/S atom stays in the molecule
+    em = AllChem.DeleteSubstructs(em, tms_match0)
+
+    # the carbon of the C=N bond is the first atom of both MeOX patterns; mark it as a dummy atom as well
+    matches = em.GetSubstructMatches(meox_match_co)
+    for ma in matches:
+        em.GetAtomWithIdx(ma[0]).SetAtomicNum(0)
+    matches = em.GetSubstructMatches(meox_match_cho)
+    for ma in matches:
+        em.GetAtomWithIdx(ma[0]).SetAtomicNum(0)
+
+    # swap every dummy=N-O-CH3 fragment for a plain carbonyl
+    em = AllChem.ReplaceSubstructs(em, meox_match0, co, replaceAll=True)[0]
+    Chem.SanitizeMol(em)
+    return em
+
+
+# (match pattern, dummy atom #, probability)
+_subs = [
+    ('[OH]', [100], [.95]),
+    ('[SH]', [101], [.80]),
+    # matches also imine
+    ('[NH]', [102], [.50]),
+    ('[NH2]', [103, 102], [.25, .5]),
+    ('C([C,c])([C,c])=O', [104], [.90]),
+    ('[CH]=O', [104], [.90]),
+]
+
+# (dummy atom #, replacement)
+_repls = [
+    ('[#100]', 'O' + tms),
+    ('[#101]', 'S' + tms),
+    ('[#102]', 'N' + tms),
+    ('[#103]', f'N({tms}){tms}'),
+    ('[#104]=O', 'C=NO[CH3]'),
+]
+
+subs = [(Chem.MolFromSmarts(pat), repls, probs) for pat, repls, probs in _subs]
+repls = [(Chem.MolFromSmarts(pat), Chem.MolFromSmiles(repl)) for pat, repl in _repls]
+
+
+def add_derivatization_groups(mol: Optional[Chem.Mol] = None, smiles: Optional[str] = None) -> Chem.Mol:
+    """
+    Add derivatization substructures to a molecule and return its derivatized form. This function is not deterministic
+    and will return a random derivatized form of the molecule. Run multiple times to get all possible derivatized forms.
+
+    :param mol: RDKit molecule object
+    :param smiles: SMILES string
+
+    :return: RDKit molecule object in a derivatized form
+    """
+    if mol is None:
+        mol = Chem.MolFromSmiles(smiles)
+
+    em = deepcopy(mol)
+
+    for pat, reps, probs in subs:
+        matches = em.GetSubstructMatches(pat)
+        for m in matches:
+            # one random draw per matched site; the first threshold above the draw picks the dummy atomic number
+            r = random.random()
+            for repl, prob in zip(reps, probs):
+                if r < prob:
+                    em.GetAtomWithIdx(m[0]).SetAtomicNum(repl)
+                    break
+
+    # expand every dummy atom placed above into its derivatization group
+    for pat, repl in repls:
+        em = AllChem.ReplaceSubstructs(em, pat, repl, replaceAll=True)[0]
+
+    Chem.SanitizeMol(em)
+    return em
+
+
+def process_one_mol(mol: Tuple[str, Chem.Mol], repeats: int):
+    """
+    Return derivatized and underivatized forms of one molecule. Since underlying function is not deterministic, this
+    function may or may not return all possible derivatized forms of the molecule depending on the number of repeats.
+
+    :param mol: tuple of (input SMILES string, RDKit molecule object built from it)
+    :param repeats: number of repeats to simulate molecule derivatization
+
+    :return: tuple containing the input molecule, its underivatized form, and a set of derivatized forms
+    """
+    return (
+        mol[0],
+        Chem.MolToSmiles(remove_derivatization_groups(mol[1]), kekuleSmiles=True),
+        {Chem.MolToSmiles(add_derivatization_groups(mol[1]), kekuleSmiles=True) for _ in range(repeats)}
+    )
diff --git a/gc_meox_tms/utils.py b/gc_meox_tms/utils.py
new file mode 100644
index 0000000..91412e9
--- /dev/null
+++ b/gc_meox_tms/utils.py
@@ -0,0 +1,56 @@
+import fileinput
+from os import PathLike
+from typing import List, Tuple
+
+from rdkit.Chem import Mol, MolFromSmiles
+
+
+def read_input_txt(infiles: PathLike) -> List[Tuple[str, Mol]]:
+    """
+    Read input from txt files with SMILES.
+
+    Lines that RDKit cannot parse are skipped, so every returned tuple holds a usable molecule.
+
+    :param infiles: Path to input file(s) with SMILES. One SMILES per line.
+
+    :return: List of tuples (molecule string from the input file, RDKit molecule object of that molecule)
+    """
+    molecules = []
+    with fileinput.input(files=infiles) as lines:
+        for line in lines:
+            mol = MolFromSmiles(line)
+            # MolFromSmiles yields None for unparsable input; a None entry would make process_one_mol fail later
+            if mol is not None:
+                molecules.append((line.rstrip(), mol))
+    return molecules
+
+
+def write_tab_separated(tsv_path: PathLike, data) -> None:
+    """
+    Write output to a tab-separated file.
+
+    :param tsv_path: Path to output file.
+    :param data: Iterable of tuples (original SMILES, underivatized SMILES, set of derivatized SMILES)
+    """
+    with open(tsv_path, "w") as tsv:
+        tsv.write("orig\tderiv. removed\tderiv. added ...\n")
+        for orig, removed, added in data:
+            tsv.write("\t".join([orig, removed, *added]) + "\n")
+
+
+def write_flat(txt_path: PathLike, data, keep: bool = False) -> None:
+    """
+    Write output to a txt file with one SMILES per line.
+
+    :param txt_path: Path to output file.
+    :param data: Iterable of tuples (original SMILES, underivatized SMILES, set of derivatized SMILES)
+    :param keep: Whether to write the original and underivatized SMILES to the output.
+    """
+    with open(txt_path, "w") as flat:
+        if keep:
+            for orig, removed, added in data:
+                for one in {orig, removed, *added}:
+                    flat.write(one + "\n")
+        else:
+            for orig, removed, added in data:
+                flat.write("\n".join(added) + "\n")
diff --git a/pyproject.toml b/pyproject.toml
index 374b58c..2e0ead7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,42 @@
-[build-system]
-requires = [
-    "setuptools>=42",
-    "wheel"
-]
-build-backend = "setuptools.build_meta"
+[build-system]
+requires = ["setuptools>=61.2", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "gc-meox-tms"
+version = "1.0.1"
+authors = [
+    {name = "Ales Krenek", email = "ljocha@ics.muni.cz"},
+    {name = "Maksym Skoryk", email = "maksym.skoryk@recetox.muni.cz"},
+]
+maintainers = [{name = "RECETOX", email = "GalaxyToolsDevelopmentandDeployment@space.muni.cz"}]
+description = "In-silico MeOX/TMS derivatization of chemical compounds"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI 
Approved :: MIT License", + "Operating System :: OS Independent", +] +urls = {Homepage = "https://github.com/RECETOX/gc-meox-tms"} +requires-python = ">=3.8" +dependencies = ["rdkit"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.optional-dependencies] +dev = [ + "flake8", + "mock", + "pytest", + "pytest-cov", + "pytest-rerunfailures", +] +eda = [ + "jupyter", + "py3Dmol", +] + +[tool.setuptools] +packages = ["gc_meox_tms"] +include-package-data = false diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..3225598 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +flake8 +isort +mock +pytest +pytest-cov +pytest-rerunfailures diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2924aa1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +rdkit diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 570a139..0000000 --- a/setup.cfg +++ /dev/null @@ -1,27 +0,0 @@ -[metadata] -name = gc-meox-tms -version = 0.0.1 -author = Ales Krenek -author_email = ljocha@ics.muni.cz -description = In-sillico derivatization of compounds to be identified in gass chromatography -long_description = file: README-package.md -long_description_content_type = text/markdown -url = https://github.com/ljocha/gc-derivatization -project_urls = -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: MIT License - Operating System :: OS Independent - -[options] -package_dir = - = src -packages = find: -python_requires = >=3.6 -scripts = - bin/gc-meox-tms.py -install_requires = - rdkit-pypi - -[options.packages.find] -where = src diff --git a/src/gc_meox_tms/__init__.py b/src/gc_meox_tms/__init__.py deleted file mode 100644 index 6f0f1b5..0000000 --- a/src/gc_meox_tms/__init__.py +++ /dev/null @@ -1,98 +0,0 @@ -from rdkit import Chem -from rdkit.Chem import AllChem -from copy import deepcopy -import random - -tms = '[Si]([CH3])([CH3])[CH3]' - -# XXX: 
~[O,N,S] would match more than we aim to (-O, -S, -N, =N) but it's unlikely to happen -tms_match = Chem.MolFromSmarts('*~[O,N,S]' + tms) -tms_match0 = Chem.MolFromSmarts('[#0]([CH3])([CH3])[CH3]') - -meox_match_co = Chem.MolFromSmarts('C([C,c])([C,c])=NO[CH3]') -meox_match_cho = Chem.MolFromSmarts('[CH]([C,c])=NO[CH3]') -meox_match0 = Chem.MolFromSmarts('[#0]=NO[CH3]') -co = Chem.MolFromSmiles('C=O') - -def is_derivatized(mol=None,smiles=None): - if mol is None: - mol = Chem.MolFromSmiles(smiles) - mol = Chem.AddHs(mol) - return mol.HasSubstructMatch(tms_match) or mol.HasSubstructMatch(meox_match_co) or mol.HasSubstructMatch(meox_match_cho) - -def remove_derivatization_groups(mol=None,smiles=None): - if mol is None: - em = Chem.MolFromSmiles(smiles) - else: - em = deepcopy(mol) - - matches = em.GetSubstructMatches(tms_match) - for ma in matches: - em.GetAtomWithIdx(ma[2]).SetAtomicNum(0) - - em = AllChem.DeleteSubstructs(em,tms_match0) - - matches = em.GetSubstructMatches(meox_match_co) - for ma in matches: - em.GetAtomWithIdx(ma[0]).SetAtomicNum(0) - matches = em.GetSubstructMatches(meox_match_cho) - for ma in matches: - em.GetAtomWithIdx(ma[0]).SetAtomicNum(0) - - em, = AllChem.ReplaceSubstructs(em,meox_match0,co,replaceAll=True) - Chem.SanitizeMol(em) - return em - -# (match pattern, dummy atom #, probability) -_subs = [ - ('[OH]', [100], [.95]), - ('[SH]', [101], [.80]), -# matches also imine - ('[NH]', [102], [.50]), - ('[NH2]', [103,102], [.25, .5]), - ('C([C,c])([C,c])=O', [104], [.90]), - ('[CH]=O', [104], [.90]), -] - -# (dummy atom #, replacement) -_repls = [ - ('[#100]', 'O'+tms), - ('[#101]', 'S'+tms), - ('[#102]', 'N'+tms), - ('[#103]', f'N({tms}){tms}'), - ('[#104]=O', 'C=NO[CH3]'), -] - -#repls = list(zip( -# map(lambda n: Chem.MolFromSmarts(f'[#{n}]'),_repls), -# map(Chem.MolFromSmiles,_repls.values()) -#)) - -subs = [ (Chem.MolFromSmarts(pat), repls, probs) for pat,repls,probs in _subs] -repls = [ (Chem.MolFromSmarts(pat), Chem.MolFromSmiles(repl)) 
for pat,repl in _repls] - - - -def add_derivatization_groups(mol=None,smiles=None): - if mol is None: - mol = Chem.MolFromSmiles(smiles) - - em = deepcopy(mol) - - for pat,reps,probs in subs: - matches = em.GetSubstructMatches(pat) -# print(matches) - for m in matches: - r = random.random() - for repl,prob in zip(reps,probs): - if r < prob: - em.GetAtomWithIdx(m[0]).SetAtomicNum(repl) - break - - for pat,repl in repls: -# print(Chem.MolToSmiles(pat),Chem.MolToSmiles(repl),Chem.MolToSmiles(em)) - em, = AllChem.ReplaceSubstructs(em,pat,repl,replaceAll=True) - - Chem.SanitizeMol(em) - return em - diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c3244a6 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="module") +def test_dir(request): + """Return the directory of the currently running test script.""" + return Path(request.fspath).parent diff --git a/tests/data/acidic_protons.txt b/tests/data/acidic_protons.txt new file mode 100644 index 0000000..17d1453 --- /dev/null +++ b/tests/data/acidic_protons.txt @@ -0,0 +1,2 @@ +CC(=O)O +C(C(C(=O)O)N)S diff --git a/tests/data/alcohols.txt b/tests/data/alcohols.txt new file mode 100644 index 0000000..5a2e4f3 --- /dev/null +++ b/tests/data/alcohols.txt @@ -0,0 +1,2 @@ +CCO +CO \ No newline at end of file diff --git a/tests/data/aldehydes.txt b/tests/data/aldehydes.txt new file mode 100644 index 0000000..c87890b --- /dev/null +++ b/tests/data/aldehydes.txt @@ -0,0 +1,3 @@ +CC=O +COC1=C(C=CC(=C1)C=O)O +C(C1C(C(C(C(O1)O)O)O)O)O diff --git a/tests/data/ketones.txt b/tests/data/ketones.txt new file mode 100644 index 0000000..3332c72 --- /dev/null +++ b/tests/data/ketones.txt @@ -0,0 +1,5 @@ +CC(=O)C +CCC(=O)C +CC(=O)CC(=O)C +CC1CCCCCCCCCCCCC(=O)C1 +C1CCC(=O)CC1 diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 
index 0000000..1116eed --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,85 @@ +from inspect import signature +from os.path import exists, join +from random import sample + +import pytest +from mock import ANY, patch + +from gc_meox_tms.__main__ import main + + +@pytest.fixture +def input_data_path(test_dir): + """Return the directory of the currently running test script as string type.""" + return join(test_dir, 'data/aldehydes.txt') + + +@pytest.fixture(params=[['-f'], ['-t'], ['-f', '-t']]) +def output_params(request, tmp_path): + """Return a list of output parameters.""" + args = [] + flat_path, tsv_path = None, None + for flag in request.param: + if flag == '-f': + args.append(flag) + args.append(flat_path := join(tmp_path, 'flat.txt')) + elif flag == '-t': + args.append(flag) + args.append(tsv_path := join(tmp_path, 'tsv.txt')) + yield args, flat_path, tsv_path + + +def test_cli_finishes(input_data_path): + """Test if the main function works.""" + args = [input_data_path] + exit_code = main(args) + + assert exit_code == 0 + + +def test_cli_writes_files(input_data_path, output_params, tmp_path): + args = output_params[0] + args.append(input_data_path) + + flat_path = output_params[1] + tsv_path = output_params[2] + exit_code = main(args) + + assert exit_code == 0 + assert flat_path is None or exists(flat_path) + assert tsv_path is None or exists(tsv_path) + + +@pytest.mark.parametrize('keep', [True, False]) +@patch('gc_meox_tms.__main__.write_flat') +def test_keep_flag(mock, input_data_path, tmp_path, keep): + """Test if the main function works with -k flag.""" + flat_path = join(tmp_path, 'flat.txt') + args = [input_data_path, '-f', flat_path] + if keep: + args.append('-k') + + main(args) + + mock.assert_called_with(flat_path, ANY, keep) + + +@pytest.mark.parametrize('num_workers', sample(range(1, 10), 3)) +@patch('gc_meox_tms.__main__.ProcessPoolExecutor') +def test_ncpu_flag(mock, input_data_path, num_workers): + """Test if the main function works with -n 
flag.""" + args = [input_data_path, '-n', str(num_workers)] + main(args) + + mock.assert_called_with(max_workers=num_workers) + + +@pytest.mark.parametrize('repeats', sample(range(1, 50), 3)) +@patch('gc_meox_tms.__main__.ProcessPoolExecutor.map') +def test_repeats_flag(mock, input_data_path, repeats): + """Test if the main function works with -r flag.""" + args = [input_data_path, '-r', str(repeats)] + main(args) + + called_with_repeats = signature(mock.call_args[0][0]).parameters["repeats"].default + assert called_with_repeats == repeats diff --git a/tests/test_derivatization.py b/tests/test_derivatization.py new file mode 100644 index 0000000..85f250d --- /dev/null +++ b/tests/test_derivatization.py @@ -0,0 +1,135 @@ +import random + +import pytest +from rdkit import Chem + +from gc_meox_tms import (add_derivatization_groups, is_derivatized, + process_one_mol, remove_derivatization_groups) + + +@pytest.fixture(params=[ + ("CC(=O)N([Si](C)(C)C)[Si](C)(C)C", True), + ("C[Si](C)(C)OC1=CC=CC=C1", True), + ("C[Si](C)(C)OC1=CC=C(C=C1)O[Si](C)(C)C", True), + ("C[Si](C)(C)C1=CC=C(C=C1)[Si](C)(C)C", False), + ("CCO[Si](C)(C)C", True), + ("CC(=O)O[Si](C)(C)C", True), + ("CC(=O)O", False), + ("CCCS[Si](C)(C)C", True), + ("CCCS", False), + ("CCC(=NOC)C", True), + ("CC=NOC", True), + ("CCCC(=O)N", False), + ("CCCC(=O)NCC", False), + ("CC(=O)NOC", False), + ("CCC(O)C", False), + ("CCCC#N", False), + ("C[N+]#[C-]", False) +]) +def is_derivatized_data(request): + """Return a tuple of (smiles, boolean indicating if the molecule + is MeOX or TMS derivatized).""" + smiles, _is_derivatized = request.param + return smiles, _is_derivatized + + +@pytest.fixture(params=[ + ("CC(=O)N([Si](C)(C)C)[Si](C)(C)C", "CC(=O)N[Si](C)(C)C", "CC(N)=O"), + ("C[Si](C)(C)OC1=CC=CC=C1", None, "OC1=CC=CC=C1"), + ("C[Si](C)(C)OC1=CC=C(O[Si](C)(C)C)C=C1", "C[Si](C)(C)OC1=CC=C(O)C=C1", + "OC1=CC=C(O)C=C1"), + ("CCO[Si](C)(C)C", None, "CCO"), + ("CC(=O)O[Si](C)(C)C", None, "CC(=O)O"), + ("CCCS[Si](C)(C)C", 
None, "CCCS"), + ("CCC(C)=NOC", None, "CCC(C)=O"), + ("CC=NOC", None, "CC=O") +]) +def derivatization_groups_data(request): + """Return a tuple of (smiles of a derivatized molecule, smiles of this + molecule with different degree of conversion, smiles of the original + non-derivatized molecule).""" + derivatized, alternative, original = request.param + return derivatized, alternative, original + + +def test_is_derivatized_from_smiles(is_derivatized_data): + """Test if the is_derivatized function works with SMILES.""" + smiles, expected = is_derivatized_data + actual = is_derivatized(smiles=smiles) + + assert actual == expected + + +def test_is_derivatized_from_mol(is_derivatized_data): + """Test if the is_derivatized function works with RDKit molecules.""" + smiles, expected = is_derivatized_data + mol = Chem.MolFromSmiles(smiles) + actual = is_derivatized(mol=mol) + + assert actual == expected + + +def test_remove_derivatization_groups_from_smiles(derivatization_groups_data): + """Test if the remove_derivatization_groups function works with SMILES.""" + smiles, _, expected = derivatization_groups_data + actual = remove_derivatization_groups(smiles=smiles) + actual_smiles = Chem.MolToSmiles(actual, kekuleSmiles=True) + + assert actual_smiles == expected + + +def test_remove_derivatization_groups_from_mol(derivatization_groups_data): + """Test if the remove_derivatization_groups function works with RDKit + molecules.""" + smiles, _, expected = derivatization_groups_data + mol = Chem.MolFromSmiles(smiles) + actual = remove_derivatization_groups(mol=mol) + actual_smiles = Chem.MolToSmiles(actual, kekuleSmiles=True) + + assert actual_smiles == expected + + +def test_add_derivatization_groups_from_smiles(derivatization_groups_data): + """Test if the add_derivatization_groups function works with SMILES. 
The + test will run FLAKY_RERUNS times or until success due to + non-deterministic nature of add_derivatization_groups.""" + random.seed(3) + expected, alternative, original = derivatization_groups_data + derivatized = add_derivatization_groups(smiles=original) + derivatized_smiles = Chem.MolToSmiles(derivatized, kekuleSmiles=True) + + assert derivatized_smiles in [expected, alternative] + + +def test_add_derivatization_groups_from_mol(derivatization_groups_data): + """Test if the add_derivatization_groups function works with RDKit + molecules. The test will run FLAKY_RERUNS times or until success + due to non-deterministic nature of add_derivatization_groups.""" + random.seed(3) + expected, alternative, original = derivatization_groups_data + mol = Chem.MolFromSmiles(original) + derivatized = add_derivatization_groups(mol=mol) + derivatized_smiles = Chem.MolToSmiles(derivatized, kekuleSmiles=True) + + assert derivatized_smiles in [expected, alternative] + + +@pytest.mark.parametrize("smiles, expected", [ + ("CC(N)=O", {"CC(N)=O", + "CC(=O)N([Si](C)(C)C)[Si](C)(C)C", + "CC(=O)N[Si](C)(C)C"}), + ("C[Si](C)(C)OC1=CC=C(O)C=C1", {"OC1=CC=C(O)C=C1", + "C[Si](C)(C)OC1=CC=C(O[Si](C)(C)C)C=C1", + "C[Si](C)(C)OC1=CC=C(O)C=C1"}), + ("CCC(C)=O", {"CCC(C)=O", "CCC(C)=NOC"}), + ("CC=NOC", {"CC=O", "CC=NOC"}) +]) +def test_process_one_mol(smiles, expected): + """Test processing one molecule.""" + mol = (smiles, Chem.MolFromSmiles(smiles)) + n = 40 + random.seed(3) + actual = process_one_mol(mol, n) + actual = {actual[0], actual[1], *actual[2]} + + assert actual == expected diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..d251785 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,86 @@ +from concurrent.futures import ProcessPoolExecutor +from functools import partial + +import pytest +from rdkit.Chem import Mol, MolFromSmiles + +from gc_meox_tms import process_one_mol +from gc_meox_tms.utils import read_input_txt, write_flat, 
write_tab_separated + + +@pytest.fixture +def data(): + molecules = [(smiles, MolFromSmiles(smiles)) for smiles in [ + "CCC(=NOC)C", "CCC=NOC", "C=NOC", "CC(=O)N([Si](C)(C)C)[Si](C)(C)C"]] + + process_one_mol_with_repeats = partial(process_one_mol, repeats=1) + with ProcessPoolExecutor(max_workers=2) as executor: + data = executor.map(process_one_mol_with_repeats, molecules) + + yield data + + +@pytest.mark.parametrize("path, smiles", [ + ("data/acidic_protons.txt", ["CC(=O)O", "C(C(C(=O)O)N)S"]), + ("data/alcohols.txt", ["CCO", "CO"]), + ("data/ketones.txt", ["CC(=O)C", "CCC(=O)C", "CC(=O)CC(=O)C", "CC1CCCCCCCCCCCCC(=O)C1", "C1CCC(=O)CC1"]) +]) +def test_reading_input_from_txt(path, test_dir, smiles): + """Test reading input from txt files.""" + molecules = read_input_txt(test_dir / path) + actual_smiles = [mol[0] for mol in molecules] + rdkit_molecules = [mol[1] for mol in molecules] + + assert len(molecules) == len(smiles) + assert actual_smiles == smiles + assert all(isinstance(mol, Mol) for mol in rdkit_molecules) + + +def test_writing_flat_output(data, tmp_path): + """Test writing flat output.""" + flat_path = tmp_path / "flat.txt" + write_flat(flat_path, data, True) + + assert flat_path.exists() + + +def test_writing_flat_content(data, tmp_path): + """Test writing flat output content.""" + flat_path = tmp_path / "flat.txt" + write_flat(flat_path, data, True) + + with open(flat_path, "r") as f: + lines = f.readlines() + + assert len(lines) == 8 + + +def test_writing_flat_content_without_keep(data, tmp_path): + """Test writing flat output content without keep.""" + flat_path = tmp_path / "flat.txt" + write_flat(flat_path, data, False) + + with open(flat_path, "r") as f: + lines = f.readlines() + + assert len(lines) == 4 + + +def test_writing_tsv_output(data, tmp_path): + """Test writing tsv output.""" + tsv_path = tmp_path / "tsv.txt" + write_tab_separated(tsv_path, data) + + assert tsv_path.exists() + + +def test_writing_tsv_content(data, tmp_path): + """Test 
writing tsv output content.""" + tsv_path = tmp_path / "tsv.txt" + write_tab_separated(tsv_path, data) + + with open(tsv_path, "r") as f: + lines = f.readlines() + + assert len(lines) == 5 + assert lines[0] == "orig\tderiv. removed\tderiv. added ...\n"