diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..7aee77b --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,33 @@ +name: Publish Py Protein Inference to PyPI and TestPyPI + +on: [push] + +jobs: + build-n-publish: + name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Set up Python 3.7 + uses: actions/setup-python@v3 + with: + python-version: "3.7" + - name: Install wheel + run: >- + pip install wheel + - name: Build a binary wheel and a source tarball + run: >- + python setup.py sdist bdist_wheel + - name: Publish distribution 📦 to Test PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + - name: Publish distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..69e2c24 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,39 @@ +name: Py Protein Inference + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10"] + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest black tox + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with tox/flake8 + run: | + tox -e lint + - name: Check format with tox/black + run: | + tox -e format + - name: Test with tox/pytest + run: | + tox -e test + - name: Build package + run: | + VER=$(python 
setup.py --version) + echo $VER + python setup.py sdist diff --git a/.gitignore b/.gitignore index 878149d..5452992 100644 --- a/.gitignore +++ b/.gitignore @@ -78,7 +78,6 @@ piq_output/ pepxmls/ percolator_output/ output/ -glpkinout/ data/ !tests/data/ !tests/output/ diff --git a/Dockerfile b/Dockerfile index 76c648f..fdec7e1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,23 +6,6 @@ LABEL description="This container contains a an installation of PyProteinInferen LABEL maintainer="Trent Hinkle " LABEL version="$VERSION" -# Install glpk -RUN mkdir /build/ - -WORKDIR /build/ - -RUN wget ftp://ftp.gnu.org/gnu/glpk/glpk-4.65.tar.gz \ - && tar -xzvf glpk-4.65.tar.gz -WORKDIR /build/glpk-4.65 -RUN chmod +x ./configure \ - && ./configure \ - && make uninstall \ - && make install - -# Install libglpk-dev to get glpsol to work properly -RUN apt-get update \ - && apt-get install -y libglpk-dev - WORKDIR / # Install Py Protein Inference @@ -30,19 +13,12 @@ WORKDIR / RUN mkdir /pyproteininference # copy the entire protein_inference directory into the docker image -COPY /pyproteininference /pyproteininference/pyproteininference/ -COPY /tests /pyproteininference/tests/ -COPY setup.cfg /pyproteininference/ -COPY setup.py /pyproteininference/ -COPY /parameters /pyproteininference/parameters/ -COPY /scripts /pyproteininference/scripts/ -COPY .git /pyproteininference/.git/ -COPY README.md /pyproteininference/ -COPY requirements.txt /pyproteininference/ +COPY . /pyproteininference WORKDIR /pyproteininference/ -RUN mkdir /glpkinout RUN pip install -r requirements.txt -RUN python setup.py develop +RUN python setup.py install + +WORKDIR / diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7297ca0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2022 Genentech, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..f282c17 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +prune site +prune images +prune build \ No newline at end of file diff --git a/README.md b/README.md index 1fa4ee9..6c8e7ea 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ - -# Py Protein Inference +# Py Protein Inference -## Requirements +# Requirements 1. __Python 3.7__ or greater. This package was created using __Python 3.7__ 2. __Python Packages__: @@ -39,9 +38,9 @@ parameters: identifiers: decoy_symbol: "decoy_" ``` -The inference type can be one of: parsimony, peptide_centric, inclusion, exclusion, or first_protein. -All parameters are optional, so you only need to define the ones you want to alter. -See [here](#default-parameters) for the default parameters.
+The inference type can be one of: `parsimony`, `peptide_centric`, `inclusion`, `exclusion`, or `first_protein`. +All parameters are optional, so you only need to define the ones you want to alter. Parameters that are not defined are set to default values. +See the package documentation for the default parameters. 5. Run the standard commandline tool again, this time specifying the parameters as above: ```shell @@ -52,753 +51,23 @@ protein_inference_cli.py \ -y /path/to/params.yaml ``` -6. All other parameters can be tweaked if needed. For more information please see the section [Yaml Parameter File](#yaml-parameter-file-outline). - -# Advanced Usage - -## Introduction -Py Protein Inference is a Python package that has the ability to run various protein inference algorithms on tandem mass spectrometry search results. -In addition to performing protein inference which maps peptides to proteins, this algorithm creates protein scores based on the supplied peptides and is able to calculate set based protein level false discovery rates for MS data filtering purposes. -Py Protein Inference typically takes as input the output PSM files from the [Percolator algorithm](https://github.com/percolator/percolator). -However, Py Protein Inference can also take custom tab delimited files as input. -As for output Py Protein Inference generates a user-friendly csv format file that includes the Proteins, Peptides, q-values, and Protein Scores. - -Py Protein Inference has the ability to run any of the following inference procedures from literature: - -1. Parsimony -2. Exclusion -3. Inclusion -4. Peptide Centric (Protein Group Level) -5. First Protein (Selects first protein per peptide) - -Please see the [__Inference Types__](#inference-types) section for more information on Inference Types. - -In Addition to these inference types Py Protein Inference can also score proteins with a variety of methods: - -1. Best Peptide Per Protein -2. Multiplicative Log -3. Top Two Combined -4. 
Additive -5. Iterative Downweighted Log -6. Downweighted Multiplicative Log -7. Geometric Mean - -Please see the [__Protein Score Types__](#protein-score-types) section for more information on scoring algorithms. - -## Using Py Protein Inference - 1. [Yaml Parameter File](#yaml-parameter-file-outline) - 2. [Input PSM files](#input-file-examples) (Tab Delimited) - 3. [Fasta Database](#fasta-file-example) - 4. [Running Py Protein Inference](#running-py-protein-inference) - -## Yaml Parameter File Outline -The Yaml Parameter File is the central location for all configurations for a given Protein Inference run and are summarized below: -Note: These parameters are all optional. Please see the section [Default Parameters](#default-parameters) for more information on defaults. -## General: -| Parameter | Description |Type| -|---|---|---| -| export | Export Type can be one of: __peptides__, __psms__, __psm_ids__, __long__, __q_value__, __q_value_all__, __q_value_comma_sep__, __leads__, __all__, __comma_sep__. Suggested types are __peptides__, __psms__, and __psm_ids__ as these produce square output. If there are multiple proteins per group the three mentioned types will report the leads only. Other types report on the peptide level with slightly different formats and whether or not to include leads only or all proteins. See [here](#export-explanations) for an in-depth explanation of Export Types. | String | -| fdr | False Discovery Rate to be marked as significant. Ex. __0.01__ for 1% FDR. | Numeric | -| picker | __True__/__False__ on whether to run the Protein Picker algorithm. For more info click [here](#protein-picker). | Bool | -| tag | A String tag that will be written into the result files. Ex. __example_tag__. | String | - -## Data Restriction: -| Parameter | Description |Type| -|---|---|---| -| pep_restriction | Posterior Error Probability values to filter (i.e. __0.9__). In this case PSMs with PEP values greater than __0.9__ would be removed from the input. 
If PEP values are not in input please use __None__. | Numeric | -| peptide_length_restriction | Peptide Length to filter on. (i.e. __7__). If no filter is wanted please use __None__. | Int | -| q_value_restriction | Q Values to filter. (i.e. __0.2__). In this case PSMs with Q Values greater than __0.2__ would be removed from the input. If Q Values are not in input please use __None__ . | Numeric | -| custom_restriction | Custom Value to filter. (i.e. __5__). In this case PSMs with Custom value greater than / less than __5__ would be removed from the input. If Not using a custom score please use __None__. __NOTE__: If a higher score is "better" for your score please set __psm_score_type__ to __additive__. If a lower score is "better" please set __psm_score_type__ parameter to __multiplicative__. | Numeric | - -## Score: -| Parameter | Description |Type| -|---|---|---| -| protein_score | One of any of the following: __multiplicative_log__, __best_peptide_per_protein__, __top_two_combined__, __additive__, __iterative_downweighted_log__, __downweighted_multiplicative_log__, __geometric_mean__. Recommended: __multiplicative_log__. | String | -| psm_score | PSM score to use for Protein Scoring. If using Percolator output as input this would either be __posterior_error_prob__ or __q-value__. The string typed here should match the column in your input files __EXACTLY__. If using a custom score it will be filtered accordingly with the value in [__custom_restriction__](#data-restriction). | String | -| psm_score_type | The Type of score that __psm_score__ parameter is. Either __multiplicative__ or __additive__. If a larger psm score is "better" than input additive (i.e. Mascot Ion Score, Xcorr, Percolator Score). If a smaller psm score is "better" than input multiplicative (i.e. Q Value, Posterior Error Probability). See [below](#extra-score-information) for more information.| String | - -#### Extra Score information: - 1. 
The __protein_score__, __psm_score__, and __psm_score_type__ methods must be compatible. - 2. If using a PSM score (__psm_score__ parameter) where the lower the score the better (i.e. __posterior_error_prob__ or __q-value__) then any __protein_score__ can be used except __additive__. __psm_score_type__ must also be set to __multiplicative__. - 3. If using a PSM score (__psm_score__ parameter) where the higher the score the better (i.e. Percolator Score, Mascot Ion Score, Xcorr) (Percolator Score is called __psm_score__ - column name) in the tab delimited percolator output. Then __protein_score__ and __psm_score_type__ must both be __additive__. - -## Identifiers: -| Parameter | Description |Type| -|---|---|---| -| decoy_symbol | Symbol within Decoy Identifiers to distinguish between targets. (i.e "__##__" or "__decoy___"). This is important for Protein [Picker](#protein-picker) and FDR calculation. | String | -| isoform_symbol | Symbol that is present in isoform proteins only. (i.e. "__-__"). See [below](#extra-identifier-information) for more information. | String | -| reviewed_identifier_symbol | Identifier to determine a reviewed vs unreviewed identifier. (i.e. "__sp\|__"). See [below](#extra-identifier-information) for more information. | String | - -#### Extra Identifier information: - 1. For the __decoy_symbol__ an example of a target protein -> __ex|protein__ and its decoy counterpart could be any of the following: __##ex|##protein__, __##ex|protein__, __decoy_ex|protein__. The decoy symbol just needs to be present within the string to be determined a decoy. - 2. For __isoform_symbol__ and __reviewed_identifier_symbol__, these are used to assign priority in certain algorithms such as parsimony. For example, if we have canonical proteins, isoform proteins, and reviewed/unreviewed proteins in a given analysis; the priority would be established as follows: Reviewed Canonical, Reviewed Isoform, Unreviewed. 
This means that if two proteins map to the same peptides, the algorithm has to make a decision on which to pick. It would use the previous mentioned priority to pick the protein lead to report. - -## Inference: -| Parameter | Description |Type| -|---|---|---| -| inference_type | The Inference procedure to apply to the analysis. This can be __parsimony__, __inclusion__, __exclusion__, __peptide_centric__, or __first_protein__. Please see [here](#inference-types) for more information on the inference types. | String | -| grouping_type | How to group proteins for a given __inference_type__. This can be __subset_peptides__, __shared_peptides__, or __None__. Typically __subset_peptides__ is used. This parameter only effects grouped proteins and has no impact on protein leads. | String | - -## Digest: -| Parameter | Description |Type| -|---|---|---| -| digest_type | The enzyme used for digestion for the MS searches. (i.e. 'trypsin'). Can be any expasy rule as defined here: https://pyteomics.readthedocs.io/en/latest/_modules/pyteomics/parser.html other common examples include: trypsin, chymotrypsin high specificity, chymotrypsin low specificity, lysc. | String | -| missed_cleavages | The number of missed cleavages allowed for the MS searches. (i.e. __2__) | Int | - -## Parsimony: -These parameters are only used if __parsimony__ is selected as __inference_type__. - -| Parameter | Description |Type| -|---|---|---| -| lp_solver | This can be one of: __pulp__, __glpk__, __None__. This determines which linear program solver is used. Please see [here](#parsimony-dependencies) for more information on lp solvers. Both options have external dependency requirements. Input __None__ if not running __parsimony__. | String | -| glpk_path | If glpk is selected as __lp_solver__ input the path to the commandline tool __glpsol__. This is typically just __glpsol__. | String | -| shared_peptides | How to assign shared peptides for parsimony. Can be one of: __all__ or __best__. 
__all__ assigns shared peptides to all possible proteins in the output. __best__ assigns shared peptides to the best scoring protein which is a "winner take all" approach. This is specific to the Parsimony Inference type. | String | - - -## Peptide Centric: -These parameters are only used if __peptide_centric__ is selected as __inference_type__. - -| Parameter | Description | Type | -|---|---|---| -| max_identifiers | The maximum number of proteins a peptide is allowed to map to. (i.e. __5__). This serves to limit the number of protein groups that can be created due to highly homologous peptides. | Int | - - -## Default Parameters -```yaml -parameters: - general: - export: peptides - fdr: 0.01 - picker: True - tag: py_protein_inference - data_restriction: - pep_restriction: 0.9 - peptide_length_restriction: 7 - q_value_restriction: 0.005 - custom_restriction: None - score: - protein_score: multiplicative_log - psm_score: posterior_error_prob - psm_score_type: multiplicative - identifiers: - decoy_symbol: "##" - isoform_symbol: "-" - reviewed_identifier_symbol: "sp|" - inference: - inference_type: peptide_centric - grouping_type: shared_peptides - digest: - digest_type: trypsin - missed_cleavages: 3 - parsimony: - lp_solver: pulp - glpk_path: glpsol - shared_peptides: all - peptide_centric: - max_identifiers: 5 -``` - -## Input File Examples -The standard input filetype is the tab delimited output from the percolator algorithm. 
Please see below for examples of input files: -#### Standard Percolator Output as Input -| PSMid | score | q-value | posterior_error_prob | peptide | proteinIds | | | | -|---|---|---|---|---|---|---|---|---| -| 1.1 | 7.5 | 0.0048 | 0.0007 | R.NYIQSLTQMPK.M | MK14_HUMAN\|Q16539 | MK14_HUMAN\|Q16539-2 | MK14_HUMAN\|Q16539-3 | | -| 1.2 | 6.2 | 0.0035 | 0.0006 | R.NTVASSSRSM*R.T | FHDC1_HUMAN\|Q9C0D6 | | | | - -With the above standard input one could use __q-value__ or __posterior_error_prob__ as the PSM score see [Score Section](#score) with __multiplicative__ as __psm_score_type__ and any of the multiplicative options for __protein_score__. - -For example standard input files please see any of the following: -`tests/data/test_perc_data_target.txt` -`tests/data/test_perc_data_decoy.txt` - -#### Custom Input -| PSMid | custom_score | peptide | proteinIds | | -|---|---|---|---|---| -| 1.1 | 7.5 | R.NYIQSLTQMPK.M | MK14_HUMAN\|Q16539 | MK14_HUMAN\|Q16539-2 | MK14_HUMAN\|Q16539-3 | | -| 1.2 | 6.2 | R.NTVASSSRSM*R.T | FHDC1_HUMAN\|Q9C0D6 | | | - -With the above custom input one could use one could use __custom_score__ as the PSM __psm_score__ with __additive__ as the __psm_score_type__ and __protein_score__. - -For example custom input files please see any of the following: -`tests/data/test_perc_data_target_additive.txt` -`tests/data/test_perc_data_decoy_additive.txt` -`tests/data/test_perc_data_target_multiplicative.txt` -`tests/data/test_perc_data_decoy_multiplicative.txt` - -## Fasta File Example -This package was developed using standard Fasta files from [Uniprot](https://www.uniprot.org/). -Please see an example entry in a Fasta database below: -``` ->sp|Q5QNW6|H2B2F_HUMAN Histone H2B type 2-F OS=Homo sapiens OX=9606 GN=H2BC18 PE=1 SV=3 -MPDPAKSAPAPKKGSKKAVTKVQKKDGKKRKRSRKESYSVYVYKVLKQVHPDTGISSKAM -GIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVT -KYTSSK -``` - -## Running Py Protein Inference - 1. [__Command Line__](#running-via-command-line) - 2. 
[__Within Python__](#running-within-python) - 3. [__Heuristic__](#running-heuristic) - -### Running Via Command Line -Upon proper installation of the package, the command line tool should be installed and _should_ be available from any location on the system. -The command line tool can be called as follows: -```shell -protein_inference_cli.py --help -``` -This will return the help prompt for the tool. -If this does not work download `protein_inference_cli.py` from our repository and write the full path to the script while also calling `python`: -```shell -python /path/to/directory/pyproteininference/scripts/protein_inference_cli.py --help -``` - -Command line options are as follows: -``` -cli$ python protein_inference_cli.py --help -usage: protein_inference_cli.py [-h] [-t FILE [FILE ...]] [-d FILE [FILE ...]] - [-f FILE [FILE ...]] [-o DIR] [-l FILE] - [-a DIR] [-b DIR] [-c DIR] [-db FILE] - [-y FILE] [-p] [-i] - -Protein Inference - -optional arguments: - -h, --help show this help message and exit - -t FILE [FILE ...], --target FILE [FILE ...] - Input target psm output from percolator. Can either - input one file or a list of files. - -d FILE [FILE ...], --decoy FILE [FILE ...] - Input decoy psm output from percolator. Can either - input one file or a list of files. - -f FILE [FILE ...], --combined_files FILE [FILE ...] - Input combined psm output from percolator. This should - contain Target and Decoy PSMS. Can either input one - file or a list of files. - -o DIR, --output DIR Result Directory to write to - the name of file will - be determined by parameters selected and tag - parameter. If this option is not set, will write - results to current working directory. - -l FILE, --output_filename FILE - Filename to write results to. Can be left blank. If - this flag is left blank the filename will be - automatically generated. If set this flag will - override -o. - -a DIR, --target_directory DIR - Directory that contains either .txt or .tsv input - target psm data. 
Make sure the directory ONLY contains - result files. - -b DIR, --decoy_directory DIR - Directory that contains either .txt or .tsv input - decoy psm data. Make sure the directory ONLY contains - result files. - -c DIR, --combined_directory DIR - Directory that contains either .txt or .tsv input data - with targets/decoys combined. Make sure the directory - ONLY contains result files. - -db FILE, --database FILE - Path to the fasta formatted database used in the MS - search. This is optional. If not set, will use the - proteins only in the input files. - -y FILE, --yaml_params FILE - Path to a Protein Inference Yaml Parameter File. If - this is not set, default parameters will be used. - -p, --skip_append_alt - Advanced usage only. If this flag is set, will skip - adding alternative proteins to each PSM from the - database digest. If this flag is not set, the - peptide/protein mapping will be taken from database - digest and appended to the mapping present in the - input files. - -i, --id_splitting Advanced usage only. If set this flag will split - protein identifiers.If not set, this flag will not - split protein identifiers.Sometimes the fasta database - protein IDs are formatted as: 'sp|ARAF_HUMAN|P10398'. - While protein IDs in the input files are formatted as - 'ARAF_HUMAN|P10398'. Setting This flag will split off - the front 'sp|' or 'tr|' from the database protein - identifiers. - -``` - -The following combinations of input are allowed and at least one combination is required: - -1) `-t -d` Path to input target (`-t`) and decoy (`-d`) files. This can be one target and one decoy file or multiple files separated by spaces (" "). -2) `-a -b` Path to input target (`-a`) and decoy (`-b`) directories that contain target and decoy files. This is one directory each and all .txt and .tsv files will be read in as input. -3) `-f` Path to input combined target/decoy (`-f`) files. This can be one file or multiple files separated by spaces (" "). 
-4) `-c` Path to input combined target/decoy (`-c`) directory that contains combined target/decoy files. This is one directory and all .txt and .tsv files will be read in as input. - -Any other combinations will result in an error being raised. - -Optional flags - -1) `-db` Path to Fasta Database file. -2) `-y` Path to Protein Inference Yaml Parameter file. (If this is not supplied default parameters will be used). -3) `-o` Path to the output directory, if this is left blank files will be written to the current working directory. -4) `-l` Path to the output filename, if this is left blank a filename will be automatically generated and will be written to directory as set in `-o`. Will override `-o` flag if set. - -Advanced usage flags - -1) `-p` This flag is a True/False on whether to skip appending alternative proteins from the Fasta database digestion. If this flag is left blank, it will not skip appending alternative proteins (recommended). -2) `-i` True/False on whether to split the IDs in the Fasta database file. If this is left blank, it will not split IDs in the Fasta database file (recommended). 
- -You can run the tool as follows: -```shell -protein_inference_cli.py \ --t /path/to/target/file.txt \ --d /path/to/decoy/file.txt \ --db /path/to/database/file.fasta \ --y /path/to/parameter/file.yaml \ --o /path/to/output/directory/ -``` - -Running with multiple input target/decoy files: -```shell -protein_inference_cli.py \ --t /path/to/target/file1.txt /path/to/target/file2.txt \ --d /path/to/decoy/file1.txt /path/to/decoy/file2.txt \ --db /path/to/database/file.fasta \ --y /path/to/parameter/file.yaml \ --o /path/to/output/directory/ -``` - - -### Running Within Python -To run within a python console please see the following example: -```python -from pyproteininference.pipeline import ProteinInferencePipeline - -yaml_params = "/path/to/yaml/params.yaml" -database = "/path/to/database/file.fasta" -### target_files can either be a list of files or one file -target_files = ["/path/to/target1.txt","/path/to/target2.txt"] -### decoy_files can either be a list of files or one file -decoy_files = ["/path/to/decoy1.txt","/path/to/decoy2.txt"] -output_directory_name = "/path/to/output/directory/" - -pipeline = ProteinInferencePipeline(parameter_file=yaml_params, - database_file=database, - target_files=target_files, - decoy_files=decoy_files, - combined_files=None, - output_directory=output_directory_name) -# Calling .execute() will initiate the pipeline with the given data -pipeline.execute() -``` - -### Running Heuristic -Py Protein Inference also has a built-in Heuristic that runs through four inference methods (Inclusion, Exclusion, Parsimony, and Peptide Centric) and selects a recommended method for your given dataset. -By default, all four result files will be written, and the optimal method will be highlighted to the user. -The Heuristic method also outputs a ROC plot that showcases all the inference methods compared to one another to gain further insight. For more information on this Heuristic Method see the [__Heuristic__](#running-heuristic) section. 
- -#### Running the Heuristic Method via the Command Line -```shell -python protein_inference_heuristic_cli.py --help -``` -This will return the help prompt for the tool. -If this does not work download `protein_inference_heuristic_cli.py` from the repository and write the full path to the script while also calling `python`. -```shell -python /path/to/directory/pyproteininference/scripts/protein_inference_heuristic_cli.py --help -``` - -Command line options are as follows: -``` -cli$ python protein_inference_heuristic_cli.py --help -usage: protein_inference_heuristic_cli.py [-h] [-t FILE [FILE ...]] - [-d FILE [FILE ...]] - [-f FILE [FILE ...]] [-o DIR] - [-l FILE] [-a DIR] [-b DIR] [-c DIR] - [-db FILE] [-y FILE] [-p] [-i] - [-r FILE] [-m FLOAT] [-u STR] - -Protein Inference Heuristic - -optional arguments: - -h, --help show this help message and exit - -t FILE [FILE ...], --target FILE [FILE ...] - Input target psm output from percolator. Can either - input one file or a list of files. - -d FILE [FILE ...], --decoy FILE [FILE ...] - Input decoy psm output from percolator. Can either - input one file or a list of files. - -f FILE [FILE ...], --combined_files FILE [FILE ...] - Input combined psm output from percolator. This should - contain Target and Decoy PSMS. Can either input one - file or a list of files. - -o DIR, --output DIR Result Directory to write to - the name of file will - be determined by parameters selected and tag - parameter. If this option is not set, will write - results to current working directory. - -l FILE, --output_filename FILE - Filename to write results to. Can be left blank. If - this flag is left blank the filename will be - automatically generated. If set this flag will - override -o. - -a DIR, --target_directory DIR - Directory that contains either .txt or .tsv input - target psm data. Make sure the directory ONLY contains - result files. 
- -b DIR, --decoy_directory DIR - Directory that contains either .txt or .tsv input - decoy psm data. Make sure the directory ONLY contains. - result files. - -c DIR, --combined_directory DIR - Directory that contains either .txt or .tsv input data - with targets/decoys combined. Make sure the directory - ONLY contains result files. - -db FILE, --database FILE - Path to the fasta formatted database used in the MS - search. This is optional. If not set, will use the - proteins only in the input files. - -y FILE, --yaml_params FILE - Path to a Protein Inference Yaml Parameter File. If - this is not set, default parameters will be used. - -p, --skip_append_alt - Advanced usage only. If this flag is set, will skip - adding alternative proteins to each PSM from the - database digest. If this flag is not set, the - peptide/protein mapping will be taken from database - digest and appended to the mapping present in the - input files. - -i, --id_splitting Advanced usage only. If set this flag will split - protein identifiers.If not set, this flag will not - split protein identifiers.Sometimes the fasta database - protein IDs are formatted as: 'sp|ARAF_HUMAN|P10398'. - While protein IDs in the input files are formatted as - 'ARAF_HUMAN|P10398'. Setting This flag will split off - the front 'sp|' or 'tr|' from the database protein - identifiers. - -r FILE, --roc_plot_filepath FILE - PDF Filepath to write the ROC plot to after Heuristic - Scoring. If not set, writes the file with filename - roc_plot.pdf to directory set in -o. If -o is not set, - will write the file to current working directory. - -m FLOAT, --fdr_max FLOAT - The maximum FDR to display in the ROC plot. Defaults - to 0.1 if not set. - -u STR, --output_type STR - The type of output to be written. Can either be 'all' - or 'optimal'. If set to 'all' will output all - inference results. If set to 'optimal' will output - only the result selected by the heuristic method. If - left blank this will default to 'all'. 
-``` - -Input options are the same as the standard protein_inference_cli.py with the addition of three optional inputs: -1) `-r` This is a filepath that will have a pdf plot written to it after the heuristic method has been run. If this is left blank, it will write the plot into the standard output directory with the name roc_plot.pdf -2) `-m` The maximum FDR to display in the ROC plot. If this value is left blank, it will be set to 0.1 -3) `-u` This is the type of output to be written after the heuristic method is complete. Will either output all results or the optimal results. If all is selected, the optimal results will have the string "optimal_method" spliced into the filename. - -You can run the tool as follows: -```shell -protein_inference_heuristic_cli.py \ --t /path/to/target/file.txt \ --d /path/to/decoy/file.txt \ --db /path/to/database/file.fasta \ --y /path/to/parameter/file.yaml \ --o /path/to/output/directory/ \ --r /path/to/pdf/file.pdf \ --m 0.2 -``` - -Running with multiple input target/decoy files: -```shell -protein_inference_heuristic_cli.py \ --t /path/to/target/file1.txt /path/to/target/file2.txt \ --d /path/to/decoy/file1.txt /path/to/decoy/file2.txt \ --db /path/to/database/file.fasta \ --y /path/to/parameter/file.yaml \ --o /path/to/output/directory/ \ --r /path/to/pdf/file.pdf \ --m 0.2 -``` - - -#### Running the Heuristic Method via Python -To run within a python console please see the following example: -```python -from pyproteininference.heuristic import HeuristicPipeline - -yaml_params = "/path/to/yaml/params.yaml" -database = "/path/to/database/file.fasta" -### target_files can either be a list of files or one file -target_files = ["/path/to/target1.txt","/path/to/target2.txt"] -### decoy_files can either be a list of files or one file -decoy_files = ["/path/to/decoy1.txt","/path/to/decoy2.txt"] -output_directory_name = "/path/to/output/directory/" -pdf_filename = "/path/to/output/directory/roc_plot.pdf" - -hp = 
HeuristicPipeline(parameter_file=yaml_params, - database_file=database, - target_files=target_files, - decoy_files=decoy_files, - combined_files=None, - output_directory=output_directory_name, - roc_plot_filepath=pdf_filename, - fdr_max=0.2, - output_type="all") -# Calling .execute() will initiate the heuristic pipeline with the given data -# The suggested method will be output in the console and the suggested method results will be written into the output_directory -hp.execute() - -# The ROC plot can be run separately as well with the following: -hp.generate_roc_plot(fdr_max=0.1, pdf_filename=pdf_filename) -# Note, the above method can only be run after .execute() has been run as well -``` - -#### Heuristic Output Example - -##### Console Output -Console Output is as follows and indicates the recommended method at the end: -```shell -2021-07-22 17:43:05,677 - pyproteininference.heuristic.HeuristicPipeline - INFO - Number of Passing Proteins per Inference Method -2021-07-22 17:43:05,678 - pyproteininference.heuristic.HeuristicPipeline - INFO - {'inclusion': 4956, 'exclusion': 1769, 'parsimony': 2808, 'peptide_centric': 4757} -2021-07-22 17:43:05,679 - pyproteininference.heuristic.HeuristicPipeline - INFO - Initial Heuristic Scores -2021-07-22 17:43:05,679 - pyproteininference.heuristic.HeuristicPipeline - INFO - {'inclusion': 0.5928862224126847, 'exclusion': 0.576152064531587, 'parsimony': 0.2663299076815886, 'peptide_centric': 0.4970103849784959} -2021-07-22 17:43:05,679 - pyproteininference.heuristic.HeuristicPipeline - INFO - Removing inclusion with score 0.5928862224126847 -2021-07-22 17:43:05,680 - pyproteininference.heuristic.HeuristicPipeline - INFO - Final Heuristic Scores -2021-07-22 17:43:05,680 - pyproteininference.heuristic.HeuristicPipeline - INFO - {'exclusion': 0.5323198942498348, 'parsimony': 0.1394422310756972, 'peptide_centric': 1.0786541402665502} -2021-07-22 17:43:05,680 - pyproteininference.heuristic.HeuristicPipeline - INFO - Removing 
exclusion with score 0.5323198942498348 -2021-07-22 17:43:05,680 - pyproteininference.heuristic.HeuristicPipeline - INFO - Inference peptide_centric Selected with score 1.0786541402665502 -``` - -##### ROC Plot Output -Below is an example of an ROC plot on some data. The plot indicates the number of target proteins identified at a specified decoy FDR for four methods (Inclusion, Exclusion, Parsimony, and Peptide Centric). -The plot also indicates the Target FDR that is set in the parameter file. This Target FDR is also used by the heuristic in order to make decisions about the recommended inference method for the given dataset. - -![images/swissprot_example.png](images/swissprot_example.png) - -## Supplementary Information - -### Heuristic Algorithm Notes -The Heuristic Algorithm contains multiple steps listed below: -1. First each of the four main inference methods is executed. -2. The number of target proteins is identified for each inference method based on the target FDR found in the parameter file. This is usually 1% but can be changed by the user. -3. Similarity measurements are generated between all 4 methods. The similarity measurements are calculated by taking the number of target hits at the specified FDR of each method and dividing this number by the mean of the other 3 methods. This is done for all 4 methods. -4. The least similar method of the four is removed from further analysis. -5. The remaining methods are again checked against one another and step 3 is repeated for the 3 remaining methods -6. Inclusion/Exclusion are checked to see if they pass a certain threshold of similarity (This is empirically set to .2). Users can override this value by setting the value of `empirical_threshold` when running the method `determine_optimal_inference_method()` which is an instance method of the `HeuristicPipeline` Class. __Note__: This must be executed only after `execute()` has been run on the data. -7. 
Using Empirical evidence a candidate method is selected based on the methods still available. - -### Inference Types - -#### Inclusion Notes - -Inclusion simply maps all peptides to all possible proteins. In this model we allow peptides to map to multiple proteins. -This approach is useful if you want to analyze all possible proteins (even those that do not have any distinguishing peptides). - -![images/inclusion.png](images/inclusion.png) - -#### Parsimony Notes - -Parsimony is the process of taking the list of peptides and mapping them to the minimal set of protein identifiers available. -This is a standard method that is good at limiting the overall number of proteins but still utilizing all selected peptides. -This method allows shared peptides to map to more than one protein. Assuming that the proteins the peptides get mapped to also contain at least one unique peptide across the searches. - -![images/parsimony.png](images/parsimony.png) - -#### Exclusion Notes - -Exclusion maps all peptides to all possible proteins but removes any peptide from the search that is not distinguishing. This means that if a peptide maps to more than one protein it will be removed. With this inference model the database selection is very important. Ideally the database selected for searches should have limited redundancy. -The redundancy is computed on the database level, meaning if a peptide maps to more than one protein that is contained in the Fasta database then that peptide will be removed from the analysis entirely. -Exception: If two or more proteins within the database map to the exact same set of digested peptides the algorithm will select the first listed protein and discard the others. - -![images/exclusion.png](images/exclusion.png) - -#### Peptide Centric Notes - -For Peptide Centric inference all peptides are assigned to all possible proteins. Each peptide is then assigned a protein group based on the mentioned possible protein map. 
For protein group naming, the possible proteins for the peptides in the group are concatenated to a list separated by a semi-colon. -This method is useful when there are a lot of shared peptides between multiple proteins. This will create new protein groups based on the shared peptides. This can sometimes more accurately represent the biological state. - -![images/peptide_centric.jpeg](images/peptide_centric.jpeg) - -#### First Protein Notes - -For the First Protein inference method each peptide gets assigned to one protein only. The protein that gets assigned to each peptide is the first listed protein. This is typically the first protein listed in the Fasta database file. -This method is useful if you just want a quick way to get a sense of the protein FDR and are not worried as much about the peptide to protein mapping. - -### Parsimony Dependencies -Parsimony currently has potential external dependencies depending on the __lp_solver__ that is selected in the parameter file by default this is 'pulp'. - - 1. For __Pulp__: - Pulp _should_ automatically install the LP Solver CBC by default. - However, For troubleshooting please see [Pulp](https://pypi.org/project/PuLP/) documentation. - If Pulp is not working it is likely due to the linear program [CBC](https://github.com/coin-or/Cbc) not being installed. - Please follow the instructions for installing [CBC](https://github.com/coin-or/Cbc). - 2. For __Glpk__: - Mac: `brew install glpk` - Try to install with [Homebrew](https://brew.sh/). - Windows: Check [here](http://winglpk.sourceforge.net/). - Linux: Check [here](https://en.wikibooks.org/wiki/GLPK/Linux_packages). - Other: Check the main [GLPK website](https://www.gnu.org/software/glpk/). - Anaconda: [Anaconda](https://anaconda.org/conda-forge/glpk) also offers an installation. 
- -### Protein Picker - -[Protein Picker](https://www.ncbi.nlm.nih.gov/pubmed/25987413) is an algorithm that treats target and decoy proteins as pairs and is essentially target/decoy competition. If both the target and decoy proteins are identified from the searches when protein picker is run, then the target and decoy scores are compared with one another. The one with the better score is kept to continue on in the analysis while the one with the worse score gets filtered out of the analysis. This algorithm is integrated into other tools such as [Percolator Protein Inference](https://www.ncbi.nlm.nih.gov/pubmed/27572102). - -### Protein Score Types - -|Score Type| Description | -|---|---| -| Best Peptide Per Protein | Uses the best scoring PSM as the overall score for a given protein. This can be beneficial to use when proteins of interest map to few peptides. | -| Multiplicative Log | Multiplies all of the PSM scores together and then takes the log of the value (This only works for psm scores where lower is better). | -| Top Two Combined | Takes the two best scoring peptides and applies Multiplicative Log to them to get the protein score. | -| Additive | Adds all of the PSM scores together (This only works for scores where higher is better). | -| Downweighted Multiplicative Log | Multiplicative Log but normalizes by the number of PSMs per protein. This method attempts to normalize small and large proteins. In a typical multiplicative log a larger protein would have an advantage at having a higher score than a small protein. | -| Geometric Mean | Takes the geometric mean of the PSMs per protein. | - -### Export explanations - -It is advised to use the Standard Export Types OR __q_value__ - -#### Standard Export Types - -All standard export types report back protein leads if there is protein sub-setting from the specific tool. (Parsimony can provide protein subsets but in these export types we only choose to report the group lead). 
-The tables below represent what the output of each export type will look like. Exports are all in CSV format. - -1. __peptide__: This is a standard export type that reports back Protein, Score, Qvalue, and Peptide sequences in a square format. By default, peptide sequences are separated by a space " ". - -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | -|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK EGLNVLQY#FISTHGAR FATSDLNDLYR IALASPDMIR IPQESGGTK LIPAGTGYAYHQDR MGAEAIQALLK NTLLHEQWCDLLEENSVDAVK RVDYSGR VADLFEAR VIDIWAAANDR VTAEDVLKPGTADILVPR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK FQMFQLIDIAR QTAQGMDYLHAK SASEPSLHR VFLPNKQR WHGDVAVKILK | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK QTAQGMDYLHAK SASEPSLHR TFFSLAFCDFCLK | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH YCWMSTGLYIPGR | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | - -2. __psms__: This is a standard export type that reports back Protein, Score, Qvalue, and PSMs in a square format. By default, PSMs are separated by a space " ". 
- -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | -|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK EGLNVLQY#FISTHGAR FATSDLNDLYR IALASPDMIR IPQESGGTK LIPAGTGYAYHQDR MGAEAIQALLK NTLLHEQWCDLLEENSVDAVK RVDYSGR VADLFEAR VIDIWAAANDR VTAEDVLKPGTADILVPR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK FQMFQLIDIAR QTAQGMDYLHAK SASEPSLHR VFLPNKQR WHGDVAVKILK | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK QTAQGMDYLHAK SASEPSLHR TFFSLAFCDFCLK | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH YCWMSTGLYIPGR | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | - -3 __psm_ids__: This is a standard export type that reports back Protein, Score, Qvalue, and PSM Identifiers in a square format. By default, PSM IDs are separated by a space " ". Values in Peptides column will be the `PSMid` values from data input. 
- -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | -|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|-------------------------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | 13 14 15 16 17 18 19 20 21 22 23 24 | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | 1 2 3 4 5 8 | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | 3 4 6 7 | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | 10 9 | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | 11 12 | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | 27 | - -4. __long__: This is a standard export type that reports back Protein, Score, Qvalue, and PSM Identifiers in a long format. Each Peptide for each Protein is listed down the file. This format is good for merging with external data to swap with the inference reference. 
- -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | -|------------------------|--------------------|--------------------|--------------------|-----------------|---------|----------------------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | MGAEAIQALLK | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | NTLLHEQWCDLLEENSVDAVK | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | RVDYSGR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | FATSDLNDLYR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | EGLNVLQY#FISTHGAR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | LIPAGTGYAYHQDR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | VADLFEAR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | IPQESGGTK | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | IALASPDMIR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | VTAEDVLKPGTADILVPR | -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | VIDIWAAANDR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | QTAQGMDYLHAK | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | VFLPNKQR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | FQMFQLIDIAR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | SASEPSLHR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | WHGDVAVKILK | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | QTAQGMDYLHAK | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | 
TFFSLAFCDFCLK | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | SASEPSLHR | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | YCWMSTGLYIPGR | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | - - -#### Legacy Export Types: - -All Legacy export types report on the peptide level and can either report protein group leads, or it can report all proteins (Proteins that are subsets of another protein will be reported in these cases for Parsimony). - -1. __q_value__: This is similar to the __peptide__ export type other than that the peptide sequences reported will be put into a new column. This causes the resulting file to NOT be square. 
- -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | -|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|---------------------|----------------------------------|--------------|---------------|-----------|----------------|-------------|-----------------------|---------|----------|-------------|--------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK | EGLNVLQY#FISTHGAR | FATSDLNDLYR | IALASPDMIR | IPQESGGTK | LIPAGTGYAYHQDR | MGAEAIQALLK | NTLLHEQWCDLLEENSVDAVK | RVDYSGR | VADLFEAR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | | | | | | | | | | | | - - -2. __q_value_all__: This is similar to __q_value__ except all proteins will be reported (Not just leads). 
- -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | -|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|----------------------------------|----------------------------------|--------------|---------------|-----------|----------------|-------------|-----------------------|---------|----------|-------------|--------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK | EGLNVLQY#FISTHGAR | FATSDLNDLYR | IALASPDMIR | IPQESGGTK | LIPAGTGYAYHQDR | MGAEAIQALLK | NTLLHEQWCDLLEENSVDAVK | RVDYSGR | VADLFEAR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 2 | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | -| BRAF_HUMAN\|P15056 | 35.24577101017814 | 0.0 | 3 | Reviewed | 2 | GYLSPDLSK | QTAQGMDYLHAK | VFLPNKQR | | | | | | | | | | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 3 | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | -| BRAF_HUMAN\|P15056 | 35.24577101017814 | 0.0 | 3 | Reviewed | 3 | GYLSPDLSK | QTAQGMDYLHAK | VFLPNKQR | | | | | | | | | | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | -| B3KX72_HUMAN\|B3KX72 | 15.316094065486292 | 0.0 | 2 | Unreviewed | 5 | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | -| 
Q96BA7_HUMAN\|Q96BA7 | 6.907755278982137 | 0.0 | 1 | Unreviewed | 5 | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | | | | | | | | | | | | - -3. __q_value_comma_sep__: This is similar to __q_value__ except proteins in the group of a lead will be listed in a separate column denoted "Other_Potential_Identifiers". Peptide identifiers are not shown. - -| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Other_Potential_Identifiers | | -|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|-----------------------------|----------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | | | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | ARAF_HUMAN\|P10398 | BRAF_HUMAN\|P15056 | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | RAF1_HUMAN\|P04049 | BRAF_HUMAN\|P15056 | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | | | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | B3KX72_HUMAN\|B3KX72 | Q96BA7_HUMAN\|Q96BA7 | -| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | | | - -4. __leads__: This is similar to __q_value__ except Q values are not reported and only proteins passing the fdr threshold set in the parameters will be reported. 
- -| Protein | Score | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | -|---------------------|--------------------|--------------------|-----------------|---------|---------------------|----------------------------------|--------------|---------------|-----------|----------------|-------------|-----------------------|---------|----------|-------------|--------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 12 | Reviewed | {1} | CGVEVTQTK | EGLNVLQY#FISTHGAR | FATSDLNDLYR | IALASPDMIR | IPQESGGTK | LIPAGTGYAYHQDR | MGAEAIQALLK | NTLLHEQWCDLLEENSVDAVK | RVDYSGR | VADLFEAR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 2 | Reviewed | {4} | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 2 | Reviewed | {5} | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | - -5. __all__: This is similar to __q_value_all__ except Q values are not reported and only proteins passing the fdr threshold set in the parameters will be reported. 
- -| Protein | Score | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | -|----------------------|--------------------|--------------------|-----------------|---------|----------------------------------|----------------------------------|-------------|-----------------------|-------------------|-----------|----------------|-------------|--------------------|------------|---------|----------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 12 | Reviewed | {1} | FATSDLNDLYR | MGAEAIQALLK | IPQESGGTK | NTLLHEQWCDLLEENSVDAVK | EGLNVLQY#FISTHGAR | CGVEVTQTK | LIPAGTGYAYHQDR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | IALASPDMIR | RVDYSGR | VADLFEAR | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | SASEPSLHR | FQMFQLIDIAR | WHGDVAVKILK | QTAQGMDYLHAK | CQTCGYKFHEHCSTK | VFLPNKQR | | | | | | | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | SASEPSLHR | QTAQGMDYLHAK | GYLSPDLSK | TFFSLAFCDFCLK | | | | | | | | | -| BRAF_HUMAN\|P15056 | 35.24577101017814 | 3 | Reviewed | {2, 3} | QTAQGMDYLHAK | GYLSPDLSK | VFLPNKQR | | | | | | | | | | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | SASEPSLHR | QTAQGMDYLHAK | GYLSPDLSK | TFFSLAFCDFCLK | | | | | | | | | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | SASEPSLHR | FQMFQLIDIAR | WHGDVAVKILK | QTAQGMDYLHAK | CQTCGYKFHEHCSTK | VFLPNKQR | | | | | | | -| BRAF_HUMAN\|P15056 | 35.24577101017814 | 3 | Reviewed | {2, 3} | QTAQGMDYLHAK | GYLSPDLSK | VFLPNKQR | | | | | | | | | | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 2 | Reviewed | {4} | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 2 | Reviewed | {5} | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | -| B3KX72_HUMAN\|B3KX72 | 15.316094065486292 | 2 | Unreviewed | {5} | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | -| Q96BA7_HUMAN\|Q96BA7 | 6.907755278982137 | 1 | Unreviewed | 
{5} | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | | - - -6. __comma_sep__: This is similar to __q_value_comma_sep__ except Q values are not reported and only proteins passing the fdr threshold set in the parameters will be reported. Peptide identifiers are not shown. - -| Protein | Score | Number_of_Peptides | Identifier_Type | GroupID | Other_Potential_Identifiers | | -|---------------------|--------------------|--------------------|-----------------|---------|-----------------------------|----------------------| -| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 12 | Reviewed | {1} | | | -| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | ARAF_HUMAN\|P10398 | BRAF_HUMAN\|P15056 | -| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | RAF1_HUMAN\|P04049 | BRAF_HUMAN\|P15056 | -| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 2 | Reviewed | {4} | | | -| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 2 | Reviewed | {5} | B3KX72_HUMAN\|B3KX72 | Q96BA7_HUMAN\|Q96BA7 | +6. Running with docker + - Either Pull the image from docker hub: + - `docker pull hinklet/pyproteininference:latest` + - Or Build the image with the following command (After having cloned the repository): + - `git clone REPOSITORY_URL` + - `cd pyproteininference` + - `docker build -t pyproteininference:latest .` + - Run the tool, making sure to volume mount in the directory with your input data and parameters. In the case below, that local directory would be `/path/to/local/directory` and the path in the container is `/data` + ```shell + docker run -v /path/to/local/directory/:/data \ + -it hinklet/pyproteininference:latest \ + python /usr/local/bin/protein_inference_cli.py \ + -f /data/input_file.txt \ + -db /data/database.fasta \ + -y /data/parameters.yaml \ + -o /data/ + ``` + +# Documentation +For more information please see the full package documentation (https://thinkle12.github.io/pyproteininference/). 
diff --git a/docs/advanced.md b/docs/advanced.md new file mode 100644 index 0000000..79db7ca --- /dev/null +++ b/docs/advanced.md @@ -0,0 +1,332 @@ +## Running Py Protein Inference + 1. [__Main Inference Method__](#running-the-main-py-protein-inference-method) + 2. [__Heuristic__](#running-the-heuristic-method) + +### Running the Main Py Protein Inference Method + +#### Running Via Command Line +Upon proper installation of the package, the command line tool should be installed and _should_ be available from any location on the system. +The command line tool can be called as follows: +```shell +protein_inference_cli.py --help +``` +This will return the help prompt for the tool. +If this does not work download `protein_inference_cli.py` from our repository and write the full path to the script while also calling `python`: +```shell +python /path/to/directory/pyproteininference/scripts/protein_inference_cli.py --help +``` + +Command line options are as follows: +``` +cli$ python protein_inference_cli.py --help +usage: protein_inference_cli.py [-h] [-t FILE [FILE ...]] [-d FILE [FILE ...]] + [-f FILE [FILE ...]] [-o DIR] [-l FILE] + [-a DIR] [-b DIR] [-c DIR] [-db FILE] + [-y FILE] [-p] [-i] + +Protein Inference + +optional arguments: + -h, --help show this help message and exit + -t FILE [FILE ...], --target FILE [FILE ...] + Input target psm output from percolator. Can either + input one file or a list of files. + -d FILE [FILE ...], --decoy FILE [FILE ...] + Input decoy psm output from percolator. Can either + input one file or a list of files. + -f FILE [FILE ...], --combined_files FILE [FILE ...] + Input combined psm output from percolator. This should + contain Target and Decoy PSMS. Can either input one + file or a list of files. + -o DIR, --output DIR Result Directory to write to - the name of file will + be determined by parameters selected and tag + parameter. If this option is not set, will write + results to current working directory. 
+ -l FILE, --output_filename FILE + Filename to write results to. Can be left blank. If + this flag is left blank the filename will be + automatically generated. If set this flag will + override -o. + -a DIR, --target_directory DIR + Directory that contains either .txt or .tsv input + target psm data. Make sure the directory ONLY contains + result files. + -b DIR, --decoy_directory DIR + Directory that contains either .txt or .tsv input + decoy psm data. Make sure the directory ONLY contains + result files. + -c DIR, --combined_directory DIR + Directory that contains either .txt or .tsv input data + with targets/decoys combined. Make sure the directory + ONLY contains result files. + -db FILE, --database FILE + Path to the fasta formatted database used in the MS + search. This is optional. If not set, will use the + proteins only in the input files. + -y FILE, --yaml_params FILE + Path to a Protein Inference Yaml Parameter File. If + this is not set, default parameters will be used. + -p, --skip_append_alt + Advanced usage only. If this flag is set, will skip + adding alternative proteins to each PSM from the + database digest. If this flag is not set, the + peptide/protein mapping will be taken from database + digest and appended to the mapping present in the + input files. + -i, --id_splitting Advanced usage only. If set this flag will split + protein identifiers.If not set, this flag will not + split protein identifiers.Sometimes the fasta database + protein IDs are formatted as: 'sp|ARAF_HUMAN|P10398'. + While protein IDs in the input files are formatted as + 'ARAF_HUMAN|P10398'. Setting This flag will split off + the front 'sp|' or 'tr|' from the database protein + identifiers. + +``` + +The following combinations of input are allowed and at least one combination is required: + +1. `-t -d` Path to input target (`-t`) and decoy (`-d`) files. This can be one target and one decoy file or multiple files separated by spaces (" "). 
See [here](input_format.md#standard-percolator-output) for information on target/decoy input files. +2. `-a -b` Path to input target (`-a`) and decoy (`-b`) directories that contain target and decoy files. This is one directory each and all .txt and .tsv files will be read in as input. +3. `-f` Path to input combined target/decoy (`-f`) files. This can be one file or multiple files separated by spaces (" "). +4. `-c` Path to input combined target/decoy (`-c`) directory that contain combined target/decoy files. This is one directory each and all .txt and .tsv files will be read in as input. + +Any other combinations will result in an Error raised. + +Optional flags + +1. `-db` Path to Fasta Database file. +2. `-y` Path to Protein Inference Yaml Parameter file. (If this is not supplied default parameters will be used). +3. `-o` Path to the output directory, if this is left blank files will be written to the current working directory. +4. `-l` Path to the output filename, if this is left blank a filename will be automatically generated and will be written to directory as set in `-o`. Will override `-o` flag if set. + +Advanced usage flags + +1. `-p` This flag is a True/False on whether to skip appending alternative proteins from the Fasta database digestion. If this flag is left blank, it will not skip appending alternative proteins (recommended). +2. `-i` True/False on whether to split the IDs in the Fasta database file. If this is left blank, it will not split IDs in the Fasta database file (recommended). 
+ +You can run the tool as follows: +```shell +protein_inference_cli.py \ + -t /path/to/target/file.txt \ + -d /path/to/decoy/file.txt \ + -db /path/to/database/file.fasta \ + -y /path/to/parameter/file.yaml \ + -o /path/to/output/directory/ +``` + +Running with multiple input target/decoy files: +```shell +protein_inference_cli.py \ + -t /path/to/target/file1.txt /path/to/target/file2.txt \ + -d /path/to/decoy/file1.txt /path/to/decoy/file2.txt \ + -db /path/to/database/file.fasta \ + -y /path/to/parameter/file.yaml \ + -o /path/to/output/directory/ +``` + + +#### Running Within Python +To run within a python console please see the following example: +```python +from pyproteininference.pipeline import ProteinInferencePipeline + +yaml_params = "/path/to/yaml/params.yaml" +database = "/path/to/database/file.fasta" +### target_files can either be a list of files or one file +target_files = ["/path/to/target1.txt","/path/to/target2.txt"] +### decoy_files can either be a list of files or one file +decoy_files = ["/path/to/decoy1.txt","/path/to/decoy2.txt"] +output_directory_name = "/path/to/output/directory/" + +pipeline = ProteinInferencePipeline(parameter_file=yaml_params, + database_file=database, + target_files=target_files, + decoy_files=decoy_files, + combined_files=None, + output_directory=output_directory_name) +# Calling .execute() will initiate the pipeline with the given data +pipeline.execute() +``` + +### Running the Heuristic Method +Py Protein Inference also has a built-in Heuristic that runs through four inference methods (Inclusion, Exclusion, Parsimony, and Peptide Centric) and selects a recommended method for your given dataset. +By default, all four result files will be written, and the optimal method will be highlighted to the user. +The Heuristic method also outputs a density plot that showcases all the inference methods compared to one another to gain further insight. 
For more information on the Heuristic Method see the [__Heuristic algorithm__](supplementary.md#heuristic-algorithm) section. + +#### Running the Heuristic Method via the Command Line +```shell +python protein_inference_heuristic_cli.py --help +``` +This will return the help prompt for the tool. +If this does not work download `protein_inference_heuristic_cli.py` from the repository and write the full path to the script while also calling `python`. +```shell +python /path/to/directory/pyproteininference/scripts/protein_inference_heuristic_cli.py --help +``` + +Command line options are as follows: +``` +cli$ python protein_inference_heuristic_cli.py --help +usage: protein_inference_heuristic_cli.py [-h] [-t FILE [FILE ...]] + [-d FILE [FILE ...]] + [-f FILE [FILE ...]] [-o DIR] + [-l FILE] [-a DIR] [-b DIR] [-c DIR] + [-db FILE] [-y FILE] [-p] [-i] + [-r FILE] [-m FLOAT] [-u STR] + +Protein Inference Heuristic + +optional arguments: + -h, --help show this help message and exit + -t FILE [FILE ...], --target FILE [FILE ...] + Input target psm output from percolator. Can either + input one file or a list of files. + -d FILE [FILE ...], --decoy FILE [FILE ...] + Input decoy psm output from percolator. Can either + input one file or a list of files. + -f FILE [FILE ...], --combined_files FILE [FILE ...] + Input combined psm output from percolator. This should + contain Target and Decoy PSMS. Can either input one + file or a list of files. + -o DIR, --output DIR Result Directory to write to - the name of file will + be determined by parameters selected and tag + parameter. If this option is not set, will write + results to current working directory. + -l FILE, --output_filename FILE + Filename to write results to. Can be left blank. If + this flag is left blank the filename will be + automatically generated. If set this flag will + override -o. + -a DIR, --target_directory DIR + Directory that contains either .txt or .tsv input + target psm data. 
Make sure the directory ONLY contains + result files. + -b DIR, --decoy_directory DIR + Directory that contains either .txt or .tsv input + decoy psm data. Make sure the directory ONLY contains. + result files. + -c DIR, --combined_directory DIR + Directory that contains either .txt or .tsv input data + with targets/decoys combined. Make sure the directory + ONLY contains result files. + -db FILE, --database FILE + Path to the fasta formatted database used in the MS + search. This is optional. If not set, will use the + proteins only in the input files. + -y FILE, --yaml_params FILE + Path to a Protein Inference Yaml Parameter File. If + this is not set, default parameters will be used. + -p, --skip_append_alt + Advanced usage only. If this flag is set, will skip + adding alternative proteins to each PSM from the + database digest. If this flag is not set, the + peptide/protein mapping will be taken from database + digest and appended to the mapping present in the + input files. + -i, --id_splitting Advanced usage only. If set this flag will split + protein identifiers.If not set, this flag will not + split protein identifiers.Sometimes the fasta database + protein IDs are formatted as: 'sp|ARAF_HUMAN|P10398'. + While protein IDs in the input files are formatted as + 'ARAF_HUMAN|P10398'. Setting This flag will split off + the front 'sp|' or 'tr|' from the database protein + identifiers. + -r FILE, --pdf_filename FILE + PDF Filepath to write the Heuristic plot to after + Heuristic Scoring. If not set, writes the file with + filename heuristic_plot.pdf to directory set in -o. If -o is + not set, will write the file to current working + directory. + -m FLOAT, --fdr_threshold FLOAT + The FDR threshold to use in the Heuristic Method. + Defaults to 0.05 if not set. + -u STR, --output_type STR + The type of output to be written. Can either be 'all' + or 'optimal'. If set to 'all' will output all + inference results. 
If set to 'optimal' will output + only the result selected by the heuristic method. If + left blank this will default to 'all'. +``` + +Input options are the same as the standard protein_inference_cli.py with the addition of three optional inputs: +1. `-r` This is a filepath that will have a density plot written to it after the heuristic method has been run. If this is left blank, it will write the plot into the standard output directory with the name heuristic_plot.pdf +2. `-m` The FDR threshold to use in the Heuristic Method. The method will use values from 0 to the FDR threshold. If this value is left blank, it will be set to 0.05 +3. `-u` This is the type of output to be written after the heuristic method is complete. Will either output all results or the optimal results. If all is selected, the optimal results will have the string "optimal_method" spliced into the filename. + +You can run the tool as follows: +```shell +protein_inference_heuristic_cli.py \ + -t /path/to/target/file.txt \ + -d /path/to/decoy/file.txt \ + -db /path/to/database/file.fasta \ + -y /path/to/parameter/file.yaml \ + -o /path/to/output/directory/ \ + -r /path/to/pdf/file.pdf \ + -m 0.05 +``` + +Running with multiple input target/decoy files: +```shell +protein_inference_heuristic_cli.py \ + -t /path/to/target/file1.txt /path/to/target/file2.txt \ + -d /path/to/decoy/file1.txt /path/to/decoy/file2.txt \ + -db /path/to/database/file.fasta \ + -y /path/to/parameter/file.yaml \ + -o /path/to/output/directory/ \ + -r /path/to/pdf/file.pdf \ + -m 0.05 +``` + + +#### Running the Heuristic Method via Python +To run within a python console please see the following example: +```python +from pyproteininference.heuristic import HeuristicPipeline + +yaml_params = "/path/to/yaml/params.yaml" +database = "/path/to/database/file.fasta" +### target_files can either be a list of files or one file +target_files = ["/path/to/target1.txt","/path/to/target2.txt"] +### decoy_files can either be a list of 
files or one file +decoy_files = ["/path/to/decoy1.txt","/path/to/decoy2.txt"] +output_directory_name = "/path/to/output/directory/" +pdf_filename = "/path/to/output/directory/heuristic_plot.pdf" + +hp = HeuristicPipeline(parameter_file=yaml_params, + database_file=database, + target_files=target_files, + decoy_files=decoy_files, + combined_files=None, + output_directory=output_directory_name, + pdf_filename=pdf_filename, + output_type="all") +# Calling .execute() will initiate the heuristic pipeline with the given data +# The suggested method will be output in the console and the suggested method results will be written into the output_directory +hp.execute(fdr_threshold=0.05) + +# The optimal inference method and density plot can be generated separately as well with the following to specify thresholds directly: +hp.determine_optimal_inference_method(false_discovery_rate_threshold=0.05, + upper_empirical_threshold=1, + lower_empirical_threshold=.5, + pdf_filename=None) + +``` + +#### Heuristic Output Example + +##### Console Output +Console Output is as follows and indicates the recommended method at the end: +```shell +2022-05-12 17:28:38,413 - pyproteininference.heuristic - INFO - Heuristic Scores +2022-05-12 17:28:38,413 - pyproteininference.heuristic - INFO - {'inclusion': 1.2145313335009247, 'exclusion': 1.053616485888155, 'parsimony': 0.5416878942666304, 'peptide_centric': 0.24465822235367252} +2022-05-12 17:28:38,413 - pyproteininference.heuristic - INFO - Either parsimony 0.5416878942666304 or peptide centric 0.24465822235367252 pass empirical threshold 0.5. Selecting the best method of the two. +2022-05-12 17:28:38,413 - pyproteininference.heuristic - INFO - Method peptide_centric selected with the heuristic algorithm +``` + +##### Heuristic Density Plot Output +Below is an example of a Heuristic Density plot. 
The plot indicates the distribution of the number of standard deviations +from the mean (of identified proteins at a specified FDR) for each inference method for a range of FDRs from 0 to the false discovery rate threshold (100 fdrs are incrementally selected in the range [0, fdr threshold]) +In general, the closer that the peak of a distribution is to 0 the more likely the associated method is to be selected as the recommended method. +For more information on the specifics of the Heuristic Algorithm see [__Heuristic Algorithm Description__](supplementary.md#heuristic-algorithm-description) + +![density](img/swissprot_example_density.png) diff --git a/images/exclusion.png b/docs/img/exclusion.png similarity index 100% rename from images/exclusion.png rename to docs/img/exclusion.png diff --git a/images/inclusion.png b/docs/img/inclusion.png similarity index 100% rename from images/inclusion.png rename to docs/img/inclusion.png diff --git a/images/parsimony.png b/docs/img/parsimony.png similarity index 100% rename from images/parsimony.png rename to docs/img/parsimony.png diff --git a/images/peptide_centric.jpeg b/docs/img/peptide_centric.jpeg similarity index 100% rename from images/peptide_centric.jpeg rename to docs/img/peptide_centric.jpeg diff --git a/docs/img/swissprot_example_density.png b/docs/img/swissprot_example_density.png new file mode 100644 index 0000000..f082b4b Binary files /dev/null and b/docs/img/swissprot_example_density.png differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..09d9dc2 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,43 @@ +## Introduction + + +For a quick start guide please click [here](quickstart.md). + +Py Protein Inference is a Python package that has the ability to run various protein inference algorithms on tandem mass spectrometry search results. 
+In addition to performing protein inference, which maps peptides to proteins, this algorithm creates protein scores based on the supplied psms and is able to calculate set based protein level false discovery rates for MS data filtering purposes. +Py Protein Inference typically takes as input the output PSM files from the [Percolator algorithm](https://github.com/percolator/percolator). +However, Py Protein Inference can also take custom tab delimited files as input. Please see the [input formats](input_format.md) section for more information. +As for output, Py Protein Inference generates a user-friendly csv format file that includes the Proteins, Peptides, q-values, and Protein Scores. Please see the [supplementary](supplementary.md#export-types) section for more information on output formats. +Py Protein Inference also has a heuristic method to help the user select a recommended inference method for a given dataset. Please see the [supplementary](supplementary.md#heuristic-algorithm) section for more information on the heuristic method. + +Py Protein Inference has the ability to run any of the following inference procedures from literature: + +1. [Parsimony](supplementary.md#parsimony) +2. [Exclusion](supplementary.md#exclusion) +3. [Inclusion](supplementary.md#inclusion) +4. [Peptide Centric](supplementary.md#peptide-centric) (Protein Group Level) + +Please see the [__Inference Types__](supplementary.md#inference-types) section for more information on Inference Types. + +In Addition to these inference types Py Protein Inference can also score proteins with a variety of methods: + +1. Best Peptide Per Protein + - Takes the best scoring PSM as the protein score. +2. Multiplicative Log + - Multiplies each PSM score per protein and takes -log10 of the combined score (smaller psm scores must be better i.e. Pep or Q values). +3.
Top Two Combined + - Takes the top two best scoring PSMs and multiplies or adds them together based on if the selected psm score is Pep/Q value style or Mascot Ion Score style. +4. Additive + - Adds each PSM score per protein (larger psm scores must be better i.e. Xcorr, Mascot Ion Score, Percolator Score). +5. Downweighted Multiplicative Log + - Multiplicative Log but with a normalization against the number of PSMs. +6. Geometric Mean + - Geometric Mean algorithm scoring. + +Please see the [__Protein Score Types__](supplementary.md#protein-score-types) section for more information on scoring algorithms. + +## Using Py Protein Inference + 1. [Yaml Parameter File](parameters.md#yaml-parameter-file-outline) + 2. [Input PSM files](input_format.md#input-file-examples) (Tab Delimited) + 3. [Fasta Database](input_format.md#fasta-file-example) + 4. [Running Py Protein Inference](advanced.md#running-py-protein-inference) \ No newline at end of file diff --git a/docs/input_format.md b/docs/input_format.md new file mode 100644 index 0000000..0ea92b8 --- /dev/null +++ b/docs/input_format.md @@ -0,0 +1,51 @@ +## Input File Examples +The standard input filetype is the tab delimited output from the percolator algorithm. 
Please see below for examples of input files: +### Standard Percolator Output + +#### Target Output +| PSMid | score | q-value | posterior_error_prob | peptide | proteinIds | | | | +|---|---|---|---|---|---|---|---|---| +| 1.1 | 7.5 | 0.0048 | 0.0007 | R.NYIQSLTQMPK.M | MK14_HUMAN\|Q16539 | MK14_HUMAN\|Q16539-2 | MK14_HUMAN\|Q16539-3 | | +| 1.2 | 6.2 | 0.0035 | 0.0006 | R.NTVASSSRSM*R.T | FHDC1_HUMAN\|Q9C0D6 | | | | + + +#### Decoy Output +| PSMid | score | q-value | posterior_error_prob | peptide | proteinIds | | | | +|---|---|---|---|---|---|---|---|---| +| 1.1 | 1.3 | 0.18 | 0.27 | R.RSTITSRE.M | decoy_MK14_HUMAN\|Q16539 | decoy_MK14_HUMAN\|Q16539-2 | decoy_MK14_HUMAN\|Q16539-3 | | +| 1.2 | 0.9 | 0.35 | 0.36 | R.KKRKRSRKEM*R.T | decoy_FHDC1_HUMAN\|Q9C0D6 | | | | + +Decoy proteins should have some sort of decoy identifier to distinguish between target and decoy proteins. This is typically "decoy_" or "##". See the decoy symbol parameter option [here](parameters.md#identifiers) for more information. +These can also be combined into one file called "Combined Output". + +With the above standard input one could use __q-value__ or __posterior_error_prob__ as the PSM score. See [Score Section](parameters.md#score) of the parameter file explanation with __multiplicative__ as __psm_score_type__ and any of the multiplicative options for __protein_score__. + +For example standard input files please see any of the following files from the our repository: + +- `tests/data/test_perc_data_target.txt` +- `tests/data/test_perc_data_decoy.txt` + +### Custom Input +| PSMid | custom_score | peptide | proteinIds | | +|---|---|---|---|---| +| 1.1 | 7.5 | R.NYIQSLTQMPK.M | MK14_HUMAN\|Q16539 | MK14_HUMAN\|Q16539-2 | MK14_HUMAN\|Q16539-3 | | +| 1.2 | 6.2 | R.NTVASSSRSM*R.T | FHDC1_HUMAN\|Q9C0D6 | | | + +With the above custom input one could use __custom_score__ as the PSM __psm_score__ with __additive__ as the __psm_score_type__ and __protein_score__. 
+ +For example custom input files please see any of the following files from our repository: + +- `tests/data/test_perc_data_target_additive.txt` +- `tests/data/test_perc_data_decoy_additive.txt` +- `tests/data/test_perc_data_target_multiplicative.txt` +- `tests/data/test_perc_data_decoy_multiplicative.txt` + +### Fasta File +This package was developed using standard Fasta files from [Uniprot](https://www.uniprot.org/). +Please see an example entry in a Fasta database below: +```text +>sp|Q5QNW6|H2B2F_HUMAN Histone H2B type 2-F OS=Homo sapiens OX=9606 GN=H2BC18 PE=1 SV=3 +MPDPAKSAPAPKKGSKKAVTKVQKKDGKKRKRSRKESYSVYVYKVLKQVHPDTGISSKAM +GIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVT +KYTSSK +``` \ No newline at end of file diff --git a/docs/parameters.md b/docs/parameters.md new file mode 100644 index 0000000..4479606 --- /dev/null +++ b/docs/parameters.md @@ -0,0 +1,104 @@ +# Yaml Parameter File Outline +The Yaml Parameter File is the central location for all configurations for a given Protein Inference run; these are summarized below: +Note: These parameters are all optional. Please see the section [Default Parameters](#default-parameters) for more information on defaults. +## General +| Parameter | Description |Type| +|---|---|---| +| export | Export Type can be one of: __peptides__, __psms__, __psm_ids__, __long__, __q_value__, __q_value_all__, __q_value_comma_sep__, __leads__, __all__, __comma_sep__. Suggested types are __peptides__, __psms__, and __psm_ids__ as these produce square output. If there are multiple proteins per group the three mentioned types will report the leads only. Other types report on the peptide level with slightly different formats and whether or not to include leads only or all proteins. See [here](supplementary.md#export-types) for an in-depth explanation of Export Types. | String | +| fdr | False Discovery Rate to be marked as significant. Ex. __0.01__ for 1% FDR.
| Numeric | +| picker | __True__/__False__ on whether to run the Protein Picker algorithm. For more info click [here](supplementary.md#protein-picker). | Bool | +| tag | A String tag that will be written into the result files. Ex. __example_tag__. | String | + +## Data Restriction +| Parameter | Description |Type| +|---|---|---| +| pep_restriction | Posterior Error Probability values to filter (i.e. __0.9__). In this case PSMs with PEP values greater than __0.9__ would be removed from the input. If PEP values are not in input please use __None__. | Numeric | +| peptide_length_restriction | Peptide Length to filter on. (i.e. __7__). If no filter is wanted please use __None__. | Int | +| q_value_restriction | Q Values to filter. (i.e. __0.2__). In this case PSMs with Q Values greater than __0.2__ would be removed from the input. If Q Values are not in input please use __None__ . | Numeric | +| custom_restriction | Custom Value to filter. (i.e. __5__). In this case PSMs with Custom value greater than / less than __5__ would be removed from the input. If Not using a custom score please use __None__. __NOTE__: If a higher score is "better" for your score please set __psm_score_type__ to __additive__. If a lower score is "better" please set __psm_score_type__ parameter to __multiplicative__. | Numeric | + +## Score +| Parameter | Description |Type| +|---|---|---| +| protein_score | One of any of the following: __multiplicative_log__, __best_peptide_per_protein__, __top_two_combined__, __additive__, __iterative_downweighted_log__, __downweighted_multiplicative_log__, __geometric_mean__. Recommended: __multiplicative_log__. | String | +| psm_score | PSM score to use for Protein Scoring. If using Percolator output as input this would either be __posterior_error_prob__ or __q-value__. The string typed here should match the column in your input files __EXACTLY__. 
If using a custom score it will be filtered accordingly with the value in [__custom_restriction__](#data-restriction). | String | +| psm_score_type | The Type of score that __psm_score__ parameter is. Either __multiplicative__ or __additive__. If a larger psm score is "better" then input additive (i.e. Mascot Ion Score, Xcorr, Percolator Score). If a smaller psm score is "better" then input multiplicative (i.e. Q Value, Posterior Error Probability). See [below](#extra-score-information) for more information.| String | + +#### Extra Score information + 1. The __protein_score__, __psm_score__, and __psm_score_type__ methods must be compatible. + 2. If using a PSM score (__psm_score__ parameter) where the lower the score the better (i.e. __posterior_error_prob__ or __q-value__) then any __protein_score__ can be used except __additive__. __psm_score_type__ must also be set to __multiplicative__. + 3. If using a PSM score (__psm_score__ parameter) where the higher the score the better (i.e. Percolator Score, Mascot Ion Score, Xcorr) (Percolator Score is called __psm_score__ - column name) in the tab delimited percolator output. Then __protein_score__ and __psm_score_type__ must both be __additive__. + +## Identifiers +| Parameter | Description |Type| +|---|---|---| +| decoy_symbol | Symbol within Decoy Identifiers to distinguish between targets. (i.e "__##__" or "__decoy___"). This is important for [Protein Picker](supplementary.md#protein-picker) and FDR calculation. | String | +| isoform_symbol | Symbol that is present in isoform proteins only. (i.e. "__-__"). See [below](#extra-identifier-information) for more information. | String | +| reviewed_identifier_symbol | Identifier to determine a reviewed vs unreviewed identifier. (i.e. "__sp\|__"). See [below](#extra-identifier-information) for more information. | String | + +#### Extra Identifier information + 1.
For the __decoy_symbol__ an example of a target protein -> __ex|protein__ and its decoy counterpart could be any of the following: __##ex|##protein__, __##ex|protein__, __decoy_ex|protein__. The decoy symbol just needs to be present within the string to be determined a decoy. + 2. For __isoform_symbol__ and __reviewed_identifier_symbol__, these are used to assign priority in certain algorithms such as parsimony. For example, if we have canonical proteins, isoform proteins, and reviewed/unreviewed proteins in a given analysis; the priority would be established as follows: Reviewed Canonical, Reviewed Isoform, Unreviewed. This means that if two proteins map to the same peptides, the algorithm has to make a decision on which to pick. It would use the previously mentioned priority to pick the protein lead to report. + +## Inference +| Parameter | Description |Type| +|---|---|---| +| inference_type | The Inference procedure to apply to the analysis. This can be __parsimony__, __inclusion__, __exclusion__, __peptide_centric__, or __first_protein__. Please see [here](supplementary.md#inference-types) for more information on the inference types. | String | +| grouping_type | How to group proteins for a given __inference_type__. This can be __subset_peptides__, __shared_peptides__, or __None__. Typically __subset_peptides__ is used. This parameter only affects grouped proteins and has no impact on protein leads. | String | + +## Digest +| Parameter | Description |Type| +|---|---|---| +| digest_type | The enzyme used for digestion for the MS searches. (i.e. __trypsin__). For reference, the database digestion is handled with pyteomics. Can be any expasy rule as defined [here](https://pyteomics.readthedocs.io/en/latest/_modules/pyteomics/parser.html) other common examples include: __trypsin__, __chymotrypsin high specificity__, __chymotrypsin low specificity__, __lysc__. | String | +| missed_cleavages | The number of missed cleavages allowed for the MS searches. (i.e.
__2__) | Int | + +## Parsimony +These parameters are only used if __parsimony__ is selected as __inference_type__. + +| Parameter | Description |Type| +|---|---|---| +| lp_solver | This can be one of: __pulp__ or __None__. This determines which linear program solver is used. Please see [here](supplementary.md#parsimony-dependencies) for more information on lp solvers. Input __None__ if not running __parsimony__. If running __parsimony__ this needs to be set to __pulp__. | String | +| shared_peptides | How to assign shared peptides for parsimony. Can be one of: __all__ or __best__. __all__ assigns shared peptides to all possible proteins in the output. __best__ assigns shared peptides to the best scoring protein which is a "winner take all" approach. This is specific to the Parsimony Inference type. | String | + + +## Peptide Centric +These parameters are only used if __peptide_centric__ is selected as __inference_type__. + +| Parameter | Description | Type | +|---|---|---| +| max_identifiers | The maximum number of proteins a peptide is allowed to map to. (i.e. __5__). This serves to limit the number of protein groups that can be created due to highly homologous peptides. 
| Int | + + +## Default Parameters +```yaml +parameters: + general: + export: peptides + fdr: 0.01 + picker: True + tag: py_protein_inference + data_restriction: + pep_restriction: 0.9 + peptide_length_restriction: 7 + q_value_restriction: 0.005 + custom_restriction: None + score: + protein_score: multiplicative_log + psm_score: posterior_error_prob + psm_score_type: multiplicative + identifiers: + decoy_symbol: "##" + isoform_symbol: "-" + reviewed_identifier_symbol: "sp|" + inference: + inference_type: peptide_centric + grouping_type: shared_peptides + digest: + digest_type: trypsin + missed_cleavages: 3 + parsimony: + lp_solver: pulp + shared_peptides: all + peptide_centric: + max_identifiers: 5 +``` diff --git a/docs/pyproteininference_reference.md b/docs/pyproteininference_reference.md new file mode 100644 index 0000000..9a05b35 --- /dev/null +++ b/docs/pyproteininference_reference.md @@ -0,0 +1,4 @@ +# Py Protein Inference Module + +::: pyproteininference + handler: python \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 0000000..7063a50 --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,65 @@ +# Requirements + +1. __Python 3.7__ or greater. This package was created using __Python 3.7__ +2. __Python Packages__: + __numpy__, __pyteomics__, __pulp__, __PyYAML__, __matplotlib__. These should be installed automatically during installation. + +# Quick Start Guide +1. Install the package using pip: + + pip install pyproteininference + +2. Run the Heuristic method with tab delimited results directly from percolator to generate results for peptide centric, parsimony, inclusion, and exclusion: + + protein_inference_heuristic_cli.py \ + -t /path/to/target/file1.txt \ + -d /path/to/decoy/file1.txt \ + -db /path/to/database/file.fasta + +3. Run the standard commandline tool with tab delimited results directly from percolator to run a particular inference method. 
By default, peptide centric inference is selected if a parameter file is not specified: + + protein_inference_cli.py \ + -t /path/to/target/file.txt \ + -d /path/to/decoy/file.txt \ + -db /path/to/database/file.fasta + +4. Specifying Parameters. +The two most common parameters to change are the inference type, and the decoy symbol (for identifying decoy proteins vs target proteins). +The parameters can be quickly altered by creating a file called params.yaml as follows: + + parameters: + inference: + inference_type: parsimony + identifiers: + decoy_symbol: "decoy_" + + The inference type can be one of: `parsimony`, `peptide_centric`, `inclusion`, `exclusion`, or `first_protein`. + All parameters are optional, so you only need to define the ones you want to alter. Parameters that are not defined are set to default values. + See [here](parameters.md#default-parameters) for the default parameters. + +5. Run the standard commandline tool again, this time specifying the parameters as above: + + protein_inference_cli.py \ + -t /path/to/target/file.txt \ + -d /path/to/decoy/file.txt \ + -db /path/to/database/file.fasta \ + -y /path/to/params.yaml + +6. Running with docker + + - Either Pull the image from docker hub: + - `docker pull hinklet/pyproteininference:latest` + - Or Build the image with the following command (After having cloned the repository): + - `git clone REPOSITORY_URL` + - `cd pyproteininference` + - `docker build -t pyproteininference:latest .` + - Run the tool, making sure to volume mount in the directory with your input data and parameters. 
In the case below, that local directory would be `/path/to/local/directory` and the path in the container is `/data` + + docker run -v /path/to/local/directory/:/data \ + -it hinklet/pyproteininference:latest \ + python /usr/local/bin/protein_inference_cli.py \ + -f /data/input_file.txt \ + -db /data/database.fasta \ + -y /data/parameters.yaml \ + -o /data/ + \ No newline at end of file diff --git a/docs/supplementary.md b/docs/supplementary.md new file mode 100644 index 0000000..473f7a6 --- /dev/null +++ b/docs/supplementary.md @@ -0,0 +1,229 @@ +## Supplementary Information + +### Heuristic Algorithm + +The Heuristic Algorithm contains multiple steps listed below: + +The heuristic is a conservative algorithm that attempts to select a method that is not over or under reporting protein IDs based on all four inference methods executed. +Empirical thresholds are set for Parsimony and Peptide Centric (lower threshold) as well as for Inclusion and Exclusion (upper threshold) which helps guide the decision making process for selecting a recommended method. These thresholds are separate for Inclusion and Exclusion because these two algorithms typically over report Protein IDs (Inclusion) or under report Protein IDs (Exclusion). + +1. First each of the four main inference methods is executed. +2. A finite number (100) of false discovery rates are generated within the range `[0, fdr_threshold]` by increments of `fdr_threshold * 0.01`. `fdr_threshold` is typically set to `0.05`. +3. We loop over each FDR and the following is done at each specified FDR: +4. The mean of the number of proteins identified at each FDR is calculated across all four methods. +5. The number of standard deviations each method is from the mean at each FDR is calculated. +6. The distribution of the number of standard deviations from the mean is plotted for each method on the same graph. See [Heuristic Plot](advanced.md#heuristic-density-plot-output) for an example. +7. 
The peak of each distribution (each inference method) is identified and the absolute value of the x-axis coordinate is taken of each value. These are the heuristic scores. +8. A recommended method is selected using the following conditions + - If the heuristic score for parsimony and peptide-centric is within 0.5 stdev of zero then both methods are recommended. If only one of the two methods is within 0.5 stdev of zero then that method is selected. If criteria is not met, continue to the next step. + - If the heuristic score for inclusion and exclusion is within 1 stdev of zero then both methods are recommended. If only one of the two methods is within 1 stdev of zero then that method is selected. (This step is only applicable if step 1 criteria is not met). + - If neither of these conditions above are met then the method with the heuristic score closest to zero (The least amount of stdev from the mean) is selected as the recommended method. + +### Inference Types + +#### Inclusion + +Inclusion simply maps all peptides to all possible proteins. In this model we allow peptides to map to multiple proteins. +This approach is useful if you want to analyze all possible proteins (even those that do not have any distinguishing peptides). + +![img/inclusion.png](img/inclusion.png) + +#### Parsimony + +Parsimony is the process of taking the list of peptides and mapping them to the minimal set of protein identifiers available. +This is a standard method that is good at limiting the overall number of proteins but still utilizing all selected peptides. +This method allows shared peptides to map to more than one protein. Assuming that the proteins the peptides get mapped to also contain at least one unique peptide across the searches. + +![img/parsimony.png](img/parsimony.png) + +#### Exclusion + +Exclusion maps all peptides to all possible proteins but removes any peptide from the search that is not distinguishing. 
This means that if a peptide maps to more than one protein it will be removed. With this inference model the database selection is very important. Ideally the database selected for searches should have limited redundancy. +The redundancy is computed on the database level, meaning if a peptide maps to more than one protein that is contained in the Fasta database then that peptide will be removed from the analysis entirely. +Exception: If two or more proteins within the database map to the exact same set of digested peptides the algorithm will select the first listed protein and discard the others. + +![img/exclusion.png](img/exclusion.png) + +#### Peptide Centric + +For Peptide Centric inference all peptides are assigned to all possible proteins. Each peptide is then assigned a protein group based on the mentioned possible protein map. For protein group naming, the possible proteins for the peptides in the group are concatenated to a list separated by a semi-colon. +This method is useful when there are a lot of shared peptides between multiple proteins. This will create new protein groups based on the shared peptides. This can sometimes more accurately represent the biological state. + +![img/peptide_centric.jpeg](img/peptide_centric.jpeg) + +#### First Protein + +For the First Protein inference method each peptide gets assigned to one protein only. The protein that gets assigned to each peptide is the first listed protein. This is typically the first protein listed in the Fasta database file. +This method is very simplistic but useful if you just want a quick way to get a sense of the protein FDR and are not worried as much about the peptide to protein mapping. + + +### Protein Picker + +[Protein Picker](https://www.ncbi.nlm.nih.gov/pubmed/25987413) is an algorithm that treats target and decoy proteins as pairs and is essentially target/decoy competition. 
If both the target and decoy proteins are identified from the searches when protein picker is run, then the target and decoy scores are compared with one another. The one with the better score is kept to continue on in the analysis while the one with the worse score gets filtered out of the analysis. This algorithm is integrated into other tools such as [Percolator Protein Inference](https://www.ncbi.nlm.nih.gov/pubmed/27572102). + +### Protein Score Types + +|Score Type| Description | +|---|---| +| Best Peptide Per Protein | Uses the best scoring PSM as the overall score for a given protein. This can be beneficial to use when proteins of interest map to few peptides. | +| Multiplicative Log | Multiplies all of the PSM scores together and then takes the log of the value (This only works for psm scores where lower is better). | +| Top Two Combined | Takes the two best scoring peptides and applies Multiplicative Log to them to get the protein score. | +| Additive | Adds all of the PSM scores together (This only works for scores where higher is better). | +| Downweighted Multiplicative Log | Multiplicative Log but normalizes by the number of PSMs per protein. This method attempts to normalize small and large proteins. In a typical multiplicative log a larger protein would have an advantage at having a higher score than a small protein. | +| Geometric Mean | Takes the geometric mean of the PSMs per protein. | + +### Export Types + +It is advised to use the Standard Export Types OR __q_value__ + +#### Standard Export Types + +All standard export types report back protein leads if there is protein sub-setting from the specific tool. (Parsimony can provide protein subsets but in these export types we only choose to report the group lead). +The tables below represent what the output of each export type will look like. Exports are all in CSV format. 
+ +- __peptide__: This is a standard export type that reports back Protein, Score, Qvalue, and Peptide sequences in a square format. By default, peptide sequences are separated by a space " ". + +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | +|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK EGLNVLQY#FISTHGAR FATSDLNDLYR IALASPDMIR IPQESGGTK LIPAGTGYAYHQDR MGAEAIQALLK NTLLHEQWCDLLEENSVDAVK RVDYSGR VADLFEAR VIDIWAAANDR VTAEDVLKPGTADILVPR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK FQMFQLIDIAR QTAQGMDYLHAK SASEPSLHR VFLPNKQR WHGDVAVKILK | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK QTAQGMDYLHAK SASEPSLHR TFFSLAFCDFCLK | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH YCWMSTGLYIPGR | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | + +- __psms__: This is a standard export type that reports back Protein, Score, Qvalue, and PSMs in a square format. By default, PSMs are separated by a space " ". 
+ +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | +|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK EGLNVLQY#FISTHGAR FATSDLNDLYR IALASPDMIR IPQESGGTK LIPAGTGYAYHQDR MGAEAIQALLK NTLLHEQWCDLLEENSVDAVK RVDYSGR VADLFEAR VIDIWAAANDR VTAEDVLKPGTADILVPR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK FQMFQLIDIAR QTAQGMDYLHAK SASEPSLHR VFLPNKQR WHGDVAVKILK | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK QTAQGMDYLHAK SASEPSLHR TFFSLAFCDFCLK | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH YCWMSTGLYIPGR | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | + +- __psm_ids__: This is a standard export type that reports back Protein, Score, Qvalue, and PSM Identifiers in a square format. By default, PSM IDs are separated by a space " ". Values in Peptides column will be the `PSMid` values from data input. 
+ +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | +|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|-------------------------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | 13 14 15 16 17 18 19 20 21 22 23 24 | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | 1 2 3 4 5 8 | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | 3 4 6 7 | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | 10 9 | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | 11 12 | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | 27 | + +- __long__: This is a standard export type that reports back Protein, Score, Qvalue, and PSM Identifiers in a long format. Each Peptide for each Protein is listed down the file This format is good for merging to exterior data to swap with the inference reference. 
+ +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | +|------------------------|--------------------|--------------------|--------------------|-----------------|---------|----------------------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | MGAEAIQALLK | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | NTLLHEQWCDLLEENSVDAVK | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | RVDYSGR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | FATSDLNDLYR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | EGLNVLQY#FISTHGAR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | LIPAGTGYAYHQDR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | VADLFEAR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | IPQESGGTK | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | IALASPDMIR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | VTAEDVLKPGTADILVPR | +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | VIDIWAAANDR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | QTAQGMDYLHAK | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | VFLPNKQR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | FQMFQLIDIAR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | SASEPSLHR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | WHGDVAVKILK | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | QTAQGMDYLHAK | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | 
TFFSLAFCDFCLK | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | SASEPSLHR | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | YCWMSTGLYIPGR | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | + + +#### Legacy Export Types: + +All Legacy export types report on the peptide level and can either report protein group leads, or it can report all proteins (Proteins that are subsets of another protein will be reported in these cases for Parsimony). + +- __q_value__: This is similar to the __peptide__ export type other than that the peptide sequences reported will be put into a new column. This causes the resulting file to NOT be square. 
+ +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | +|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|---------------------|----------------------------------|--------------|---------------|-----------|----------------|-------------|-----------------------|---------|----------|-------------|--------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK | EGLNVLQY#FISTHGAR | FATSDLNDLYR | IALASPDMIR | IPQESGGTK | LIPAGTGYAYHQDR | MGAEAIQALLK | NTLLHEQWCDLLEENSVDAVK | RVDYSGR | VADLFEAR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | | | | | | | | | | | | + + +- __q_value_all__: This is similar to __q_value__ except all proteins will be reported (Not just leads). 
+ +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | +|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|----------------------------------|----------------------------------|--------------|---------------|-----------|----------------|-------------|-----------------------|---------|----------|-------------|--------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | CGVEVTQTK | EGLNVLQY#FISTHGAR | FATSDLNDLYR | IALASPDMIR | IPQESGGTK | LIPAGTGYAYHQDR | MGAEAIQALLK | NTLLHEQWCDLLEENSVDAVK | RVDYSGR | VADLFEAR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 2 | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | +| BRAF_HUMAN\|P15056 | 35.24577101017814 | 0.0 | 3 | Reviewed | 2 | GYLSPDLSK | QTAQGMDYLHAK | VFLPNKQR | | | | | | | | | | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 3 | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | +| BRAF_HUMAN\|P15056 | 35.24577101017814 | 0.0 | 3 | Reviewed | 3 | GYLSPDLSK | QTAQGMDYLHAK | VFLPNKQR | | | | | | | | | | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | +| B3KX72_HUMAN\|B3KX72 | 15.316094065486292 | 0.0 | 2 | Unreviewed | 5 | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | +| 
Q96BA7_HUMAN\|Q96BA7 | 6.907755278982137 | 0.0 | 1 | Unreviewed | 5 | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | MEPTPVPFCGAK | | | | | | | | | | | | + +- __q_value_comma_sep__: This is similar to __q_value__ except proteins in the group of a lead will be listed in a separate column denoted "Other_Potential_Identifiers". Peptide identifiers are not shown. + +| Protein | Score | Q_Value | Number_of_Peptides | Identifier_Type | GroupID | Other_Potential_Identifiers | | +|-------------------------|--------------------|--------------------|--------------------|-----------------|---------|-----------------------------|----------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 0.0 | 12 | Reviewed | 1 | | | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 0.0 | 6 | Reviewed | 2 | ARAF_HUMAN\|P10398 | BRAF_HUMAN\|P15056 | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 0.0 | 4 | Reviewed | 3 | RAF1_HUMAN\|P04049 | BRAF_HUMAN\|P15056 | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 0.0 | 2 | Reviewed | 4 | | | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 0.0 | 2 | Reviewed | 5 | B3KX72_HUMAN\|B3KX72 | Q96BA7_HUMAN\|Q96BA7 | +| ##TCAF2_HUMAN\|##A6NFQ2 | 2.4079456086518722 | 0.3333333333333333 | 1 | Reviewed | 6 | | | + +- __leads__: This is similar to __q_value__ except Q values are not reported and only proteins passing the fdr threshold set in the parameters will be reported. 
+ +| Protein | Score | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | +|---------------------|--------------------|--------------------|-----------------|---------|---------------------|----------------------------------|--------------|---------------|-----------|----------------|-------------|-----------------------|---------|----------|-------------|--------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 12 | Reviewed | {1} | CGVEVTQTK | EGLNVLQY#FISTHGAR | FATSDLNDLYR | IALASPDMIR | IPQESGGTK | LIPAGTGYAYHQDR | MGAEAIQALLK | NTLLHEQWCDLLEENSVDAVK | RVDYSGR | VADLFEAR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | CQTCGYKFHEHCSTK | FQMFQLIDIAR | QTAQGMDYLHAK | SASEPSLHR | VFLPNKQR | WHGDVAVKILK | | | | | | | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | GYLSPDLSK | QTAQGMDYLHAK | SASEPSLHR | TFFSLAFCDFCLK | | | | | | | | | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 2 | Reviewed | {4} | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 2 | Reviewed | {5} | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | + +- __all__: This is similar to __q_value_all__ except Q values are not reported and only proteins passing the fdr threshold set in the parameters will be reported. 
+ +| Protein | Score | Number_of_Peptides | Identifier_Type | GroupID | Peptides | | | | | | | | | | | | +|----------------------|--------------------|--------------------|-----------------|---------|----------------------------------|----------------------------------|-------------|-----------------------|-------------------|-----------|----------------|-------------|--------------------|------------|---------|----------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 12 | Reviewed | {1} | FATSDLNDLYR | MGAEAIQALLK | IPQESGGTK | NTLLHEQWCDLLEENSVDAVK | EGLNVLQY#FISTHGAR | CGVEVTQTK | LIPAGTGYAYHQDR | VIDIWAAANDR | VTAEDVLKPGTADILVPR | IALASPDMIR | RVDYSGR | VADLFEAR | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | SASEPSLHR | FQMFQLIDIAR | WHGDVAVKILK | QTAQGMDYLHAK | CQTCGYKFHEHCSTK | VFLPNKQR | | | | | | | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | SASEPSLHR | QTAQGMDYLHAK | GYLSPDLSK | TFFSLAFCDFCLK | | | | | | | | | +| BRAF_HUMAN\|P15056 | 35.24577101017814 | 3 | Reviewed | {2, 3} | QTAQGMDYLHAK | GYLSPDLSK | VFLPNKQR | | | | | | | | | | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | SASEPSLHR | QTAQGMDYLHAK | GYLSPDLSK | TFFSLAFCDFCLK | | | | | | | | | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | SASEPSLHR | FQMFQLIDIAR | WHGDVAVKILK | QTAQGMDYLHAK | CQTCGYKFHEHCSTK | VFLPNKQR | | | | | | | +| BRAF_HUMAN\|P15056 | 35.24577101017814 | 3 | Reviewed | {2, 3} | QTAQGMDYLHAK | GYLSPDLSK | VFLPNKQR | | | | | | | | | | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 2 | Reviewed | {4} | LYLLTQMPH | YCWMSTGLYIPGR | | | | | | | | | | | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 2 | Reviewed | {5} | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | +| B3KX72_HUMAN\|B3KX72 | 15.316094065486292 | 2 | Unreviewed | {5} | AEGGGGGGRPGAPAAGDGK | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | +| Q96BA7_HUMAN\|Q96BA7 | 6.907755278982137 | 1 | Unreviewed | 
{5} | LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR | | | | | | | | | | | | + + +- __comma_sep__: This is similar to __q_value_comma_sep__ except Q values are not reported and only proteins passing the fdr threshold set in the parameters will be reported. Peptide identifiers are not shown. + +| Protein | Score | Number_of_Peptides | Identifier_Type | GroupID | Other_Potential_Identifiers | | +|---------------------|--------------------|--------------------|-----------------|---------|-----------------------------|----------------------| +| RPOC_SHIF8\|Q0SY12 | 82.89306334778564 | 12 | Reviewed | {1} | | | +| RAF1_HUMAN\|P04049 | 70.7434325345954 | 6 | Reviewed | {2, 3} | ARAF_HUMAN\|P10398 | BRAF_HUMAN\|P15056 | +| ARAF_HUMAN\|P10398 | 46.288402190472596 | 4 | Reviewed | {2, 3} | RAF1_HUMAN\|P04049 | BRAF_HUMAN\|P15056 | +| TCAF1_HUMAN\|Q9Y4C2 | 19.048939464610452 | 2 | Reviewed | {4} | | | +| HNRPU_HUMAN\|Q00839 | 15.316094065486292 | 2 | Reviewed | {5} | B3KX72_HUMAN\|B3KX72 | Q96BA7_HUMAN\|Q96BA7 | diff --git a/images/swissprot_example.png b/images/swissprot_example.png deleted file mode 100644 index f916586..0000000 Binary files a/images/swissprot_example.png and /dev/null differ diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..9e256ed --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,21 @@ +site_name: Py Protein Inference +nav: + - Home: index.md + - Quick Start: quickstart.md + - Parameters: parameters.md + - Input Formats: input_format.md + - Advanced Usage: advanced.md + - Supplementary Information: supplementary.md + - Code Reference: + - PyProteinInference: pyproteininference_reference.md +theme: readthedocs +plugins: + - search + - mkdocstrings: + default_handler: python + handlers: + python: + rendering: + show_source: true + watch: + - pyproteininference \ No newline at end of file diff --git a/parameters/Protein_Inference_Params.yaml b/parameters/Protein_Inference_Params.yaml index ffce0ec..296a62d 100644 --- 
a/parameters/Protein_Inference_Params.yaml +++ b/parameters/Protein_Inference_Params.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: pulp - glpk_path: None shared_peptides: all peptide_centric: max_identifiers: 5 diff --git a/pyproteininference/datastore.py b/pyproteininference/datastore.py index ac6e275..263dabd 100644 --- a/pyproteininference/datastore.py +++ b/pyproteininference/datastore.py @@ -23,42 +23,41 @@ class DataStore(object): Attributes: - main_data_form (list): List of unrestricted Psm objects - parameter_file_object (pyproteininference.parameters.ProteinInferenceParameter): protein inference parameter - object - restricted_peptides (list): List of non flaking peptide strings present in the current analysis - main_data_restricted (list): List of restricted :py:class:`pyproteininference.physical.Psm` objects. + main_data_form (list): List of unrestricted Psm objects. + parameter_file_object (ProteinInferenceParameter): protein inference parameter + [object][pyproteininference.parameters.ProteinInferenceParameter]. + restricted_peptides (list): List of non flaking peptide strings present in the current analysis. + main_data_restricted (list): List of restricted [Psm][pyproteininference.physical.Psm] objects. Restriction is based on the parameter_file_object and the object is created by function - :py:meth:`pyproteininference.datastore.DataStore.restrict_psm_data` - scored_proteins (list): List of scored :py:class:`pyproteininference.physical.Protein` objects. - Output from scoring methods from :py:mod:`pyproteininference.scoring` - grouped_scored_proteins (list): List of scored :py:class:`pyproteininference.physical.Protein` + [restrict_psm_data][pyproteininference.datastore.DataStore.restrict_psm_data]. + scored_proteins (list): List of scored [Protein][pyproteininference.physical.Protein] objects. + Output from scoring methods from [scoring][pyproteininference.scoring]. 
+ grouped_scored_proteins (list): List of scored [Protein][pyproteininference.physical.Protein] objects that have been grouped and sorted. Output from - :py:meth:`pyproteininference.inference.Inference.run_inference` method - scoring_input (list): List of non-scored :py:class:`pyproteininference.physical.Protein` objects. - Output from :py:meth:`pyproteininference.datastore.DataStore.create_scoring_input` - picked_proteins_scored (list): List of :py:class:`pyproteininference.physical.Protein` objects that pass - the protein picker algorithm (:py:meth:`pyproteininference.datastore.DataStore.protein_picker`) - picked_proteins_removed (list): List of :py:class:`pyproteininference.physical.Protein` objects that do not - pass the protein picker algorithm (:py:meth:`pyproteininference.datastore.DataStore.protein_picker`) + [run_inference][pyproteininference.inference.Inference.run_inference] method. + scoring_input (list): List of non-scored [Protein][pyproteininference.physical.Protein] objects. + Output from [create_scoring_input][pyproteininference.datastore.DataStore.create_scoring_input]. + picked_proteins_scored (list): List of [Protein][pyproteininference.physical.Protein] objects that pass + the protein picker algorithm ([protein_picker][pyproteininference.datastore.DataStore.protein_picker]). + picked_proteins_removed (list): List of [Protein][pyproteininference.physical.Protein] objects that do not + pass the protein picker algorithm ([protein_picker][pyproteininference.datastore.DataStore.protein_picker]). protein_peptide_dictionary (collections.defaultdict): Dictionary of protein strings (keys) that map to sets - of peptide strings based on the peptides and proteins found in the search. Protein -> set(Peptides) + of peptide strings based on the peptides and proteins found in the search. Protein -> set(Peptides). 
peptide_protein_dictionary (collections.defaultdict): Dictionary of peptide strings (keys) that map to sets - of protein strings based on the peptides and proteins found in the search. Peptide -> set(Proteins) + of protein strings based on the peptides and proteins found in the search. Peptide -> set(Proteins). high_low_better (str): Variable that indicates whether a higher or a lower protein score is better. - This is necessary to sort Protein objects by score properly. Can either be "higher" or "lower" - psm_score (str): Variable that indicates the :py:class:`pyproteininference.physical.Psm` - score being used in the analysis to generate :py:class:`pyproteininference.physical.Protein` scores - protein_score (str): String to indicate the protein score method used - short_protein_score (str): Short String to indicate the protein score method used - protein_group_objects (list): List of scored :py:class:`pyproteininference.physical.ProteinGroup` + This is necessary to sort Protein objects by score properly. Can either be "higher" or "lower". + psm_score (str): Variable that indicates the [Psm][pyproteininference.physical.Psm] + score being used in the analysis to generate [Protein][pyproteininference.physical.Protein] scores. + protein_score (str): String to indicate the protein score method used. + short_protein_score (str): Short String to indicate the protein score method used. + protein_group_objects (list): List of scored [ProteinGroup][pyproteininference.physical.ProteinGroup] objects that have been grouped and sorted. Output from - :py:meth:`pyproteininference.inference.Inference.run_inference` method - decoy_symbol (str): String that is used to differentiate between decoy proteins and target proteins. 
Ex: "##" - digest (pyproteininference.in_silico_digest.Digest): Digest object - :py:class:`pyproteininference.in_silico_digest.Digest` - SCORE_MAPPER (dict): Dictionary that maps potential scores in input files to internal score names - CUSTOM_SCORE_KEY (str): String that indicates a custom score is being used + [run_inference][pyproteininference.inference.Inference.run_inference] method. + decoy_symbol (str): String that is used to differentiate between decoy proteins and target proteins. Ex: "##". + digest (Digest): [Digest object][pyproteininference.in_silico_digest.Digest]. + SCORE_MAPPER (dict): Dictionary that maps potential scores in input files to internal score names. + CUSTOM_SCORE_KEY (str): String that indicates a custom score is being used. """ @@ -81,10 +80,10 @@ def __init__(self, reader, digest, validate=True): """ Args: - reader (pyproteininference.reader.Reader): Reader object :py:class:`protein_infernece.reader.Reader` - digest (pyproteininference.in_silico_digest.Digest): Digest object - :py:class:`protein_infernece.in_silico_digest.Digest` - validate (bool): True/False to indicate if the input data should be validated + reader (Reader): Reader object [Reader][pyproteininference.reader.Reader]. + digest (Digest): Digest object + [Digest][pyproteininference.in_silico_digest.Digest]. + validate (bool): True/False to indicate if the input data should be validated. Example: >>> pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -123,13 +122,13 @@ def __init__(self, reader, digest, validate=True): def get_sorted_identifiers(self, scored=True): """ - Retrieves a sorted list of protein strings present in the analysis + Retrieves a sorted list of protein strings present in the analysis. Args: - scored (bool): True/False to indicate if we should return scored or non-scored identifiers + scored (bool): True/False to indicate if we should return scored or non-scored identifiers. 
Returns: - list: List of sorted protein identifier strings + list: List of sorted protein identifier strings. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -165,15 +164,15 @@ def get_sorted_identifiers(self, scored=True): @classmethod def sort_protein_group_objects(cls, protein_group_objects, higher_or_lower): """ - Class Method to sort a list of :py:class:`protein_inferenece.physical.ProteinGroup` objects by - score and number of peptides + Class Method to sort a list of [ProteinGroup][pyproteininference.physical.ProteinGroup] objects by + score and number of peptides. Args: - protein_group_objects (list): list of :py:class:`protein_inferenece.physical.ProteinGroup` objects - higher_or_lower (str): String to indicate if a "higher" or "lower" protein score is "better" + protein_group_objects (list): list of [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. + higher_or_lower (str): String to indicate if a "higher" or "lower" protein score is "better". Returns: - list: list of sorted :py:class:`protein_inferenece.physical.ProteinGroup` objects + list: list of sorted [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. Example: >>> list_of_group_objects = pyproteininference.datastore.DataStore.sort_protein_group_objects( @@ -206,15 +205,15 @@ def sort_protein_group_objects(cls, protein_group_objects, higher_or_lower): @classmethod def sort_protein_objects(cls, grouped_protein_objects, higher_or_lower): """ - Class Method to sort a list of :py:class:`protein_inferenece.physical.Protein` objects by score and number of - peptides + Class Method to sort a list of [Protein][pyproteininference.physical.Protein] objects by score and number of + peptides. 
Args: - grouped_protein_objects (list): list of :py:class:`protein_inferenece.physical.Protein` objects - higher_or_lower (str): String to indicate if a "higher" or "lower" protein score is "better" + grouped_protein_objects (list): list of [Protein][pyproteininference.physical.Protein] objects. + higher_or_lower (str): String to indicate if a "higher" or "lower" protein score is "better". Returns: - list: list of sorted :py:class:`protein_inferenece.physical.Protein` objects + list: list of sorted [Protein][pyproteininference.physical.Protein] objects. Example: >>> scores_grouped = pyproteininference.datastore.DataStore.sort_protein_objects( @@ -238,15 +237,15 @@ def sort_protein_objects(cls, grouped_protein_objects, higher_or_lower): @classmethod def sort_protein_sub_groups(cls, protein_list, higher_or_lower): """ - Method to sort protein sub lists + Method to sort protein sub lists. Args: - protein_list (list): List of :py:class:`protein_inferenece.physical.Protein` objects to be sorted - higher_or_lower (str): String to indicate if a "higher" or "lower" protein score is "better" + protein_list (list): List of [Protein][pyproteininference.physical.Protein] objects to be sorted. + higher_or_lower (str): String to indicate if a "higher" or "lower" protein score is "better". Returns: - list: List of :py:class:`protein_inferenece.physical.Protein` objects to be sorted by score and number of - peptides + list: List of [Protein][pyproteininference.physical.Protein] objects to be sorted by score and number of + peptides. """ @@ -270,11 +269,12 @@ def sort_protein_sub_groups(cls, protein_list, higher_or_lower): def get_psm_data(self): """ - Method to retrieve a list of :py:class:`pyproteininference.physical.Psm` objects. - Retrieves restricted data if the data has been restricted or all of the data if the data has not been restricted + Method to retrieve a list of [Psm][pyproteininference.physical.Psm] objects. 
+ Retrieves restricted data if the data has been restricted or all of the data if the data has + not been restricted. Returns: - list: list of :py:class:`pyproteininference.physical.Psm` objects + list: list of [Psm][pyproteininference.physical.Psm] objects. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -295,12 +295,12 @@ def get_psm_data(self): def get_protein_data(self): """ - Method to retrieve a list of :py:class:`pyproteininference.physical.Protein` objects. + Method to retrieve a list of [Protein][pyproteininference.physical.Protein] objects. Retrieves picked and scored data if the data has been picked and scored or just the scored data if the data has not been picked. Returns: - list: list of :py:class:`pyproteininference.physical.Protein` objects + list: list of [Protein][pyproteininference.physical.Protein] objects. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -317,10 +317,10 @@ def get_protein_data(self): def get_protein_identifiers_from_psm_data(self): """ - Method to retrieve a list of lists of all possible protein identifiers from the psm data + Method to retrieve a list of lists of all possible protein identifiers from the psm data. Returns: - list: list of lists of protein strings + list: list of lists of protein strings. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -334,10 +334,10 @@ def get_protein_identifiers_from_psm_data(self): def get_q_values(self): """ - Method to retrieve a list of all q values for all PSMs + Method to retrieve a list of all q values for all PSMs. Returns: - list: list of floats (q values) + list: list of floats (q values). 
Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -351,10 +351,10 @@ def get_q_values(self): def get_pep_values(self): """ - Method to retrieve a list of all posterior error probabilities for all PSMs + Method to retrieve a list of all posterior error probabilities for all PSMs. Returns: - list: list of floats (pep values) + list: list of floats (pep values). Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -368,10 +368,10 @@ def get_pep_values(self): def get_protein_information_dictionary(self): """ - Method to retrieve a dictionary of scores for each peptide + Method to retrieve a dictionary of scores for each peptide. Returns: - dict: dictionary of scores for each protein + dict: dictionary of scores for each protein. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -398,21 +398,21 @@ def get_protein_information_dictionary(self): def restrict_psm_data(self, remove1pep=True): """ - Method to restrict the input of PSM data (:py:class:`pyproteininference.physical.Psm`) objects. + Method to restrict the input of [Psm][pyproteininference.physical.Psm] objects. This method is central to the pyproteininference module and is able to restrict the Psm data by: Q value, Pep Value, Percolator Score, Peptide Length, and Custom Score Input. - Restriction values are pulled from the :py:class:`pyproteininference.parameters.ProteinInferenceParameter` - object + Restriction values are pulled from + the [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] + object. - This method sets the :attr:`main_data_restricted` and :attr:`restricted_peptides` Attributes for the - DataStore object + This method sets the `main_data_restricted` and `restricted_peptides` Attributes for the DataStore object. 
Args: remove1pep (bool): True/False on whether or not to remove PEP values that equal 1 even if other restrictions are set to not restrict. Returns: - None + None: Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -517,16 +517,16 @@ def restrict_psm_data(self, remove1pep=True): def create_scoring_input(self): """ Method to create the scoring input. - This method initializes a list of :py:class:`pyproteininference.physical.Protein` objects to get them ready - to be scored by :py:mod:`pyproteininference.scoring.Score` methods + This method initializes a list of [Protein][pyproteininference.physical.Protein] objects to get them ready + to be scored by [Score][pyproteininference.scoring.Score] methods. This method also takes into account the inference type and aggregates peptides -> proteins accordingly. - This method sets the :attr:`scoring_input` and :attr:`score` Attributes for the DataStore object + This method sets the `scoring_input` and `score` Attributes for the DataStore object. - The score selected comes from the protein inference parameter object + The score selected comes from the protein inference parameter object. Returns: - None + None: Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -584,13 +584,12 @@ def create_scoring_input(self): def protein_to_peptide_dictionary(self): """ Method that returns a map of protein strings to sets of peptide strings and is essentially half - of a BiPartite graph - This method sets the :attr:`protein_peptide_dictionary` Attribute for the - :py:class:`pyproteininference.datastore.DataStore` object + of a BiPartite graph. + This method sets the `protein_peptide_dictionary` Attribute for the DataStore object. Returns: collections.defaultdict: Dictionary of protein strings (keys) that map to sets of peptide strings based - on the peptides and proteins found in the search. 
Protein -> set(Peptides) + on the peptides and proteins found in the search. Protein -> set(Peptides). Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -613,13 +612,12 @@ def protein_to_peptide_dictionary(self): def peptide_to_protein_dictionary(self): """ Method that returns a map of peptide strings to sets of protein strings and is essentially half of a - BiPartite graph - This method sets the :attr:`peptide_protein_dictionary` Attribute for the - :py:class:`pyproteininference.datastore.DataStore` object + BiPartite graph. + This method sets the `peptide_protein_dictionary` Attribute for the DataStore object. Returns: collections.defaultdict: Dictionary of peptide strings (keys) that map to sets of protein strings based - on the peptides and proteins found in the search. Peptide -> set(Proteins) + on the peptides and proteins found in the search. Peptide -> set(Proteins). Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -644,10 +642,10 @@ def peptide_to_protein_dictionary(self): def unique_to_leads_peptides(self): """ Method to retrieve peptides that are unique based on the data from the searches - (Not based on the database digestion) + (Not based on the database digestion). Returns: - set + set: a Set of peptide strings Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -665,14 +663,15 @@ def unique_to_leads_peptides(self): def higher_or_lower(self): """ - Method to determine if a higher or lower score is better for a given combination of score input and score type + Method to determine if a higher or lower score is better for a given combination of score input and score type. - This method sets the :attr:`high_low_better` Attribute for the DataStore object. + This method sets the `high_low_better` Attribute for the DataStore object. 
- This method depends on the output from the Score class to be sorted properly from best to worst score + This method depends on the output from the Score class to be sorted properly from best to worst score. Returns: - str: String indicating "higher" or "lower" depending on if a higher or lower score is a better protein score + str: String indicating "higher" or "lower" depending on if a higher or lower score is a + better protein score. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -708,13 +707,13 @@ def higher_or_lower(self): def get_protein_identifiers(self, data_form): """ - Method to retrieve the protein string identifiers + Method to retrieve the protein string identifiers. Args: - data_form (str): Can be one of the following: "main", "restricted", "picked", "picked_removed" + data_form (str): Can be one of the following: "main", "restricted", "picked", "picked_removed". Returns: - list: list of protein identifier strings + list: list of protein identifier strings. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -749,13 +748,13 @@ def get_protein_identifiers(self, data_form): def get_protein_information(self, protein_string): """ - Method to retrieve attributes for a specific scored protein + Method to retrieve attributes for a specific scored protein. Args: - protein_string (str): Protein Identifier String + protein_string (str): Protein Identifier String. Returns: - list: list of protein attributes + list: list of protein attributes. Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -799,17 +798,16 @@ def get_protein_information(self, protein_string): def exclude_non_distinguishing_peptides(self, protein_subset_type="hard"): """ - Method to Exclude peptides that are not distinguishing on either the search or database level + Method to Exclude peptides that are not distinguishing on either the search or database level. 
- The method sets the :attr:`scoring_input` and :attr:`restricted_peptides` variables for the - :py:class:`pyproteininference.datastore.DataStore` object + The method sets the `scoring_input` and `restricted_peptides` variables for the DataStore object. Args: protein_subset_type (str): Either "hard" or "soft". Hard will select distinguishing peptides based on the database digestion. "soft" will only use peptides identified in the search. Returns: - None + None: Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -926,18 +924,18 @@ def protein_picker(self): """ Method to run the protein picker algorithm. - Proteins must be scored first with :py:meth:`pyproteininference.scoring.Score.score_psms` + Proteins must be scored first with [score_psms][pyproteininference.scoring.Score.score_psms]. The algorithm will match target and decoy proteins identified from the PSMs from the search. If a target and matching decoy is found then target/decoy competition is performed. In the Target/Decoy pair the protein with the better score is kept and the one with the worse score is - discarded from the analysis + discarded from the analysis. - The method sets the :attr:`picked_proteins_scored` and :attr:`picked_proteins_removed` variables for - the :py:class:`pyproteininference.datastore.DataStore` object + The method sets the `picked_proteins_scored` and `picked_proteins_removed` variables for + the DataStore object. 
Returns: - None + None: Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -1027,16 +1025,16 @@ def protein_picker(self): def calculate_q_values(self, regular=True): """ - Method calculates Q values FDR on the lead protein in the group on the :attr:`protein_group_objects` - instance variable - FDR is calculated As (2*decoys)/total if regular is set to True and is (decoys)/total if regular is set to False + Method calculates Q values FDR on the lead protein in the group on the `protein_group_objects` + instance variable. + FDR is calculated As (2*decoys)/total if regular is set to True and is + (decoys)/total if regular is set to False. - This method updates the :attr:`protein_group_objects` for the - :py:class:`pyproteininference.datastore.DataStore` object by updating the q_value variable of the - :py:class:`pyproteininference.physical.ProteinGroup` objects + This method updates the `protein_group_objects` for the DataStore object by updating + the q_value variable of the [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. Returns: - None + None: Example: >>> data = pyproteininference.datastore.DataStore(reader = reader, digest=digest) @@ -1116,28 +1114,28 @@ def calculate_q_values(self, regular=True): def validate_psm_data(self): """ - Method that validates the PSM data + Method that validates the PSM data. """ self._validate_decoys_from_data() self._validate_isoform_from_data() def validate_digest(self): """ - Method that validates the :py:class:`pyproteininference.in_silico_digest.Digest` object + Method that validates the [Digest object][pyproteininference.in_silico_digest.Digest]. """ self._validate_reviewed_v_unreviewed() self._check_target_decoy_split() def check_data_consistency(self): """ - Method that checks for data consistency + Method that checks for data consistency. 
""" self._check_data_digest_overlap_psms() self._check_data_digest_overlap_proteins() def _check_data_digest_overlap_psms(self): """ - Method that logs the overlap between the digested fasta file and the input files on the PSM level + Method that logs the overlap between the digested fasta file and the input files on the PSM level. """ peptides = [x.stripped_peptide for x in self.main_data_form] peptides_in_digest = set(self.digest.peptide_to_protein_dictionary.keys()) @@ -1156,7 +1154,7 @@ def _check_data_digest_overlap_psms(self): def _check_data_digest_overlap_proteins(self): """ - Method that logs the overlap between the digested fasta file and the input files on the Protein level + Method that logs the overlap between the digested fasta file and the input files on the Protein level. """ proteins = [x.possible_proteins for x in self.main_data_form] flat_proteins = set([item for sublist in proteins for item in sublist]) @@ -1177,7 +1175,7 @@ def _check_data_digest_overlap_proteins(self): def _check_target_decoy_split(self): """ - Method that logs the number of target and decoy proteins from the digest + Method that logs the number of target and decoy proteins from the digest. """ # Check the number of targets vs the number of decoys from the digest targets = [ @@ -1195,7 +1193,7 @@ def _check_target_decoy_split(self): def _validate_decoys_from_data(self): """ - Method that checks to make sure that target and decoy proteins exist in the data files + Method that checks to make sure that target and decoy proteins exist in the data files. """ # Check to see if we find decoys from our input files proteins = [x.possible_proteins for x in self.main_data_form] @@ -1207,7 +1205,7 @@ def _validate_decoys_from_data(self): def _validate_isoform_from_data(self): """ - Method that validates whether or not isoforms are able to be identified in the data files + Method that validates whether or not isoforms are able to be identified in the data files. 
""" # Check to see if we find any proteins with isoform info in name in our input files proteins = [x.possible_proteins for x in self.main_data_form] @@ -1228,7 +1226,8 @@ def _validate_isoform_from_data(self): def _validate_reviewed_v_unreviewed(self): """ - Method that logs whether or not we can distinguish from reviewed and unreviewd protein identifiers in the digest + Method that logs whether or not we can distinguish from reviewed and unreviewd protein identifiers + in the digest. """ # Check to see if we get reviewed prots in digest... reviewed_proteins = len(self.digest.swiss_prot_protein_set) @@ -1242,15 +1241,15 @@ def _validate_reviewed_v_unreviewed(self): def sort_protein_strings(cls, protein_string_list, sp_proteins, decoy_symbol): """ Method that sorts protein strings in the following order: Target Reviewed, Decoy Reviewed, Target Unreviewed, - Decoy Unreviewed + Decoy Unreviewed. Args: - protein_string_list (list): List of Protein Strings - sp_proteins (set): Set of Reviewed Protein Strings - decoy_symbol (str): Symbol to denote a decoy protein identifier IE "##" + protein_string_list (list): List of Protein Strings. + sp_proteins (set): Set of Reviewed Protein Strings. + decoy_symbol (str): Symbol to denote a decoy protein identifier IE "##". Returns: - list: List of sorted protein strings + list: List of sorted protein strings. Example: >>> list_of_group_objects = datastore.DataStore.sort_protein_strings( @@ -1274,7 +1273,7 @@ def sort_protein_strings(cls, protein_string_list, sp_proteins, decoy_symbol): def input_has_q(self): """ - Method that checks to see if the input data has q values + Method that checks to see if the input data has q values. """ len_q = len([x.qvalue for x in self.main_data_form if x.qvalue]) len_all = len(self.main_data_form) @@ -1289,7 +1288,7 @@ def input_has_q(self): def input_has_pep(self): """ - Method that checks to see if the input data has pep values + Method that checks to see if the input data has pep values. 
""" len_pep = len([x.pepvalue for x in self.main_data_form if x.pepvalue]) len_all = len(self.main_data_form) @@ -1304,7 +1303,7 @@ def input_has_pep(self): def input_has_custom(self): """ - Method that checks to see if the input data has custom score values + Method that checks to see if the input data has custom score values. """ len_c = len([x.custom_score for x in self.main_data_form if x.custom_score]) len_all = len(self.main_data_form) @@ -1318,23 +1317,23 @@ def input_has_custom(self): return status - def get_protein_objects(self, fdr_restricted=False): + def get_protein_objects(self, false_discovery_rate=None, fdr_restricted=False): """ Method retrieves protein objects. Either retrieves FDR restricted list of protien objects, - or retrieves all objects + or retrieves all objects. Args: - fdr_restricted (bool): True/False on whether to restrict the list of objects based on FDR + fdr_restricted (bool): True/False on whether to restrict the list of objects based on FDR. Returns: - list: List of scored :py:class:`pyproteininference.physical.ProteinGroup` - objects that have been grouped and sorted. + list: List of scored [ProteinGroup][pyproteininference.physical.ProteinGroup] objects + that have been grouped and sorted. """ + if not false_discovery_rate: + false_discovery_rate = self.parameter_file_object.fdr if fdr_restricted: - protein_objects = [ - x.proteins for x in self.protein_group_objects if x.q_value <= self.parameter_file_object.fdr - ] + protein_objects = [x.proteins for x in self.protein_group_objects if x.q_value <= false_discovery_rate] else: protein_objects = self.grouped_scored_proteins @@ -1342,7 +1341,7 @@ def get_protein_objects(self, fdr_restricted=False): def _init_validate(self, reader): """ - Internal Method that checks to make sure the reader object is properly loaded and validated + Internal Method that checks to make sure the reader object is properly loaded and validated. 
""" if reader.psms: self.main_data_form = reader.psms # Unrestricted PSM data @@ -1356,7 +1355,7 @@ def _init_validate(self, reader): def _validate_main_data_form(self): """ - Internal Method that checks to make sure the Main data has been defined to run DataStore methods + Internal Method that checks to make sure the Main data has been defined to run DataStore methods. """ if self.main_data_form: pass @@ -1368,7 +1367,7 @@ def _validate_main_data_form(self): def _validate_main_data_restricted(self): """ - Internal Method that checks to make sure the Main data Restricted has been defined to run DataStore methods + Internal Method that checks to make sure the Main data Restricted has been defined to run DataStore methods. """ if self.main_data_restricted: pass @@ -1380,7 +1379,7 @@ def _validate_main_data_restricted(self): def _validate_scored_proteins(self): """ - Internal Method that checks to make sure that proteins have been scored to run certain subsequent methods + Internal Method that checks to make sure that proteins have been scored to run certain subsequent methods. """ if self.picked_proteins_scored or self.scored_proteins: pass @@ -1392,7 +1391,7 @@ def _validate_scored_proteins(self): def _validate_scoring_input(self): """ - Internal Method that checks to make sure that Scoring Input has been created to be able to run scoring methods + Internal Method that checks to make sure that Scoring Input has been created to be able to run scoring methods. """ if self.scoring_input: pass @@ -1404,7 +1403,7 @@ def _validate_scoring_input(self): def _validate_protein_group_objects(self): """ - Internal Method that checks to make sure inference has been run before proceeding + Internal Method that checks to make sure inference has been run before proceeding. 
""" if self.protein_group_objects and self.grouped_scored_proteins: pass @@ -1416,14 +1415,14 @@ def _validate_protein_group_objects(self): def generate_fdr_vs_target_hits(self, fdr_max=0.2): """ - Method for calculating FDR vs number of Target Proteins + Method for calculating FDR vs number of Target Proteins. Args: fdr_max (float): The maximum false discovery rate to calculate target hits for. - Will stop once fdr_max is reached + Will stop once fdr_max is reached. Returns: - list: List of lists of: (FDR, Number of Target Hits). Ordered by increasing number of Target Hits + list: List of lists of: (FDR, Number of Target Hits). Ordered by increasing number of Target Hits. """ fdr_vs_count = [] diff --git a/pyproteininference/export.py b/pyproteininference/export.py index 3c6fb24..e912cc5 100644 --- a/pyproteininference/export.py +++ b/pyproteininference/export.py @@ -15,11 +15,11 @@ class Export(object): """ - Class that handles exporting protein inference results to filesystem as csv files + Class that handles exporting protein inference results to filesystem as csv files. Attributes: - data (pyproteininference.datastore.DataStore): Data Class - filepath (str): Path to file to be written + data (DataStore): [DataStore object][pyproteininference.datastore.DataStore]. + filepath (str): Path to file to be written. """ @@ -49,10 +49,10 @@ class Export(object): def __init__(self, data): """ - Initialization method for the Export class + Initialization method for the Export class. Args: - data (pyproteininference.datastore.DataStore): Data Object + data (DataStore): [DataStore object][pyproteininference.datastore.DataStore]. 
Example: >>> export = pyproteininference.export.Export(data=data) @@ -63,18 +63,18 @@ def __init__(self, data): def export_to_csv(self, output_filename=None, directory=None, export_type="q_value"): """ - Method that dispatches to one of the many export methods given an export_type input + Method that dispatches to one of the many export methods given an export_type input. - filepath is determined based on directory arg and information from data - :py:class:`pyproteininference.datastore.DataStore` + filepath is determined based on directory arg and information from + [DataStore object][pyproteininference.datastore.DataStore]. - This method sets the :attr:`filepath` variable. + This method sets the `filepath` variable. Args: output_filename (str): Filepath to write to. If set as None will auto generate filename and - will write to directory variable - directory (str): Directory to write the result file to. If None, will write to current working directory - export_type (str): Must be a value in :attr:`EXPORT_TYPES` and determines the output format + will write to directory variable. + directory (str): Directory to write the result file to. If None, will write to current working directory. + export_type (str): Must be a value in `EXPORT_TYPES` and determines the output format. 
Example: >>> export = pyproteininference.export.Export(data=data) @@ -96,7 +96,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_leads_restricted(filename_out=complete_filepath) - if self.EXPORT_ALL == export_type: + elif self.EXPORT_ALL == export_type: filename = "{}_all_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -104,7 +104,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_all_restricted(complete_filepath) - if self.EXPORT_COMMA_SEP == export_type: + elif self.EXPORT_COMMA_SEP == export_type: filename = "{}_comma_sep_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -112,7 +112,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_comma_sep_restricted(complete_filepath) - if self.EXPORT_Q_VALUE_COMMA_SEP == export_type: + elif self.EXPORT_Q_VALUE_COMMA_SEP == export_type: filename = "{}_q_value_comma_sep_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -120,7 +120,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_comma_sep(complete_filepath) - if self.EXPORT_Q_VALUE == export_type: + elif self.EXPORT_Q_VALUE == export_type: filename = "{}_q_value_leads_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = 
os.path.join(directory, filename) if output_filename: @@ -128,7 +128,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_leads(complete_filepath) - if self.EXPORT_Q_VALUE_ALL == export_type: + elif self.EXPORT_Q_VALUE_ALL == export_type: filename = "{}_q_value_all_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -136,7 +136,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_all(complete_filepath) - if self.EXPORT_PEPTIDES == export_type: + elif self.EXPORT_PEPTIDES == export_type: filename = "{}_q_value_leads_peptides_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -144,7 +144,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_leads_peptides(complete_filepath) - if self.EXPORT_PSMS == export_type: + elif self.EXPORT_PSMS == export_type: filename = "{}_q_value_leads_psms_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -152,7 +152,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_leads_psms(complete_filepath) - if self.EXPORT_PSM_IDS == export_type: + elif self.EXPORT_PSM_IDS == export_type: filename = "{}_q_value_leads_psm_ids_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = 
os.path.join(directory, filename) if output_filename: @@ -160,7 +160,7 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_leads_psm_ids(complete_filepath) - if self.EXPORT_LONG == export_type: + elif self.EXPORT_LONG == export_type: filename = "{}_q_value_long_{}_{}.csv".format(tag, data.short_protein_score, data.psm_score) complete_filepath = os.path.join(directory, filename) if output_filename: @@ -168,15 +168,16 @@ def export_to_csv(self, output_filename=None, directory=None, export_type="q_val logger.info("Exporting Protein Inference Data to File: {}".format(complete_filepath)) self.csv_export_q_value_leads_long(complete_filepath) + else: + complete_filepath = "protein_inference_results.csv" + self.filepath = complete_filepath def csv_export_all_restricted(self, filename_out): """ Method that outputs a subset of the passing proteins based on FDR. - Only Proteins that pass FDR will be output and ALL proteins - will be all output not just leads. - This method returns a non-square CSV file + This method returns a non-square CSV file. Args: filename_out (str): Filename for the data to be written to @@ -215,10 +216,10 @@ def csv_export_leads_restricted(self, filename_out): Method that outputs a subset of the passing proteins based on FDR. Only Proteins that pass FDR will be output and only Lead proteins will be output - This method returns a non-square CSV file + This method returns a non-square CSV file. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. """ protein_objects = self.data.get_protein_objects(fdr_restricted=True) @@ -252,12 +253,12 @@ def csv_export_comma_sep_restricted(self, filename_out): """ Method that outputs a subset of the passing proteins based on FDR. 
Only Proteins that pass FDR will be output and only Lead proteins will be output. - Proteins in the groups of lead proteins will also be output in the same row as the lead + Proteins in the groups of lead proteins will also be output in the same row as the lead. - This method returns a non-square CSV file + This method returns a non-square CSV file. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. """ protein_objects = self.data.get_protein_objects(fdr_restricted=True) @@ -292,10 +293,10 @@ def csv_export_q_value_leads(self, filename_out): """ Method that outputs all lead proteins with Q values. - This method returns a non-square CSV file + This method returns a non-square CSV file. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. """ protein_export_list = [ @@ -331,12 +332,12 @@ def csv_export_q_value_leads(self, filename_out): def csv_export_q_value_comma_sep(self, filename_out): """ Method that outputs all lead proteins with Q values. - Proteins in the groups of lead proteins will also be output in the same row as the lead + Proteins in the groups of lead proteins will also be output in the same row as the lead. - This method returns a non-square CSV file + This method returns a non-square CSV file. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. """ protein_export_list = [ @@ -371,13 +372,13 @@ def csv_export_q_value_comma_sep(self, filename_out): def csv_export_q_value_all(self, filename_out): """ Method that outputs all proteins with Q values. - Non Lead proteins are also output - entire group gets output - Proteins in the groups of lead proteins will also be output in the same row as the lead + Non Lead proteins are also output - entire group gets output. 
+ Proteins in the groups of lead proteins will also be output in the same row as the lead. - This method returns a non-square CSV file + This method returns a non-square CSV file. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. """ protein_export_list = [ @@ -443,10 +444,10 @@ def csv_export_q_value_leads_long(self, filename_out): """ Method that outputs all lead proteins with Q values. - This method returns a long formatted result file with one peptide on each row + This method returns a long formatted result file with one peptide on each row. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. """ protein_export_list = [ @@ -481,12 +482,12 @@ def csv_export_q_value_leads_long(self, filename_out): def csv_export_q_value_leads_peptides(self, filename_out, peptide_delimiter=" "): """ Method that outputs all lead proteins with Q values in rectangular format. - This method outputs unique peptides per protein + This method outputs unique peptides per protein. - This method returns a rectangular CSV file + This method returns a rectangular CSV file. Args: - filename_out (str): Filename for the data to be written to + filename_out (str): Filename for the data to be written to. peptide_delimiter (str): String to separate peptides by in the "Peptides" column of the csv file """ protein_export_list = [ @@ -521,13 +522,13 @@ def csv_export_q_value_leads_peptides(self, filename_out, peptide_delimiter=" ") def csv_export_q_value_leads_psms(self, filename_out, peptide_delimiter=" "): """ Method that outputs all lead proteins with Q values in rectangular format. - This method outputs all PSMs for the protein not just unique peptide identifiers + This method outputs all PSMs for the protein not just unique peptide identifiers. - This method returns a rectangular CSV file + This method returns a rectangular CSV file. 
Args: - filename_out (str): Filename for the data to be written to - peptide_delimiter (str): String to separate peptides by in the "Peptides" column of the csv file + filename_out (str): Filename for the data to be written to. + peptide_delimiter (str): String to separate peptides by in the "Peptides" column of the csv file. """ protein_export_list = [ [ @@ -563,11 +564,11 @@ def csv_export_q_value_leads_psm_ids(self, filename_out, peptide_delimiter=" "): Method that outputs all lead proteins with Q values in rectangular format. Psms are output as the psm_id value. So sequence information is not output. - This method returns a rectangular CSV file + This method returns a rectangular CSV file. Args: - filename_out (str): Filename for the data to be written to - peptide_delimiter (str): String to separate psm_ids by in the "Peptides" column of the csv file + filename_out (str): Filename for the data to be written to. + peptide_delimiter (str): String to separate psm_ids by in the "Peptides" column of the csv file. 
""" protein_export_list = [ [ diff --git a/pyproteininference/heuristic.py b/pyproteininference/heuristic.py index 3d6fc57..bc81321 100644 --- a/pyproteininference/heuristic.py +++ b/pyproteininference/heuristic.py @@ -1,10 +1,12 @@ import copy +import collections import logging import os import sys import matplotlib.pyplot as plt import numpy +import statistics import pyproteininference from pyproteininference.inference import Inference @@ -22,37 +24,36 @@ class HeuristicPipeline(ProteinInferencePipeline): """ - This is the Protein Inference Heuristic classs which houses the logic to run the Protein Inference Heuristic method - to determine the best inference method for the given data - Logic is executed in the :py:meth:`pyproteininference.heuristic.HeuristicPipeline.execute` method + This is the Protein Inference Heuristic class which houses the logic to run the Protein Inference Heuristic method + to determine the best inference method for the given data. + Logic is executed in the [execute][pyproteininference.heuristic.HeuristicPipeline.execute] method. Attributes: - parameter_file (str): Path to Protein Inference Yaml Parameter File - database_file (str): Path to Fasta database used in proteomics search - target_files (str/list): Path to Target Psm File (Or a list of files) - decoy_files (str/list): Path to Decoy Psm File (Or a list of files) - combined_files (str/list): Path to Combined Psm File (Or a list of files) - target_directory (str): Path to Directory containing Target Psm Files - decoy_directory (str): Path to Directory containing Decoy Psm Files - combined_directory (str): Path to Directory containing Combined Psm Files - output_directory (str): Path to Directory where output will be written - output_filename (str): Path to Filename where output will be written. Will override output_directory + parameter_file (str): Path to Protein Inference Yaml Parameter File. + database_file (str): Path to Fasta database used in proteomics search. 
+ target_files (str/list): Path to Target Psm File (Or a list of files). + decoy_files (str/list): Path to Decoy Psm File (Or a list of files). + combined_files (str/list): Path to Combined Psm File (Or a list of files). + target_directory (str): Path to Directory containing Target Psm Files. + decoy_directory (str): Path to Directory containing Decoy Psm Files. + combined_directory (str): Path to Directory containing Combined Psm Files. + output_directory (str): Path to Directory where output will be written. + output_filename (str): Path to Filename where output will be written. Will override output_directory. id_splitting (bool): True/False on whether to split protein IDs in the digest. - Leave as False unless you know what you are doing + Advanced usage only. append_alt_from_db (bool): True/False on whether to append - alternative proteins from the DB digestion in Reader class - roc_plot_filepath (str): Filepath to be written to by Heuristic Plotting method. - This is optional and a default filename will be created in output_directory if this is left as None - fdr_max (float): The Maximum FDR to display on the ROC Plot generated - to compare inference methods - inference_method_list: (list) List of inference methods used in heuristic determination - datastore_dict: (dict) Dictionary of :py:class:`pyproteininference.datastore.DataStore` - objects generated in heuristic determination with the inference method as the key of each entry - selected_method: (str) String representation of the selected inference method based on the heuristic - heuristic: (float) Heuristic Value as determined from the data - selected_datastore: (:py:class:`pyproteininference.datastore.DataStore`) - The DataStore object as selected by the heuristic - output_type: (str) How to output results. Can either be "all" or "optimal". Will either output all results + alternative proteins from the DB digestion in Reader class. 
+ pdf_filename (str): Filepath to be written to by Heuristic Plotting method. + This is optional and a default filename will be created in output_directory if this is left as None. + inference_method_list (list): List of inference methods used in heuristic determination. + datastore_dict (dict): Dictionary of [DataStore][pyproteininference.datastore.DataStore] + objects generated in heuristic determination with the inference method as the key of each entry. + selected_methods (list): a list of String representations of the selected inference methods based on the + heuristic. + selected_datastores (dict): + a Dictionary of [DataStore object][pyproteininference.datastore.DataStore] objects as selected by the + heuristic. + output_type (str): How to output results. Can either be "all" or "optimal". Will either output all results or will only output the optimal results. """ @@ -74,37 +75,34 @@ def __init__( output_filename=None, id_splitting=False, append_alt_from_db=True, - roc_plot_filepath=None, - fdr_max=0.2, + pdf_filename=None, output_type="all", ): """ Args: - parameter_file (str): Path to Protein Inference Yaml Parameter File - database_file (str): Path to Fasta database used in proteomics search - target_files (str/list): Path to Target Psm File (Or a list of files) - decoy_files (str/list): Path to Decoy Psm File (Or a list of files) - combined_files (str/list): Path to Combined Psm File (Or a list of files) - target_directory (str): Path to Directory containing Target Psm Files - decoy_directory (str): Path to Directory containing Decoy Psm Files - combined_directory (str): Path to Directory containing Combined Psm Files - output_directory (str): Path to Directory where output will be written + parameter_file (str): Path to Protein Inference Yaml Parameter File. + database_file (str): Path to Fasta database used in proteomics search. + target_files (str/list): Path to Target Psm File (Or a list of files). 
+ decoy_files (str/list): Path to Decoy Psm File (Or a list of files). + combined_files (str/list): Path to Combined Psm File (Or a list of files). + target_directory (str): Path to Directory containing Target Psm Files. + decoy_directory (str): Path to Directory containing Decoy Psm Files. + combined_directory (str): Path to Directory containing Combined Psm Files. + output_directory (str): Path to Directory where output will be written. output_filename (str): Path to Filename where output will be written. - Will override output_directory + Will override output_directory. id_splitting (bool): True/False on whether to split protein IDs in the digest. - Leave as False unless you know what you are doing + Advanced usage only. append_alt_from_db (bool): True/False on whether to append alternative proteins - from the DB digestion in Reader class - roc_plot_filepath (str): Filepath to be written to by Heuristic Plotting method. + from the DB digestion in Reader class. + pdf_filename (str): Filepath to be written to by Heuristic Plotting method. This is optional and a default filename will be created in output_directory if this is left as None - fdr_max (float): The Maximum FDR to display on the ROC Plot generated to compare - inference methods - output_type: (str) How to output results. Can either be "all" or "optimal". Will either output all results + output_type (str): How to output results. Can either be "all" or "optimal". Will either output all results or will only output the optimal results. 
Returns: - object: + HeuristicPipeline: [HeuristicPipeline][pyproteininference.heuristic.HeuristicPipeline] object Example: >>> heuristic = pyproteininference.heuristic.HeuristicPipeline( @@ -119,8 +117,7 @@ def __init__( >>> output_directory=dir_name, >>> output_filename=output_filename, >>> append_alt_from_db=append_alt, - >>> roc_plot_filepath=roc_plot_filepath, - >>> fdr_max=0.2, + >>> pdf_filename=pdf_filename, >>> output_type="all" >>> ) """ @@ -137,20 +134,19 @@ def __init__( self.output_filename = output_filename self.id_splitting = id_splitting self.append_alt_from_db = append_alt_from_db - self.fdr_max = fdr_max self.output_type = output_type if self.output_type not in self.OUTPUT_TYPES: raise ValueError("The variable output_type must be set to either 'all' or 'optimal'") - if not roc_plot_filepath: + if not pdf_filename: if self.output_directory and not self.output_filename: - self.roc_plot_filepath = os.path.join(self.output_directory, "roc_plot.pdf") + self.pdf_filename = os.path.join(self.output_directory, "heuristic_plot.pdf") elif self.output_filename: - self.roc_plot_filepath = os.path.join(os.path.split(self.output_filename)[0], "roc_plot.pdf") + self.pdf_filename = os.path.join(os.path.split(self.output_filename)[0], "heuristic_plot.pdf") else: - self.roc_plot_filepath = os.path.join(os.getcwd(), "roc_plot.pdf") + self.pdf_filename = os.path.join(os.getcwd(), "heuristic_plot.pdf") else: - self.roc_plot_filepath = roc_plot_filepath + self.pdf_filename = pdf_filename self.inference_method_list = [ Inference.INCLUSION, @@ -159,9 +155,8 @@ def __init__( Inference.PEPTIDE_CENTRIC, ] self.datastore_dict = {} - self.heuristic = None - self.selected_method = None - self.selected_datastore = None + self.selected_methods = None + self.selected_datastores = {} self._validate_input() @@ -169,22 +164,21 @@ def __init__( self._log_append_alt_from_db() - def execute(self, fdr_threshold=0.01, skip_plot=False): + def execute(self, fdr_threshold=0.05): """ - This 
method is the main driver of the heuristic method - This method calls other classes and methods that make up the heuristic pipeline + This method is the main driver of the heuristic method. + This method calls other classes and methods that make up the heuristic pipeline. This includes but is not limited to: - 1. Loops over the main inference methods: Inclusion, Exclusion, Parsimony, and Peptide Centric - 2. Determines the optimal inference method based on the input data as well as the database file - 3. Outputs the results and indicates the optimal results + 1. Loops over the main inference methods: Inclusion, Exclusion, Parsimony, and Peptide Centric. + 2. Determines the optimal inference method based on the input data as well as the database file. + 3. Outputs the results and indicates the optimal results. Args: - fdr_threshold (float): The Qvalue/FDR threshold the heuristic method uses to base calculations from - skip_plot (bool): True/False on whether to skip outputting ROC Plot + fdr_threshold (float): The Qvalue/FDR threshold the heuristic method uses to base calculations from. 
Returns: - None + None: Example: >>> heuristic = pyproteininference.heuristic.HeuristicPipeline( @@ -199,11 +193,10 @@ def execute(self, fdr_threshold=0.01, skip_plot=False): >>> output_directory=dir_name, >>> output_filename=output_filename, >>> append_alt_from_db=append_alt, - >>> roc_plot_filepath=roc_plot_filepath, - >>> fdr_max=0.2, + >>> pdf_filename=pdf_filename, >>> output_type="all" >>> ) - >>> heuristic.execute(fdr_threshold=0.01) + >>> heuristic.execute(fdr_threshold=0.05) """ @@ -235,7 +228,6 @@ def execute(self, fdr_threshold=0.01, skip_plot=False): logger.info("Overriding inference type {}".format(method_specific_parameters.inference_type)) method_specific_parameters.inference_type = inference_method - method_specific_parameters.fdr = fdr_threshold logger.info("New inference type {}".format(method_specific_parameters.inference_type)) logger.info("FDR Threshold Set to {}".format(method_specific_parameters.fdr)) @@ -275,8 +267,10 @@ def execute(self, fdr_threshold=0.01, skip_plot=False): self.datastore_dict[inference_method] = data - self.selected_method = self.determine_optimal_inference_method() - self.selected_datastore = self.datastore_dict[self.selected_method] + self.selected_methods = self.determine_optimal_inference_method( + false_discovery_rate_threshold=fdr_threshold, pdf_filename=self.pdf_filename + ) + self.selected_datastores = {x: self.datastore_dict[x] for x in self.selected_methods} if self.output_type == "all": self._write_all_results(parameters=method_specific_parameters) @@ -285,366 +279,16 @@ def execute(self, fdr_threshold=0.01, skip_plot=False): else: self._write_optimal_results(parameters=method_specific_parameters) - if not skip_plot: - self.generate_roc_plot(fdr_max=self.fdr_max, pdf_filename=self.roc_plot_filepath) - - else: - logger.info("skip_plot is set to True. 
Not creating ROC Plot.") - - def determine_optimal_inference_method(self, empirical_threshold=0.2): - """ - This method determines the optimal inference method from Inclusion, Exclusion, Parsimony, Peptide-Centric - - Args: - empirical_threshold (float): Threshold used for cutoffs for the heuristic algorithm - - Returns: - str: String representation of the selected inference method - - """ - - # Get the number of passing proteins - filtered_protein_objects = { - x: self.datastore_dict[x].get_protein_objects(fdr_restricted=True) for x in self.datastore_dict.keys() - } - number_passing_proteins = {x: len(filtered_protein_objects[x]) for x in filtered_protein_objects.keys()} - - logger.info("Number of Passing Proteins per Inference Method") - logger.info(number_passing_proteins) - - # Calculate how similar the number of passing proteins is for each method - similarity_dict = {} - for key in number_passing_proteins.keys(): - cur_value = number_passing_proteins[key] - other_values = [x for x in number_passing_proteins.values() if x != cur_value] - similarity_dict[key] = cur_value / (numpy.mean(other_values)) - - # Simple transformation for getting max below - diff_dict = {x: abs(1 - similarity_dict[x]) for x in number_passing_proteins.keys()} - - logger.info("Initial Heuristic Scores") - logger.info(diff_dict) - - # Remove the most dissimilar method, which is the max - key_to_delete = max(diff_dict, key=lambda k: diff_dict[k]) - logger.info("Removing {} with score {}".format(key_to_delete, diff_dict[key_to_delete])) - del diff_dict[key_to_delete] - del number_passing_proteins[key_to_delete] - - # Redo above on the restricted set of 3 methods - pared_similarity_dict = {} - for key in number_passing_proteins.keys(): - cur_value = number_passing_proteins[key] - other_values = [x for x in number_passing_proteins.values() if x != cur_value] - pared_similarity_dict[key] = cur_value / (numpy.mean(other_values)) - - pared_diff_dict = {x: abs(1 - pared_similarity_dict[x]) for x 
in number_passing_proteins.keys()} - - logger.info("Final Heuristic Scores") - logger.info(pared_diff_dict) - - # Remove Inclusion and Exclusion if they are poor. IE if their heuristic scores are above an - # empirical threshold value - # .2 was determined as a proper threshold in testing different databases - # (Uniprot, Swissprot, Swissprot no isoforms) - if Inference.EXCLUSION in pared_diff_dict.keys(): - if pared_diff_dict[Inference.EXCLUSION] <= empirical_threshold: - logger.info( - "Keeping {} with score {}".format(Inference.EXCLUSION, pared_diff_dict[Inference.EXCLUSION]) - ) - - else: - # If not the remove it - logger.info( - "Removing {} with score {}".format(Inference.EXCLUSION, pared_diff_dict[Inference.EXCLUSION]) - ) - del pared_diff_dict[Inference.EXCLUSION] - del number_passing_proteins[Inference.EXCLUSION] - - if Inference.INCLUSION in pared_diff_dict.keys(): - if pared_diff_dict[Inference.INCLUSION] <= empirical_threshold: - logger.info( - "Keeping {} with score {}".format(Inference.INCLUSION, pared_diff_dict[Inference.INCLUSION]) - ) - - else: - # If not then remove it - logger.info( - "Removing {} with score {}".format(Inference.INCLUSION, pared_diff_dict[Inference.INCLUSION]) - ) - del pared_diff_dict[Inference.INCLUSION] - del number_passing_proteins[Inference.INCLUSION] - - remaining_inference_methods = list(pared_diff_dict.keys()) - - # At this point we have 3, 2 or 1 inference types remaining... 
So lets do branching if statement for - # all possible combinations - # Each combination will have different rules based on empirical knowledgee - if len(remaining_inference_methods) == 3: - if set([Inference.PARSIMONY, Inference.EXCLUSION, Inference.INCLUSION]) == set(remaining_inference_methods): - # If inclusion is over double parsimony remove it - if ( - number_passing_proteins[Inference.INCLUSION] / self.RATIO_CONSTANT - > number_passing_proteins[Inference.PARSIMONY] - ): - logger.info( - "Removing {} with score {}".format(Inference.INCLUSION, pared_diff_dict[Inference.INCLUSION]) - ) - del pared_diff_dict[Inference.INCLUSION] - del number_passing_proteins[Inference.INCLUSION] - - # If exclusion is less than half of parsimony remove it... - if ( - number_passing_proteins[Inference.EXCLUSION] * self.RATIO_CONSTANT - < number_passing_proteins[Inference.PARSIMONY] - ): - logger.info( - "Removing {} with score {}".format(Inference.EXCLUSION, pared_diff_dict[Inference.EXCLUSION]) - ) - del pared_diff_dict[Inference.EXCLUSION] - del number_passing_proteins[Inference.EXCLUSION] - - if len(pared_diff_dict.keys()) == 3: - # if neither are removed select Exclusion. Since all methods are close to parsimony take exclusion - # because it wouldnt have removed too many hits - selected_method = Inference.EXCLUSION - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.PARSIMONY, Inference.PEPTIDE_CENTRIC, Inference.INCLUSION]) == set( - remaining_inference_methods - ): - # If inclusion is double of parsimony remove it... 
- if ( - number_passing_proteins[Inference.INCLUSION] / self.RATIO_CONSTANT - > number_passing_proteins[Inference.PARSIMONY] - ): - logger.info( - "Removing {} with score {}".format(Inference.INCLUSION, pared_diff_dict[Inference.INCLUSION]) - ) - del pared_diff_dict[Inference.INCLUSION] - del number_passing_proteins[Inference.INCLUSION] - - # Check to see if inclusion is still present - if Inference.INCLUSION in pared_diff_dict.keys(): - # If inclusion is still present and double peptide-centric remove it... - if ( - number_passing_proteins[Inference.INCLUSION] / self.RATIO_CONSTANT - > number_passing_proteins[Inference.PEPTIDE_CENTRIC] - ): - logger.info( - "Removing {} with score {}".format( - Inference.INCLUSION, - pared_diff_dict[Inference.INCLUSION], - ) - ) - del pared_diff_dict[Inference.INCLUSION] - del number_passing_proteins[Inference.INCLUSION] - - if len(pared_diff_dict.keys()) == 3: - # if Inclusion is not removed select Inclusion. - selected_method = Inference.INCLUSION - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.PARSIMONY, Inference.EXCLUSION, Inference.PEPTIDE_CENTRIC]) == set( - remaining_inference_methods - ): - # If exclusion is less than half of parsimony remove it... - if ( - number_passing_proteins[Inference.EXCLUSION] * self.RATIO_CONSTANT - < number_passing_proteins[Inference.PARSIMONY] - ): - logger.info( - "Removing {} with score {}".format(Inference.EXCLUSION, pared_diff_dict[Inference.EXCLUSION]) - ) - del pared_diff_dict[Inference.EXCLUSION] - del number_passing_proteins[Inference.EXCLUSION] - - # Check to see if exclusion is still present - if Inference.EXCLUSION in pared_diff_dict.keys(): - # If exclusion is still present and less than half of peptide-centric remove it... 
- if ( - pared_diff_dict[Inference.EXCLUSION] * self.RATIO_CONSTANT - < pared_diff_dict[Inference.PEPTIDE_CENTRIC] - ): - logger.info( - "Removing {} with score {}".format( - Inference.EXCLUSION, - pared_diff_dict[Inference.EXCLUSION], - ) - ) - del pared_diff_dict[Inference.EXCLUSION] - del number_passing_proteins[Inference.EXCLUSION] - - if len(pared_diff_dict.keys()) == 3: - # if Exclusion is not removed select Exclusion. Since it is close to parsimony and peptide-centric - # take exclusion because it wouldnt have removed too many hits - selected_method = Inference.EXCLUSION - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.PEPTIDE_CENTRIC, Inference.EXCLUSION, Inference.INCLUSION]) == set( - remaining_inference_methods - ): - # If inclusion is over double peptide-centric remove it - if ( - number_passing_proteins[Inference.INCLUSION] / self.RATIO_CONSTANT - > number_passing_proteins[Inference.PEPTIDE_CENTRIC] - ): - logger.info( - "Removing {} with score {}".format(Inference.INCLUSION, pared_diff_dict[Inference.INCLUSION]) - ) - del pared_diff_dict[Inference.INCLUSION] - del number_passing_proteins[Inference.INCLUSION] - - # If exclusion is less than half of peptide-centric remove it... - if ( - pared_diff_dict[Inference.EXCLUSION] * self.RATIO_CONSTANT - < pared_diff_dict[Inference.PEPTIDE_CENTRIC] - ): - logger.info( - "Removing {} with score {}".format(Inference.EXCLUSION, pared_diff_dict[Inference.EXCLUSION]) - ) - del pared_diff_dict[Inference.EXCLUSION] - del number_passing_proteins[Inference.EXCLUSION] - - if len(pared_diff_dict.keys()) == 3: - # if neither are removed select Exclusion. 
Since both are close to peptide-centric take exclusion - # because it wouldnt have removed too many hits - selected_method = Inference.EXCLUSION - # If we have one remaining just return it - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - else: - pass - - remaining_inference_methods = list(pared_diff_dict.keys()) - - if len(remaining_inference_methods) == 2: - if set([Inference.PEPTIDE_CENTRIC, Inference.EXCLUSION]) == set(remaining_inference_methods): - # Take peptide centric - selected_method = Inference.PEPTIDE_CENTRIC - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.PEPTIDE_CENTRIC, Inference.INCLUSION]) == set(remaining_inference_methods): - # Take peptide centric - selected_method = Inference.PEPTIDE_CENTRIC - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.PEPTIDE_CENTRIC, Inference.PARSIMONY]) == set(remaining_inference_methods): - - # First If peptide centric is less than parsimony return parsimony - if number_passing_proteins[Inference.PEPTIDE_CENTRIC] < number_passing_proteins[Inference.PARSIMONY]: - selected_method = Inference.PARSIMONY - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - # check to see if parsimony and peptide centric are close... 
If half of peptide centric is greater - # than parsimony pick parsimony - if ( - number_passing_proteins[Inference.PEPTIDE_CENTRIC] / self.RATIO_CONSTANT - > number_passing_proteins[Inference.PARSIMONY] - ): - selected_method = Inference.PARSIMONY - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - else: - # If not (Meaning the values are close... return peptide-centric) - selected_method = Inference.PEPTIDE_CENTRIC - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.EXCLUSION, Inference.INCLUSION]) == set(remaining_inference_methods): - # This situation should never occur - selected_method = Inference.INCLUSION - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - pass - - elif set([Inference.EXCLUSION, Inference.PARSIMONY]) == set(remaining_inference_methods): - - # First If exclusion is greater than parsimony return exclusion - # This is highly unlikely to happen - if number_passing_proteins[Inference.EXCLUSION] > number_passing_proteins[Inference.PARSIMONY]: - selected_method = Inference.EXCLUSION - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - # check to see if parsimony and exclusion are close... 
If half of parsimony is greater than exclusion - # pick parsimony - # This means that exclusion is removing a lot of peptides - if ( - number_passing_proteins[Inference.PARSIMONY] / self.RATIO_CONSTANT - > number_passing_proteins[Inference.EXCLUSION] - ): - selected_method = Inference.PARSIMONY - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - else: - # If not (Meaning the values are close... return exclusion) - selected_method = Inference.EXCLUSION - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - elif set([Inference.PARSIMONY, Inference.INCLUSION]) == set(remaining_inference_methods): - # take parsimony - selected_method = Inference.PARSIMONY - logger.info( - "Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method]) - ) - return selected_method - - remaining_inference_methods = list(pared_diff_dict.keys()) - - if len(remaining_inference_methods) == 1: - selected_method = list(pared_diff_dict.keys())[0] - # If we have one remaining just return it - logger.info("Inference {} Selected with score {}".format(selected_method, pared_diff_dict[selected_method])) - return selected_method - - if len(remaining_inference_methods) == 0: - raise ValueError("Not able to determine optimal Inference Method for your dataset") - def generate_roc_plot(self, fdr_max=0.2, pdf_filename=None): """ - This method produces a PDF ROC plot overlaying the 4 inference methods apart of the heuristic algorithm + This method produces a PDF ROC plot overlaying the 4 inference methods apart of the heuristic algorithm. Args: - fdr_max (float): Max FDR to display on the plot - pdf_filename (str): Filename to write roc plot to + fdr_max (float): Max FDR to display on the plot. + pdf_filename (str): Filename to write roc plot to. 
Returns: - None + None: """ f = plt.figure() @@ -654,7 +298,7 @@ def generate_roc_plot(self, fdr_max=0.2, pdf_filename=None): target_hits = [x[1] for x in fdr_vs_target_hits] plt.plot(fdrs, target_hits, '-', label=inference_method.replace("_", " ")) target_fdr = self.datastore_dict[inference_method].parameter_file_object.fdr - if inference_method == self.selected_method: + if inference_method in self.selected_methods: best_value = min(fdrs, key=lambda x: abs(x - target_fdr)) best_index = fdrs.index(best_value) best_target_hit_value = target_hits[best_index] # noqa F841 @@ -673,10 +317,11 @@ def generate_roc_plot(self, fdr_max=0.2, pdf_filename=None): def _write_all_results(self, parameters): """ - Internal method that loops over all results and writes them out + Internal method that loops over all results and writes them out. """ for method in list(self.datastore_dict.keys()): - if method == self.selected_method: + datastore = self.datastore_dict[method] + if method in self.selected_methods: inference_method_string = "{}_{}".format(method, "optimal_method") else: inference_method_string = method @@ -684,7 +329,14 @@ def _write_all_results(self, parameters): # If a filename is not provided then construct one using output_directory # Note: output_directory will always get set even if its set as None - gets set to cwd inference_filename = os.path.join( - self.output_directory, "{}_{}".format(inference_method_string, "protein_inference_results.csv") + self.output_directory, + "{}_{}_{}_{}_{}".format( + inference_method_string, + parameters.tag, + datastore.short_protein_score, + datastore.psm_score, + "protein_inference_results.csv", + ), ) if self.output_filename: # If the user specified an output filename then split it apart and insert the inference method @@ -702,26 +354,219 @@ def _write_all_results(self, parameters): def _write_optimal_results(self, parameters): """ - Internal method that writes out the optimized results + Internal method that writes out the 
optimized results. """ - inference_method_string = "{}_{}".format(self.selected_method, "optimal_method") - if not self.output_filename and self.output_directory: - # If a filename is not provided then construct one using output_directory - # Note: output_directory will always get set even if its set as None - gets set to cwd - inference_filename = os.path.join( - self.output_directory, "{}_{}".format(inference_method_string, "protein_inference_results.csv") + for method in self.selected_methods: + datastore = self.datastore_dict[method] + inference_method_string = "{}_{}".format(method, "optimal_method") + if not self.output_filename and self.output_directory: + # If a filename is not provided then construct one using output_directory + # Note: output_directory will always get set even if its set as None - gets set to cwd + inference_filename = os.path.join( + self.output_directory, + "{}_{}_{}_{}_{}".format( + inference_method_string, + parameters.tag, + datastore.short_protein_score, + datastore.psm_score, + "protein_inference_results.csv", + ), + ) + if self.output_filename: + # If the user specified an output filename then split it apart and insert the inference method + # Then reconstruct the file + split = os.path.split(self.output_filename) + path = split[0] + filename = split[1] + inference_filename = os.path.join(path, "{}_{}".format(inference_method_string, filename)) + export = pyproteininference.export.Export(data=self.selected_datastores[method]) + export.export_to_csv( + output_filename=inference_filename, + directory=self.output_directory, + export_type=parameters.export, ) - if self.output_filename: - # If the user specified an output filename then split it apart and insert the inference method - # Then reconstruct the file - split = os.path.split(self.output_filename) - path = split[0] - filename = split[1] - inference_filename = os.path.join(path, "{}_{}".format(inference_method_string, filename)) - export = 
pyproteininference.export.Export(data=self.selected_datastore) - export.export_to_csv( - output_filename=inference_filename, - directory=self.output_directory, - export_type=parameters.export, + + def determine_optimal_inference_method( + self, + false_discovery_rate_threshold=0.05, + upper_empirical_threshold=1, + lower_empirical_threshold=0.5, + pdf_filename=None, + ): + """ + This method determines the optimal inference method from Inclusion, Exclusion, Parsimony, Peptide-Centric. + + Args: + false_discovery_rate_threshold (float): The fdr threshold to use in heuristic algorithm - + This parameter determines the maximum fdr used when creating a range of finite FDR values. + upper_empirical_threshold (float): Upper Threshold used for parsimony/peptide centric cutoff for + the heuristic algorithm. + lower_empirical_threshold (float): Lower Threshold used for inclusion/exclusion cutoff for + the heuristic algorithm. + pdf_filename (str): Filename to write heuristic density plot to. + + + Returns: + list: List of string representations of the recommended inference methods. 
+ + """ + + # Get the number of passing proteins + number_stdev_from_mean_dict = {} + fdrs = [false_discovery_rate_threshold * 0.01 * x for x in range(100)] + for fdr in fdrs: + stdev_from_mean = self.determine_number_stdev_from_mean(false_discovery_rate=fdr) + number_stdev_from_mean_dict[fdr] = stdev_from_mean + + stdev_collection = collections.defaultdict(list) + for fdr in fdrs: + for key in number_stdev_from_mean_dict[fdr]: + stdev_collection[key].append(number_stdev_from_mean_dict[fdr][key]) + + heuristic_scores = self.generate_density_plot( + number_stdevs_from_mean=stdev_collection, pdf_filename=pdf_filename ) + + # Apply conditional statement with lower and upper thresholds + if ( + heuristic_scores[Inference.PARSIMONY] <= lower_empirical_threshold + or heuristic_scores[Inference.PEPTIDE_CENTRIC] <= lower_empirical_threshold + ): + # If parsimony or peptide centric are less than the lower empirical threshold + # Then select the best method of the two + logger.info( + "Either parsimony {} or peptide centric {} pass empirical threshold {}. 
" + "Selecting the best method of the two.".format( + heuristic_scores[Inference.PARSIMONY], + heuristic_scores[Inference.PEPTIDE_CENTRIC], + lower_empirical_threshold, + ) + ) + sub_dict = { + Inference.PARSIMONY: heuristic_scores[Inference.PARSIMONY], + Inference.PEPTIDE_CENTRIC: heuristic_scores[Inference.PEPTIDE_CENTRIC], + } + + if ( + heuristic_scores[Inference.PARSIMONY] <= lower_empirical_threshold + and heuristic_scores[Inference.PEPTIDE_CENTRIC] <= lower_empirical_threshold + ): + # If both are under the threshold return both + selected_methods = [Inference.PARSIMONY, Inference.PEPTIDE_CENTRIC] + + else: + selected_methods = [min(sub_dict, key=sub_dict.get)] + + # If the above condition does not apply + elif ( + heuristic_scores[Inference.EXCLUSION] <= upper_empirical_threshold + or heuristic_scores[Inference.INCLUSION] <= upper_empirical_threshold + ): + # If exclusion or inclusion are less than the upper empirical threshold + # Then select the best method of the two + logger.info( + "Either inclusion {} or exclusion {} pass empirical threshold {}. " + "Selecting the best method of the two.".format( + heuristic_scores[Inference.INCLUSION], + heuristic_scores[Inference.EXCLUSION], + upper_empirical_threshold, + ) + ) + sub_dict = { + Inference.EXCLUSION: heuristic_scores[Inference.EXCLUSION], + Inference.INCLUSION: heuristic_scores[Inference.INCLUSION], + } + + if ( + heuristic_scores[Inference.EXCLUSION] <= upper_empirical_threshold + and heuristic_scores[Inference.INCLUSION] <= upper_empirical_threshold + ): + # If both are under the threshold return both + selected_methods = [Inference.INCLUSION, Inference.EXCLUSION] + + else: + selected_methods = [min(sub_dict, key=sub_dict.get)] + + else: + # If we have no conditional scenarios... 
+ # Select the best method + logger.info("No methods pass empirical thresholds, selecting the best method") + selected_methods = [min(heuristic_scores, key=heuristic_scores.get)] + + logger.info("Method(s) {} selected with the heuristic algorithm".format(", ".join(selected_methods))) + return selected_methods + + def generate_density_plot(self, number_stdevs_from_mean, pdf_filename=None): + """ + This method produces a PDF Density Plot plot overlaying the 4 inference methods part of the heuristic algorithm. + + Args: + number_stdevs_from_mean (dict): a dictionary of the number of standard deviations from the mean per + inference method for a range of FDRs. + pdf_filename (str): Filename to write heuristic density plot to. + + Returns: + dict: a dictionary of heuristic scores per inference method which correlates to the + maximum point of the density plot per inference method. + + """ + f = plt.figure() + + heuristic_scores = {} + for method in number_stdevs_from_mean: + readible_method_name = Inference.INFERENCE_NAME_MAP[method] + kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=40, ec="k", label=readible_method_name) + x, y, _ = plt.hist(number_stdevs_from_mean[method], **kwargs) + center = y[list(x).index(max(x))] + heuristic_scores[method] = abs(center) + + plt.axvline(0, color="black", linestyle='--', alpha=0.75) + plt.title("Density Plot of the Number of Standard Deviations from the Mean") + plt.xlabel('Number of Standard Deviations from the Mean') + plt.ylabel('Number of Observations') + plt.legend(loc='upper right') + if pdf_filename: + logger.info("Writing Heuristic Density plot to: {}".format(pdf_filename)) + f.savefig(pdf_filename) + else: + plt.show() + plt.close() + + logger.info("Heuristic Scores") + logger.info(heuristic_scores) + + return heuristic_scores + + def determine_number_stdev_from_mean(self, false_discovery_rate): + """ + This method calculates the mean of the number of proteins identified at a specific FDR of all + 4 
methods and then for each method calculates the number of standard deviations + from the previous calculated mean. + + Args: + false_discovery_rate (float): The false discovery rate used as a cutoff for calculations. + + Returns: + dict: a dictionary of the number of standard deviations away from the mean per inference method. + + """ + + filtered_protein_objects = { + x: self.datastore_dict[x].get_protein_objects( + fdr_restricted=True, false_discovery_rate=false_discovery_rate + ) + for x in self.datastore_dict.keys() + } + number_passing_proteins = {x: len(filtered_protein_objects[x]) for x in filtered_protein_objects.keys()} + + # Calculate how similar the number of passing proteins is for each method + all_values = [x for x in number_passing_proteins.values()] + mean = numpy.mean(all_values) + standard_deviation = statistics.stdev(all_values) + number_stdev_from_mean_dict = {} + for key in number_passing_proteins.keys(): + cur_value = number_passing_proteins[key] + number_stdev_from_mean_dict[key] = (cur_value - mean) / standard_deviation + + return number_stdev_from_mean_dict diff --git a/pyproteininference/in_silico_digest.py b/pyproteininference/in_silico_digest.py index 4671dc4..4a0fdd5 100644 --- a/pyproteininference/in_silico_digest.py +++ b/pyproteininference/in_silico_digest.py @@ -16,30 +16,30 @@ class Digest(object): """ - The following class handles data storage of in silico digest data from a fasta formatted sequence database + The following class handles data storage of in silico digest data from a fasta formatted sequence database. Attributes: - peptide_to_protein_dictionary (dict): Dictionary of peptides (keys) to protein sets (values) - protein_to_peptide_dictionary (dict): Dictionary of proteins (keys) to peptide sets (values) + peptide_to_protein_dictionary (dict): Dictionary of peptides (keys) to protein sets (values). + protein_to_peptide_dictionary (dict): Dictionary of proteins (keys) to peptide sets (values). 
swiss_prot_protein_set (set): Set of reviewed proteins if they are able to be distinguished from unreviewed - proteins - database_path (str): Path to fasta database file to digest - missed_cleavages (int): The number of missed cleavages to allow + proteins. + database_path (str): Path to fasta database file to digest. + missed_cleavages (int): The number of missed cleavages to allow. id_splitting (bool): True/False on whether or not to split a given regex off identifiers. This is used to split of "sp|" and "tr|" from the database protein strings as sometimes the database will contain those strings while the input data will have the strings split already. - Keep as False unless you know what you are doing + Advanced usage only. reviewed_identifier_symbol (str/None): Identifier that distinguishes reviewed from unreviewed proteins. - Typically this is "sp|". Can also be None type - digest_type (str): can be any value in :attr:`LIST_OF_DIGEST_TYPES` + Typically this is "sp|". Can also be None type. + digest_type (str): can be any value in `LIST_OF_DIGEST_TYPES`. max_peptide_length (int): Max peptide length to keep for analysis. """ TRYPSIN = "trypsin" LYSC = "lysc" - LIST_OF_DIGEST_TYPES = [TRYPSIN, LYSC] + LIST_OF_DIGEST_TYPES = set(parser.expasy_rules.keys()) AA_LIST = [ "A", @@ -75,7 +75,7 @@ def __init__(self): class PyteomicsDigest(Digest): """ - This class represents a pyteomics implementation of an in silico digest + This class represents a pyteomics implementation of an in silico digest. """ def __init__( @@ -92,24 +92,22 @@ def __init__( The input is a fasta database, a protein inference parameter object, and whether or not to split IDs. 
- Further digestion types need to be added in the future other than just trypsin/lysc - - This class sets important attributes for the Digest object such as: :attr:`peptide_to_protein_dictionary`, - :attr:`protein_to_peptide_dictionary`, and :attr:`swiss_prot_protein_set` + This class sets important attributes for the Digest object such as: `peptide_to_protein_dictionary`, + `protein_to_peptide_dictionary`, and `swiss_prot_protein_set`. Args: - database_path (str): Path to fasta database file to digest - digest_type (str): Must be a value in :attr:`LIST_OF_DIGEST_TYPES` + database_path (str): Path to fasta database file to digest. + digest_type (str): Must be a value in `LIST_OF_DIGEST_TYPES`. missed_cleavages (int): Integer that indicates the maximum number of allowable missed cleavages from - the ms search + the ms search. reviewed_identifier_symbol (str/None): Symbol that indicates a reviewed identifier. - If using Uniprot this is typically 'sp|' - max_peptide_length (int): The maximum length of peptides to keep for the analysis + If using Uniprot this is typically 'sp|'. + max_peptide_length (int): The maximum length of peptides to keep for the analysis. id_splitting (bool): True/False on whether or not to split a given regex off identifiers. This is used to split of "sp|" and "tr|" from the database protein strings as sometimes the database will contain those strings while the input data will have the strings split already. - Keep as False unless you know what you are doing + Advanced usage only. 
Example: >>> digest = pyproteininference.in_silico_digest.PyteomicsDigest( @@ -139,11 +137,11 @@ def __init__( def digest_fasta_database(self): """ This method reads in and prepares the fasta database for database digestion and assigns - the several attributes for the Digest object: :attr:`peptide_to_protein_dictionary`, - :attr:`protein_to_peptide_dictionary`, and :attr:`swiss_prot_protein_set` + the several attributes for the Digest object: `peptide_to_protein_dictionary`, + `protein_to_peptide_dictionary`, and `swiss_prot_protein_set`. Returns: - None + None: Example: >>> digest = pyproteininference.in_silico_digest.PyteomicsDigest( @@ -177,7 +175,7 @@ def digest_fasta_database(self): # Handle ID Splitting... if self.id_splitting: identifier_stripped = self.UNIPROT_STR_REGEX.sub("", identifier) - if not self.id_splitting: + else: identifier_stripped = identifier # If reviewed add to sp_set diff --git a/pyproteininference/inference.py b/pyproteininference/inference.py index a5e5afe..c4e93e0 100644 --- a/pyproteininference/inference.py +++ b/pyproteininference/inference.py @@ -1,7 +1,4 @@ -import collections import logging -import os -import subprocess import sys from collections import OrderedDict @@ -26,8 +23,8 @@ class Inference(object): The base Inference class contains several methods that are shared across the Inference sub-classes. Attributes: - data (pyproteininference.datastore.DataStore): Data Class - digest (pyproteininference.in_silico_digest.Digest): Digest Class + data (DataStore): [DataStore object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest object][pyproteininference.in_silico_digest.Digest]. 
""" PARSIMONY = "parsimony" @@ -44,6 +41,14 @@ class Inference(object): PEPTIDE_CENTRIC, ] + INFERENCE_NAME_MAP = { + PARSIMONY: "Parsimony", + INCLUSION: "Inclusion", + EXCLUSION: "Exclusion", + FIRST_PROTEIN: "First Protein", + PEPTIDE_CENTRIC: "Peptide Centric", + } + SUBSET_PEPTIDES = "subset_peptides" SHARED_PEPTIDES = "shared_peptides" NONE_GROUPING = None @@ -51,8 +56,7 @@ class Inference(object): GROUPING_TYPES = [SUBSET_PEPTIDES, SHARED_PEPTIDES, NONE_GROUPING] PULP = "pulp" - GLPK = "glpk" - LP_SOLVERS = [PULP, GLPK] + LP_SOLVERS = [PULP] ALL_SHARED_PEPTIDES = "all" BEST_SHARED_PEPTIDES = "best" @@ -65,11 +69,11 @@ class Inference(object): def __init__(self, data, digest): """ - Initialization method of Inference object + Initialization method of Inference object. Args: - data (pyproteininference.datastore.DataStore): Data Class - digest (pyproteininference.in_silico_digest.Digest): Digest Class + data (DataStore): [DataStore object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest object][pyproteininference.in_silico_digest.Digest]. """ self.data = data @@ -80,9 +84,13 @@ def __init__(self, data, digest): def run_inference(cls, data, digest): """ This class method dispatches to one of the five different inference classes/models - based on input from the protein inference parameter object - :py:class:`pyproteininference.parameters.ProteinInferenceParameter`. - The methods are "parsimony", "inclusion", "exclusion", "peptide_centric", and "first_protein" + based on input from the [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] + object. + The methods are "parsimony", "inclusion", "exclusion", "peptide_centric", and "first_protein". + + Args: + data (DataStore): [DataStore object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest object][pyproteininference.in_silico_digest.Digest]. 
Example: >>> pyproteininference.inference.Inference.run_inference(data=data,digest=digest) @@ -93,7 +101,6 @@ def run_inference(cls, data, digest): logger.info("Running Inference with Inference Type: {}".format(inference_type)) - # For parsimony... Run GLPK setup, runner, grouper... if inference_type == Inference.PARSIMONY: group = Parsimony(data=data, digest=digest) group.infer_proteins() @@ -116,13 +123,13 @@ def run_inference(cls, data, digest): def _create_protein_groups(self, scored_proteins): """ - This method sets up protein groups for inference methods that do not need grouping + This method sets up protein groups for inference methods that do not need grouping. Args: - scored_proteins (list): List of scored :py:class:`pyproteininference.physical.Protein` objects + scored_proteins (list): List of scored [Protein][pyproteininference.physical.Protein] objects. Returns: - list: List of lists of scored :py:class:`pyproteininference.physical.Protein` objects + list: List of lists of scored [Protein][pyproteininference.physical.Protein] objects. """ scored_proteins = sorted( @@ -151,15 +158,15 @@ def _create_protein_groups(self, scored_proteins): def _apply_protein_group_ids(self, grouped_protein_objects): """ This method creates the ProteinGroup objects from the output of - :py:meth:`pyproteininference.inference.Inference_create_protein_groups` + [_create_protein_groups][`pyproteininference.inference.Inference._create_protein_groups]. Args: - grouped_protein_objects (list): list of grouped :py:class:`pyproteininference.physical.Protein` objects + grouped_protein_objects (list): list of grouped [Protein][pyproteininference.physical.Protein] objects. 
Returns: - dict: a Dictionary that contains a list of :py:class:`pyproteininference.physical.ProteinGroup` - objects (key:"group_objects") and a list of grouped :py:class:`pyproteininference.physical.Protein` - objects (key:"grouped_protein_objects") + dict: a Dictionary that contains a list of [ProteinGroup][pyproteininference.physical.ProteinGroup] + objects (key:"group_objects") and a list of grouped [Protein][pyproteininference.physical.Protein] + objects (key:"grouped_protein_objects"). """ @@ -205,22 +212,22 @@ def _apply_protein_group_ids(self, grouped_protein_objects): class Inclusion(Inference): """ Inclusion Inference class. This class contains methods that support the initialization of an - Inclusion inference method + Inclusion inference method. Attributes: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object - scored_data (list): a List of scored Protein objects :py:class:`pyproteininference.physical.Protein` + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + scored_data (list): a List of scored [Protein][pyproteininference.physical.Protein] objects. """ def __init__(self, data, digest): """ - Initialization method of the Inclusion Inference method + Initialization method of the Inclusion Inference method. Args: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. """ self.data = data @@ -233,12 +240,12 @@ def infer_proteins(self): This method performs the grouping for Inclusion. Inclusion actually does not do grouping as all peptides get assigned to all possible proteins - and groups are not created + and groups are not created. 
- This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` and - :py:class:`pyproteininference.physical.ProteinGroup` + This method assigns the variables: `grouped_scored_proteins` and `protein_group_objects`. + These are both variables of the [DataStore Object][pyproteininference.datastore.DataStore] and are + lists of [Protein][pyproteininference.physical.Protein] objects + and [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. """ grouped_proteins = self._create_protein_groups(scored_proteins=self.scored_data) @@ -268,15 +275,15 @@ def infer_proteins(self): def _apply_protein_group_ids(self, grouped_protein_objects): """ This method creates the ProteinGroup objects for the inclusion inference type using protein groups from - :py:meth:`pyproteininference.inference.Inference_create_protein_groups` + [_create_protein_groups][`pyproteininference.inference.Inference._create_protein_groups]. Args: - grouped_protein_objects (list): list of grouped :py:class:`pyproteininference.physical.Protein` objects + grouped_protein_objects (list): list of grouped [Protein][pyproteininference.physical.Protein] objects. Returns: - dict: a Dictionary that contains a list of :py:class:`pyproteininference.physical.ProteinGroup` - objects (key:"group_objects") and a list of grouped :py:class:`pyproteininference.physical.Protein` - objects (key:"grouped_protein_objects") + dict: a Dictionary that contains a list of [ProteinGroup][pyproteininference.physical.ProteinGroup] + objects (key:"group_objects") and a list of + grouped [Protein][pyproteininference.physical.Protein] objects (key:"grouped_protein_objects"). """ @@ -321,22 +328,22 @@ def _apply_protein_group_ids(self, grouped_protein_objects): class Exclusion(Inference): """ Exclusion Inference class. 
This class contains methods that support the initialization of an - Exclusion inference method + Exclusion inference method. Attributes: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object - scored_data (list): a List of scored Protein objects :py:class:`pyproteininference.physical.Protein` + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + scored_data (list): a List of scored [Protein][pyproteininference.physical.Protein] objects. """ def __init__(self, data, digest): """ - Initialization method of the Exclusion Class + Initialization method of the Exclusion Class. Args: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. """ self.data = data @@ -350,12 +357,12 @@ def infer_proteins(self): """ This method performs the Exclusion inference/grouping method. - For the exclusion inference method groups cannot be created because all shared peptides are removed + For the exclusion inference method groups cannot be created because all shared peptides are removed. - This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` and - :py:class:`pyproteininference.physical.ProteinGroup` + This method assigns the variables: `grouped_scored_proteins` and `protein_group_objects`. 
+ These are both variables of the [DataStore Object][pyproteininference.datastore.DataStore] and are + lists of [Protein][pyproteininference.physical.Protein] objects + and [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. """ @@ -386,23 +393,23 @@ def infer_proteins(self): class Parsimony(Inference): """ Parsimony Inference class. This class contains methods that support the initialization of a - Parsimony inference method + Parsimony inference method. Attributes: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object - scored_data (list): a List of scored Protein objects :py:class:`pyproteininference.physical.Protein` - lead_protein_set (set): Set of protein strings that are classified as leads from the LP solver + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + scored_data (list): a List of scored [Protein][pyproteininference.physical.Protein] objects. + lead_protein_set (set): Set of protein strings that are classified as leads from the LP solver. """ def __init__(self, data, digest): """ - Initialization method of the Parsimony object + Initialization method of the Parsimony object. Args: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. 
""" self.data = data self.digest = digest @@ -418,18 +425,18 @@ def _create_protein_groups( grouping_type="shared_peptides", ): """ - Internal method that creates a list of lists of :py:class:`pyproteininference.physical.Protein` - objects for the Parsimony inference object - These list of lists are "groups" and the proteins get grouped them according to grouping_type variable + Internal method that creates a list of lists of [Protein][pyproteininference.physical.Protein] + objects for the Parsimony inference object. + These list of lists are "groups" and the proteins get grouped them according to grouping_type variable. Args: - all_scored_proteins (list): list of :py:class:`pyproteininference.physical.Protein` objects - lead_protein_objects (list): list of :py:class:`pyproteininference.physical.Protein` objects. - Only needed if inference_type=parsimony - grouping_type: (str): One of :attr:`GROUPING_TYPES` + all_scored_proteins (list): list of [Protein][pyproteininference.physical.Protein] objects. + lead_protein_objects (list): list of [Protein][pyproteininference.physical.Protein] objects + Only needed if inference_type=parsimony. + grouping_type: (str): One of `GROUPING_TYPES`. Returns: - list: list of lists of :py:class:`pyproteininference.physical.Protein` objects + list: list of lists of [Protein][pyproteininference.physical.Protein] objects. """ @@ -458,7 +465,7 @@ def _create_protein_groups( try: picked_removed = set([x.identifier for x in self.data.picked_proteins_removed]) except TypeError: - picked_removed = set([]) + picked_removed = set() missing_proteins = set() in_silico_peptides_to_proteins = self.digest.peptide_to_protein_dictionary @@ -564,19 +571,19 @@ def _swissprot_and_isoform_override( """ This internal method creates and reorders protein groups based on criteria such as Reviewed/Unreviewed Identifiers as well as Canonincal/Isoform Identifiers. 
- This method is only used with parsimony inference type + This method is only used with parsimony inference type. Args: - scored_data (list): list of scored :py:class:`pyproteininference.physical.Protein` objects - grouped_proteins: list of grouped :py:class:`pyproteininference.physical.Protein` objects + scored_data (list): list of scored [Protein][pyproteininference.physical.Protein] objects. + grouped_proteins: list of grouped [Protein][pyproteininference.physical.Protein] objects. override_type (str): "soft" or "hard" to indicate Reviewed/Unreviewed override. "soft" is preferred and default. - isoform_override (bool): True/False on whether to favor canonical forms vs isoforms as group leads + isoform_override (bool): True/False on whether to favor canonical forms vs isoforms as group leads. Returns: - dict: a Dictionary that contains a list of :py:class:`pyproteininference.physical.ProteinGroup` objects - (key:"group_objects") and a list of grouped :py:class:`pyproteininference.physical.Protein` - objects (key:"grouped_protein_objects") + dict: a Dictionary that contains a list of [ProteinGroup][pyproteininference.physical.ProteinGroup] objects + (key:"group_objects") and a list of grouped [Protein][pyproteininference.physical.Protein] + objects (key:"grouped_protein_objects"). """ @@ -671,21 +678,21 @@ def _swissprot_and_isoform_override( def _swissprot_override(self, protein_list, leads, grouped_protein_objects, override_type): """ This method re-assigns protein group leads if the lead is an unreviewed protein and if the protein group - contains a reviewed protein that contains the exact same set of peptides as the unreviewed lead - This method is here to provide consistency to the output + contains a reviewed protein that contains the exact same set of peptides as the unreviewed lead. + This method is here to provide consistency to the output. 
Args: - protein_list (list): List of grouped :py:class:`pyproteininference.physical.Protein` objects - leads (set): Set of string protien identifiers that have been identified as a lead - grouped_protein_objects (list): List of protein_list lists - override_type (str): "soft" or "hard" on how to override non reviewed identifiers. "soft" is preferred + protein_list (list): List of grouped [Protein][pyproteininference.physical.Protein] objects. + leads (set): Set of string protein identifiers that have been identified as a lead. + grouped_protein_objects (list): List of protein_list lists. + override_type (str): "soft" or "hard" on how to override non reviewed identifiers. "soft" is preferred. Returns: - dict: leads (set): Set of string protien identifiers that have been identified as a lead. - Updated to reflect lead changes - grouped_protein_objects (list): List of protein_list lists. Updated to reflect lead changes - protein_list (list): List of grouped :py:class:`pyproteininference.physical.Protein` objects. - Updated to reflect lead changes + dict: leads (set): Set of string protein identifiers that have been identified as a lead. + Updated to reflect lead changes. + grouped_protein_objects (list): List of protein_list lists. Updated to reflect lead changes. + protein_list (list): List of grouped [Protein][pyproteininference.physical.Protein] objects. + Updated to reflect lead changes. """ @@ -783,50 +790,49 @@ def _isoform_override(self, protein_list, grouped_protein_objects, leads): """ This method re-assigns protein group leads if the lead is an isoform protein and if the protein group contains a canonical protein that contains the exact same set of peptides as the isoform lead. - This method is here to provide consistency to the output + This method is here to provide consistency to the output. 
Args: - protein_list (list): List of grouped :py:class:`pyproteininference.physical.Protein` objects - leads (set): Set of string protien identifiers that have been identified as a lead - grouped_protein_objects (list): List of protein_list lists + protein_list (list): List of grouped [Protein][pyproteininference.physical.Protein] objects. + leads (set): Set of string protein identifiers that have been identified as a lead. + grouped_protein_objects (list): List of protein_list lists. Returns: - dict: leads (set): Set of string protien identifiers that have been identified as a lead. Updated to - reflect lead changes - grouped_protein_objects (list): List of protein_list lists. Updated to reflect lead changes - protein_list (list): List of grouped :py:class:`pyproteininference.physical.Protein` objects. - Updated to reflect lead changes + dict: leads (set): Set of string protein identifiers that have been identified as a lead. Updated to + reflect lead changes. + grouped_protein_objects (list): List of protein_list lists. Updated to reflect lead changes. + protein_list (list): List of grouped [Protein][pyproteininference.physical.Protein] objects. + Updated to reflect lead changes. """ - if protein_list[0].reviewed: - if self.data.parameter_file_object.isoform_symbol in protein_list[0].identifier: - pure_id = protein_list[0].identifier.split(self.data.parameter_file_object.isoform_symbol)[0] - # Start to loop through protein_list which is the current group... - for potential_replacement in protein_list[1:]: - isoform_override = potential_replacement - if ( - isoform_override.identifier == pure_id - and isoform_override.identifier not in leads - and set(protein_list[0].peptides).issubset(set(isoform_override.peptides)) - ): - isoform_override_index = grouped_protein_objects[-1].index(isoform_override) - cur_iso_lead = grouped_protein_objects[-1][0] - # Re-assigning the value within the index will also reassign the value in protein_list... 
- # This is because grouped_protein_objects[-1] equals protein_list - # So we do not have to reassign values in protein_list - (grouped_protein_objects[-1][0], grouped_protein_objects[-1][isoform_override_index],) = ( - grouped_protein_objects[-1][isoform_override_index], - grouped_protein_objects[-1][0], - ) - grouped_protein_objects[-1][isoform_override_index], grouped_protein_objects[-1][0] - - new_iso_lead = grouped_protein_objects[-1][0] - logger.info( - "Overriding Isoform {} with {}".format(cur_iso_lead.identifier, new_iso_lead.identifier) - ) - leads.add(protein_list[0].identifier) + if self.data.parameter_file_object.isoform_symbol in protein_list[0].identifier: + pure_id = protein_list[0].identifier.split(self.data.parameter_file_object.isoform_symbol)[0] + # Start to loop through protein_list which is the current group... + for potential_replacement in protein_list[1:]: + isoform_override = potential_replacement + if ( + isoform_override.identifier == pure_id + and isoform_override.identifier not in leads + and set(protein_list[0].peptides).issubset(set(isoform_override.peptides)) + ): + isoform_override_index = grouped_protein_objects[-1].index(isoform_override) + cur_iso_lead = grouped_protein_objects[-1][0] + # Re-assigning the value within the index will also reassign the value in protein_list... 
+ # This is because grouped_protein_objects[-1] equals protein_list + # So we do not have to reassign values in protein_list + (grouped_protein_objects[-1][0], grouped_protein_objects[-1][isoform_override_index],) = ( + grouped_protein_objects[-1][isoform_override_index], + grouped_protein_objects[-1][0], + ) + grouped_protein_objects[-1][isoform_override_index], grouped_protein_objects[-1][0] + + new_iso_lead = grouped_protein_objects[-1][0] + logger.info( + "Overriding Isoform {} with {}".format(cur_iso_lead.identifier, new_iso_lead.identifier) + ) + leads.add(protein_list[0].identifier) return_dict = { "leads": leads, @@ -839,14 +845,14 @@ def _isoform_override(self, protein_list, grouped_protein_objects, leads): def _reassign_protein_group_leads(self, protein_group_objects): """ This internal method corrects leads that are improperly assigned in the parsimony inference method. - This method acts on the protein group objects + This method acts on the protein group objects. Args: - protein_group_objects (list): List of :py:class:`pyproteininference.physical.ProteinGroup` objects + protein_group_objects (list): List of [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. Returns: - list: List of :py:class:`pyproteininference.physical.ProteinGroup` objects where leads have been - reassigned properly + protein_group_objects (list): List of [ProteinGroup][pyproteininference.physical.ProteinGroup] objects + where leads have been reassigned properly. """ @@ -863,7 +869,7 @@ def _reassign_protein_group_leads(self, protein_group_objects): # protein c maps to a bunch of peptides and peptide 3 # Therefore, in the model proteins a and b are equivalent in that they map to 2 peptides together - 1 and 2. # peptide 3 maps to a but also to c... - # Sometimes the model (glpk) will spit out protein b as the lead... we wish to swap protein b as the lead with + # Sometimes the model (pulp) will spit out protein b as the lead... 
we wish to swap protein b as the lead with # protein a because it will likely have a better score... logger.info("Potentially Reassigning Protein Group leads...") lead_protein_set = set([x.proteins[0].identifier for x in protein_group_objects]) @@ -897,7 +903,6 @@ def _reassign_protein_group_leads(self, protein_group_objects): protein_group_objects[i].proteins[0], protein_group_objects[i].proteins[j], ) = (new_lead, old_lead) - logger.info(j) break if higher_or_lower == datastore.DataStore.LOWER_PSM_SCORE: @@ -931,14 +936,14 @@ def _reassign_protein_group_leads(self, protein_group_objects): def _reassign_protein_list_leads(self, grouped_protein_objects): """ This internal method corrects leads that are improperly assigned in the parsimony inference method. - This method acts on the grouped protein objects + This method acts on the grouped protein objects. Args: - grouped_protein_objects (list): List of :py:class:`pyproteininference.physical.Protein` objects + grouped_protein_objects (list): List of [Protein][pyproteininference.physical.Protein] objects. Returns: - list: List of :py:class:`pyproteininference.physical.Protein` objects where leads have been - reassigned properly + list: List of [Protein][pyproteininference.physical.Protein] objects where leads have been + reassigned properly. """ @@ -955,7 +960,7 @@ def _reassign_protein_list_leads(self, grouped_protein_objects): # protein c maps to a bunch of peptides and peptide 3 # Therefore, in the model proteins a and b are equivalent in that they map to 2 peptides together - 1 and 2. # peptide 3 maps to a but also to c... - # Sometimes the model (glpk) will spit out protein b as the lead... we wish to swap protein b as the lead with + # Sometimes the model (pulp) will spit out protein b as the lead... we wish to swap protein b as the lead with # protein a because it will likely have a better score... 
logger.info("Potentially Reassigning Proteoin List leads...") lead_protein_set = set([x[0].identifier for x in grouped_protein_objects]) @@ -989,7 +994,6 @@ def _reassign_protein_list_leads(self, grouped_protein_objects): grouped_protein_objects[i][0], grouped_protein_objects[i][j], ) = (new_lead, old_lead) - logger.info(j) break if higher_or_lower == datastore.DataStore.LOWER_PSM_SCORE: @@ -1020,349 +1024,15 @@ def _reassign_protein_list_leads(self, grouped_protein_objects): return grouped_protein_objects - def _setup_glpk(self, glpkin_filename="glpkin.mod"): - """ - This internal method is used to setup the glpk file for lp analysis. - - The Bulk of the glpk input file looks as follows: - s.t. c1: y[5658] >=1; - s.t. c2: y[14145]+y[4857]+y[4858]+y[10143]+y[2966] >=1; - s.t. c3: y[320]+y[4893]+y[4209]+y[911]+y[2767]+y[2296]+y[10678]+y[3545] >=1 - - Where each new line is a peptide and each y[x] is a protein that maps to said peptide - - Args: - glpkin_filename (str): path to the filename to be used by glpsol - - Returns: - None: - """ - - # Here we get the peptide to protein dictionary - pep_prot_dict = self.data.peptide_to_protein_dictionary() - - # Here we get the protein to peptide dictionary... - self.data.protein_to_peptide_dictionary() - - identifiers_sorted = self.data.get_sorted_identifiers(scored=True) - - # Get all the proteins that we scored and the ones picked if picker was ran... - data_proteins = sorted([x for x in self.data.protein_peptide_dictionary.keys() if x in identifiers_sorted]) - # Get the set of peptides for each protein... - data_peptides = [set(self.data.protein_peptide_dictionary[x]) for x in data_proteins] - flat_peptides_in_data = set([item for sublist in data_peptides for item in sublist]) - peptide_sets = [] - # Loop over the list of peptides... - for k in range(len(data_peptides)): - raw_peptides = data_peptides[k] - peptide_set = set() - # Loop over each individual peptide per protein... 
- for peps in raw_peptides: - # Remove mods... - new_peptide = Psm.remove_peptide_mods(peps) - # Add it to a temporary set... - peptide_set.add(new_peptide) - # Append this set to a new list... - peptide_sets.append(peptide_set) - # Set that proteins peptides to be the unmodified ones... - data_peptides[k] = peptide_set - - # Get them all... - all_peptides = [x for x in data_peptides] - # Remove redundant sets... - non_redundant_peptide_sets = [set(i) for i in OrderedDict.fromkeys(frozenset(item) for item in peptide_sets)] - - # Loop over the restricted list of peptides... - ind_list = [] - for peptide_set in non_redundant_peptide_sets: - # Get its index in terms of the overall list... - ind_list.append(all_peptides.index(peptide_set)) - - # Get the protein based on the index - restricted_proteins = [data_proteins[x] for x in range(len(data_peptides)) if x in ind_list] - - # Here we get the list of all proteins - plist = [] - for peps in pep_prot_dict.keys(): - for prots in list(pep_prot_dict[peps]): - if prots in restricted_proteins and peps in flat_peptides_in_data: - plist.append(prots) - - # Here we get the unique proteins - unique_prots = list(set(plist).union()) - unique_protein_set = set(unique_prots) - - unique_prots_sorted = [x for x in identifiers_sorted if x in unique_prots] - - if len(unique_prots) != len(unique_prots_sorted): - raise ValueError("Sorted proteins length is not equal to unsorted length...") - - # Setup default dictionaries - dd_num = collections.defaultdict(list) - dd_prot_nums = collections.defaultdict(list) - - # For all the unique proteins from the search create a number to protein dictionary and a protein to - # number dictionary - # Here we essentially assign a number to each protein - # This is important as the glpk analysis just treats proteins as numbers... 
- for peptide_set in range(len(unique_prots_sorted)): - dd_num[unique_prots_sorted[peptide_set]].append(peptide_set) - dd_prot_nums[peptide_set].append(unique_prots_sorted[peptide_set]) - - # Store this data as glpk_number_protein_dictionary - # The numbers are important as they are used in the GLPK input and we need to know what number in the GLPK - # output corresponds with which protein from the search - self.glpk_number_protein_dictionary = dd_prot_nums - # Create the GLPK input file - fileout = open(glpkin_filename, "w") - - # Not sure if this header string is correct or if it needs to be here... - fileout.write("/* sets */" + "\n" + "set PROTEINS;" + "\n" + "\n" + "\n") - fileout.write("/* decision variables: yi, i in {1,..,5}. yi = 1 -> protein i is selected */" + "\n") - fileout.write("var y {i in PROTEINS} binary >=0;" + "\n") - fileout.write("/* objective function */" + "\n") - fileout.write("minimize z: sum{i in PROTEINS} y[i];" + "\n" + "\n") - fileout.write("/* Constraints */" + "\n") - - # Here we create the bulk of the input file which needs to look as follows: - # s.t. c1: y[5658] >=1; - # s.t. c2: y[14145]+y[4857]+y[4858]+y[10143]+y[2966] >=1; - # s.t. c3: y[320]+y[4893]+y[4209]+y[911]+y[2767]+y[2296]+y[10678]+y[3545] >=1; - # Each of the lines (constants, c1,c2,c3) is a peptide and each of the y[x] is a protein - total_sorted_peptides = sorted(list(set(flat_peptides_in_data))) # Sort peptides alphabetically first... - for j in range(len(total_sorted_peptides)): - peptides_glpsol_format = [ - "y[{}]".format(dd_num[x][0]) - for x in sorted(pep_prot_dict[total_sorted_peptides[j]]) - if x in unique_protein_set - ] - fileout.write("s.t. 
c" + str(j + 1) + ": " + "+".join(peptides_glpsol_format) + " >=1;" + "\n") - - # Finish writing the rest of the file and close it - fileout.write("\n") - fileout.write("data;" + "\n") - numlist = [str(dd_num[x][0]) for x in sorted(unique_prots)] - strlist = " ".join(numlist) - # End the file with listing the entire set of proteins... (as its number identifier) - fileout.write("set PROTEINS := " + strlist + " ;" + "\n" + "\n") - - fileout.write("end;") - fileout.close() - - def _glpk_runner( - self, - path_to_glpsol="glpsol", - glpkin="glpkin.mod", - glpkout="glpkout.sol", - skip_running=False, - ): - """ - This internal method handles running glpsol on the commandline to solve the linear programming problem for - parsimony - - Args: - path_to_glpsol (str): Path to glpsol on the system - glpkin (str): Path to the input file for glpsol - glpkout (str): Path to the output file for glpsol to write - skip_running (bool): True/False on skipping running glpsol - - Returns: - None - """ - - # If there is no file_override (mainly for offline testing) - # Run GLPK with the following command - if not skip_running: - p = subprocess.Popen( - "{} -m {} -o {}".format(path_to_glpsol, glpkin, glpkout), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - ) - output = p.communicate() - - logger.debug("Start Command line Stdout") - logger.debug(output[0]) - logger.debug("End Command line Stdout") - logger.debug("Start Command line Stderr") - logger.debug(output[1]) - logger.debug("End Command line Stderr") - - if output[0] == "": - raise ValueError("Glpk did not produce any output... See potential error output above") - else: - logger.info("Not running GLPK, File {} will be used downstream in grouping".format(glpkout)) - - # Define indicies better and make this code more readable... - # Define indicies in init and show commented examples of how the data looks... 
- - def _glpk_grouper( - self, - swissprot_override="soft", - glpksolution_filename="glpkout.sol", - ): - """ - This internal functions takes the output from glpsol and translates that into lead protein strings and then - finally lead protein objects. - - This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` and - :py:class:`pyproteininference.physical.ProteinGroup` - - The subsequent lead proteins then have proteins assigned to them through grouping - - Finally, swissprot_override is ran. swissprot override is a lead protein override for naming convention which - can have 2 options: "soft", or "hard". - - selecting "soft" will cycle through all protein leads and groups. If a protein lead is an unreviewed hit - (trembl) then all proteins in the lead proteins group are inspected. - If any of these intra group proteins are a reviewed hit (swissprot) and the lead trembl proteins peptides - are a complete subset of the swissprot proteins peptides, then we select - to swap the trembl and swissprot hits so that the swissprot is now the new lead protein. - - We opt to use this soft override scheme as default because given the redundancy of our databases. - Out of GLPK we see a lot of unreviewed hits being called lead proteins. - If proteins share the same peptides or share a very similar set of peptides we are unsure how GLPK selects - which protein to supply as the lead. - As such, we get many reviewed and unreviewed proteins as lead proteins. - Performing this "soft" override switches many of these lead trembl hits to reviewed swissprot hits. - This is important as group members are used to seeing swissprot identifiers as opposed to trembl identifiers - - selecting "hard" will perform the same swapping as the "soft" override. 
However, - the "hard" override will swap a trembl lead with the swissprot protein with the highest number of peptides in - its group (so long as its already not a lead protein itself) - even if the unreviewed has a unique peptide relative to the peptides of all proteins in its group. - This setting is not recommended given that you can potentially lose out on Unique peptides - - Args: - swissprot_override (str): "soft" or "hard". Should be kept as "soft" but see the docs for an explanation - glpksolution_filename(str): Path to the output of glpsol - - """ - - scored_data = self.data.get_protein_data() - - glpk_out = open(glpksolution_filename, "r") - - # Get the number protein dictionary from glpk_setup - dd_prot_nums = self.glpk_number_protein_dictionary - - glpk_out = glpk_out.read() - glpk_out = glpk_out.split("\n") - - # Cant find a better way to do this... there are modules out there that work with glpk... - start = glpk_out.index(" No. Column name Activity Lower bound Upper bound") - - restricted_glpk_out = [] - # Fix this -13 and +2... not really sure how - # Based on the output file we should start two lines after the start index and stop 13 before the end of the - # file... - for lines in range(start + 2, len(glpk_out) - 13): - split_lines = [x.strip() for x in glpk_out[lines].split(" ")] - restricted_content = [] - for content in split_lines: - if content != "": - restricted_content.append(content) - restricted_glpk_out.append(restricted_content) - - # Use 1 here as 1 is the location in the line of the protein number - # 3 is the location of the binary (which indicates whether or not the protein number has a unique peptide - # making it a lead protein) - numbers = [x[1].split("]")[0].split("[")[-1] for x in restricted_glpk_out] - binary = [x[3] for x in restricted_glpk_out] - - self.numbers = numbers - - # Here we extract out the lead proteins... 
- lead_proteins = [] - for k in range(len(numbers)): - if binary[k] == "1": - try: - passing_protein_number = int(numbers[k]) - lead_proteins.append(dd_prot_nums[passing_protein_number][0]) - except IndexError: - logger.warning("No Protein for Protein Number {}".format(passing_protein_number)) - - lead_protein_set = set(lead_proteins) - self.lead_protein_set = lead_protein_set - - logger.info("Number of lead proteins = {}".format(len(lead_proteins))) - - scored_proteins = list(scored_data) - protein_finder = [x.identifier for x in scored_proteins] - - lead_protein_objects = [] - lead_protein_identifiers = [] - for proteins in lead_proteins: - if proteins in protein_finder: - p_ind = protein_finder.index(proteins) - protein_object = scored_proteins[p_ind] - lead_protein_objects.append(protein_object) - lead_protein_identifiers.append(protein_object.identifier) - else: - # Why are some proteins not being found when we run exclusion??? - logger.warning("Protein {} not found with protein finder...".format(proteins)) - - self.lead_protein_objects = lead_protein_objects - - # Now we have the lead Proteins so we need to get the peptides for each lead protein - # Then we find all proteins that share at least 1 peptide with each lead protein - # If they share at least 1 peptide then assign that protein to the group... 
- - grouped_proteins = self._create_protein_groups( - all_scored_proteins=scored_data, - lead_protein_objects=self.lead_protein_objects, - grouping_type=self.data.parameter_file_object.grouping_type, - ) - - regrouped_proteins = self._swissprot_and_isoform_override( - scored_data=scored_data, - grouped_proteins=grouped_proteins, - override_type=swissprot_override, - isoform_override=True, - ) - - grouped_protein_objects = regrouped_proteins["grouped_protein_objects"] - protein_group_objects = regrouped_proteins["group_objects"] - - hl = self.data.higher_or_lower() - - logger.info("Sorting Results based on lead Protein Score") - grouped_protein_objects = datastore.DataStore.sort_protein_objects( - grouped_protein_objects=grouped_protein_objects, higher_or_lower=hl - ) - protein_group_objects = datastore.DataStore.sort_protein_group_objects( - protein_group_objects=protein_group_objects, higher_or_lower=hl - ) - - # Run lead reassignment for the group objets and protein objects - protein_group_objects = self._reassign_protein_group_leads( - protein_group_objects=protein_group_objects, - ) - - grouped_protein_objects = self._reassign_protein_list_leads(grouped_protein_objects=grouped_protein_objects) - - logger.info("Re Sorting Results based on lead Protein Score") - grouped_protein_objects = datastore.DataStore.sort_protein_objects( - grouped_protein_objects=grouped_protein_objects, higher_or_lower=hl - ) - protein_group_objects = datastore.DataStore.sort_protein_group_objects( - protein_group_objects=protein_group_objects, higher_or_lower=hl - ) - - self.data.grouped_scored_proteins = grouped_protein_objects - self.data.protein_group_objects = protein_group_objects - def _pulp_grouper(self): """ This internal function uses pulp to solve the lp problem for parsimony then performs protein grouping with the - various internal grouping functions + various internal grouping functions. 
- This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` - and :py:class:`pyproteininference.physical.ProteinGroup` + This method assigns the variables: `grouped_scored_proteins` and `protein_group_objects`. + These are both variables of the [DataStore object][pyproteininference.datastore.DataStore] and are + lists of [Protein][pyproteininference.physical.Protein] objects + and [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. """ @@ -1425,7 +1095,7 @@ def _pulp_grouper(self): unique_prots_sorted = [x for x in identifiers_sorted if x in unique_prots] # Define the protein variables with a lower bound of 0 and catgeory Integer - prots = pulp.LpVariable.dicts("prot", indexs=unique_prots_sorted, lowBound=0, cat="Integer") + prots = pulp.LpVariable.dicts("prot", indices=unique_prots_sorted, lowBound=0, cat="Integer") # Define our Lp Problem which is to Minimize our objective function prob = pulp.LpProblem("Parsimony_Problem", pulp.LpMinimize) @@ -1518,63 +1188,21 @@ def _pulp_grouper(self): self.data.grouped_scored_proteins = grouped_protein_objects self.data.protein_group_objects = protein_group_objects - def infer_proteins(self, glpkinout_directory="glpkinout", skip_running_glpk=False): + def infer_proteins(self): """ - This method performs the Parsimony inference method and either uses pulp or glpk based on the - :py:class:`pyproteininference.parameters.ProteinInferenceParameter` object - - This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` and - :py:class:`pyproteininference.physical.ProteinGroup` + This method performs the Parsimony inference method and 
uses pulp for the LP solver. - Args: - glpkinout_directory (str): Directory to use for writing glpsol files. Only used if lp_solver is glpk - skip_running_glpk (bool): True/False for skipping the running of glpk. Only used if lp_solver is glpk + This method assigns the variables: `grouped_scored_proteins` and `protein_group_objects`. + These are both variables of the [DataStore object][pyproteininference.datastore.DataStore] and are + lists of [Protein][pyproteininference.physical.Protein] objects + and [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. - Returns: - None: """ if self.parameter_file_object.lp_solver == self.PULP: self._pulp_grouper() - elif self.parameter_file_object.lp_solver == self.GLPK: - - try: - os.mkdir(glpkinout_directory) - except OSError: - logger.warning("Directory {} cannot be created or already exists".format(glpkinout_directory)) - - self._setup_glpk( - glpkin_filename=os.path.join( - glpkinout_directory, - "glpkin_{}.mod".format(self.parameter_file_object.tag), - ) - ) - - self._glpk_runner( - path_to_glpsol=self.parameter_file_object.glpk_path, - glpkin=os.path.join( - glpkinout_directory, - "glpkin_{}.mod".format(self.parameter_file_object.tag), - ), - glpkout=os.path.join( - glpkinout_directory, - "glpkout_{}.sol".format(self.parameter_file_object.tag), - ), - skip_running=skip_running_glpk, - ) - - self._glpk_grouper( - swissprot_override="soft", - glpksolution_filename=os.path.join( - glpkinout_directory, - "glpkout_{}.sol".format(self.parameter_file_object.tag), - ), - ) - else: raise ValueError( "Parsimony cannot run if lp_solver parameter value is not one of the following: {}".format( @@ -1647,21 +1275,21 @@ def _assign_shared_peptides(self, shared_pep_type="all"): class FirstProtein(Inference): """ FirstProtein Inference class. This class contains methods that support the initialization of a - FirstProtein inference method + FirstProtein inference method. 
Attributes: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. """ def __init__(self, data, digest): """ - FirstProtein Inference initialization method + FirstProtein Inference initialization method. Args: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. Returns: object: @@ -1674,12 +1302,12 @@ def __init__(self, data, digest): def infer_proteins(self): """ - This method performs the First Protein inference method + This method performs the First Protein inference method. - This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` and - :py:class:`pyproteininference.physical.ProteinGroup` + This method assigns the variables: `grouped_scored_proteins` and `protein_group_objects`. + These are both variables of the [DataStore object][pyproteininference.datastore.DataStore] and are + lists of [Protein][pyproteininference.physical.Protein] objects + and [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. """ @@ -1711,21 +1339,21 @@ def infer_proteins(self): class PeptideCentric(Inference): """ PeptideCentric Inference class. This class contains methods that support the initialization of a - PeptideCentric inference method + PeptideCentric inference method. 
Attributes: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. """ def __init__(self, data, digest): """ - PeptideCentric Inference initialization method + PeptideCentric Inference initialization method. Args: - data (pyproteininference.datastore.DataStore): Data Object - digest (pyproteininference.in_silico_digest.Digest): Digest Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. Returns: object: @@ -1737,15 +1365,15 @@ def __init__(self, data, digest): def infer_proteins(self): """ - This method performs the Peptide Centric inference method + This method performs the Peptide Centric inference method. - This method assigns the variables: :attr:`grouped_scored_proteins` and :attr:`protein_group_objects` - These are both variables of the :py:class:`pyproteininference.datastore.DataStore` and are - lists of :py:class:`pyproteininference.physical.Protein` and - :py:class:`pyproteininference.physical.ProteinGroup` + This method assigns the variables: `grouped_scored_proteins` and `protein_group_objects`. + These are both variables of the [DataStore object][pyproteininference.datastore.DataStore] and are + lists of [Protein][pyproteininference.physical.Protein] objects + and [ProteinGroup][pyproteininference.physical.ProteinGroup] objects. 
Returns: - None + None: """ @@ -1772,12 +1400,12 @@ def infer_proteins(self): def _apply_protein_group_ids(self): """ This method creates the ProteinGroup objects for the peptide_centric inference based on protein groups - from :py:meth:`pyproteininference.inference.Inference_create_protein_groups` + from [._create_protein_groups][pyproteininference.inference.Inference._create_protein_groups]. Returns: - dict: a Dictionary that contains a list of :py:class:`pyproteininference.physical.ProteinGroup` - objects (key:"group_objects") and a list of grouped :py:class:`pyproteininference.physical.Protein` - objects (key:"grouped_protein_objects") + dict: a Dictionary that contains a list of [ProteinGroup][pyproteininference.physical.ProteinGroup] + objects (key:"group_objects") and a list of grouped [Protein][pyproteininference.physical.Protein] + objects (key:"grouped_protein_objects"). """ diff --git a/pyproteininference/parameters.py b/pyproteininference/parameters.py index e8a78d4..1a98267 100644 --- a/pyproteininference/parameters.py +++ b/pyproteininference/parameters.py @@ -20,50 +20,50 @@ class ProteinInferenceParameter(object): """ - Class that handles data retrieval, storage, and validation of Protein Inference Parameters + Class that handles data retrieval, storage, and validation of Protein Inference Parameters. Attributes: - yaml_param_filepath (str): path to properly formatted parameter file specific to Protein Inference - digest_type (str): String that determines that type of in silico digestion for - :py:class:`pyproteininference.in_silico_digest.Digest`. Typically "trypsin" - export (str): String to indicate the export type for :py:class:`pyproteininference.export.Export`. 
- Typically this is "psms", "peptides", or "psm_ids" - fdr (float): Float to indicate FDR filtering - glpk_path (str): Path to local installation of glpsol if inference_type="parsimony" and lp_solver="glpk" + yaml_param_filepath (str): path to properly formatted parameter file specific to Protein Inference. + digest_type (str): String that determines the type of in silico digestion for + [Digest object][pyproteininference.in_silico_digest.Digest]. Typically "trypsin". + export (str): String to indicate the export type for [Export object][pyproteininference.export.Export]. + Typically this is "psms", "peptides", or "psm_ids". + fdr (float): Float to indicate FDR filtering. missed_cleavages (int): Integer to determine the number of missed cleavages in the database digestion - :py:class:`pyproteininference.in_silico_digest.Digest` - picker (bool): True/False on whether or not to run the protein picker algorithm - :py:meth:pyproteininference.datastore.DataStore.protein_picker` + [Digest object][pyproteininference.in_silico_digest.Digest]. + picker (bool): True/False on whether or not to run + the [protein picker][pyproteininference.datastore.DataStore.protein_picker] algorithm. restrict_pep (float/None): Float to restrict the posterior error probability values by in the PSM input. - Used in :py:meth:pyproteininference.datastore.DataStore.restrict_psm_data` + Used in [restrict_psm_data][pyproteininference.datastore.DataStore.restrict_psm_data]. restrict_peptide_length (int/None): Float to restrict the peptide length values by in the PSM input. - Used in :py:meth:pyproteininference.datastore.DataStore.restrict_psm_data` + Used in [restrict_psm_data][pyproteininference.datastore.DataStore.restrict_psm_data]. restrict_q (float/None): Float to restrict the q values by in the PSM input. - Used in :py:meth:pyproteininference.datastore.DataStore.restrict_psm_data` + Used in [restrict_psm_data][pyproteininference.datastore.DataStore.restrict_psm_data]. 
restrict_custom (float/None): Float to restrict the custom values by in the PSM input. - Used in :py:meth:pyproteininference.datastore.DataStore.restrict_psm_data`. + Used in [restrict_psm_data][pyproteininference.datastore.DataStore.restrict_psm_data]. Filtering depends on score_type variable. If score_type is multiplicative then values that are less than restrict_custom are kept. If score_type is additive then values that are more than restrict_custom are kept. protein_score (str): String to determine the way in which Proteins are scored can be any of the SCORE_METHODS - in :py:class:`pyproteininference.scoring.Score` + in [Score object][pyproteininference.scoring.Score]. psm_score_type (str): String to determine the type of score that the PSM scores are - (Additive or Multiplicative) can be any of the SCORE_TYPES in :py:class:`pyproteininference.scoring.Score` - decoy_symbol (str): String to denote decoy proteins from target proteins. IE "##" - isoform_symbol (str): String to denote isoforms from regular proteins. IE "-". Can also be None + (Additive or Multiplicative) can be any of the SCORE_TYPES + in [Score object][pyproteininference.scoring.Score]. + decoy_symbol (str): String to denote decoy proteins from target proteins. IE "##". + isoform_symbol (str): String to denote isoforms from regular proteins. IE "-". Can also be None. reviewed_identifier_symbol (str): String to denote a "Reviewed" Protein. Typically this is: "sp|" - if using Uniprot Fasta database + if using Uniprot Fasta database. inference_type (str): String to determine the inference procedure. Can be any value of INFERENCE_TYPES - of :py:class:`pyproteininference.inference.Inference` object - tag (str): String to be added to output files + of [Inference object][pyproteininference.inference.Inference]. + tag (str): String to be added to output files. psm_score (str): String that indicates the PSM input score. 
The value should match the string in the input data of the score you want to use for PSM score. This score will be used in scoring methods - here: :py:class:`pyproteininference.scoring.Score` + here: [Score object][pyproteininference.scoring.Score]. grouping_type (str/None): String to determine the grouping procedure. Can be any value of - GROUPING_TYPES of :py:class:`pyproteininference.inference.Inference` object + GROUPING_TYPES of [Inference object][pyproteininference.inference.Inference]. max_identifiers_peptide_centric (int): Maximum number of identifiers to assign to a group when running peptide_centric inference. Typically this is 10 or 5. lp_solver (str/None): The LP solver to use if inference_type="Parsimony". - Can be any value in LP_SOLVERS in the :py:class:`pyproteininference.inference.Inference` object + Can be any value in LP_SOLVERS in the [Inference object][pyproteininference.inference.Inference]. """ @@ -144,12 +144,10 @@ class ProteinInferenceParameter(object): DIGEST_SUB_KEYS = {DIGEST_TYPE_PARAMETER, MISSED_CLEAV_PARAMETER} LP_SOLVER_PARAMETER = "lp_solver" - GLPK_PATH_PARAMETER = "glpk_path" SHARED_PEPTIDES_PARAMETER = "shared_peptides" PARSIMONY_SUB_KEYS = { LP_SOLVER_PARAMETER, - GLPK_PATH_PARAMETER, SHARED_PEPTIDES_PARAMETER, } @@ -160,7 +158,6 @@ class ProteinInferenceParameter(object): DEFAULT_DIGEST_TYPE = "trypsin" DEFAULT_EXPORT = "peptides" DEFAULT_FDR = 0.01 - DEFAULT_GLPK_PATH = "glpsol" DEFAULT_MISSED_CLEAVAGES = 3 DEFAULT_PICKER = True DEFAULT_RESTRICT_PEP = 0.9 @@ -181,14 +178,14 @@ class ProteinInferenceParameter(object): DEFAULT_SHARED_PEPTIDES = "all" def __init__(self, yaml_param_filepath, validate=True): - """Class to store Protein Inference parameter information as an object + """Class to store Protein Inference parameter information as an object. 
Args: - yaml_param_filepath (str): path to properly formatted parameter file specific to Protein Inference - validate (bool): True/False on whether to validate the parameter file of interest + yaml_param_filepath (str): path to properly formatted parameter file specific to Protein Inference. + validate (bool): True/False on whether to validate the parameter file of interest. Returns: - None + None: Example: >>> pyproteininference.parameters.ProteinInferenceParameter( @@ -201,7 +198,6 @@ def __init__(self, yaml_param_filepath, validate=True): self.digest_type = self.DEFAULT_DIGEST_TYPE self.export = self.DEFAULT_EXPORT self.fdr = self.DEFAULT_FDR - self.glpk_path = self.DEFAULT_GLPK_PATH self.missed_cleavages = self.DEFAULT_MISSED_CLEAVAGES self.picker = self.DEFAULT_PICKER self.restrict_pep = self.DEFAULT_RESTRICT_PEP @@ -232,17 +228,14 @@ def __init__(self, yaml_param_filepath, validate=True): def convert_to_object(self): """ Function that takes a Protein Inference parameter file and converts it into a ProteinInferenceParameter object - by assigning all Attributes of the ProteinInferenceParameter object + by assigning all Attributes of the ProteinInferenceParameter object. - If no parameter filepath is supplied the parameter object will be loaded with default params + If no parameter filepath is supplied the parameter object will be loaded with default params. - This function gets ran in the initilization of the ProteinInferenceParameter object - - Args: - None + This function gets run in the initialization of the ProteinInferenceParameter object. 
Returns: - None + None: """ if self.yaml_param_filepath: @@ -265,12 +258,6 @@ def convert_to_object(self): self.fdr = yaml_params[self.PARENT_PARAMETER_KEY][self.GENERAL_PARAMETER_KEY][self.FDR_PARAMETER] except KeyError: logger.warning("fdr set to default of {}".format(self.DEFAULT_FDR)) - try: - self.glpk_path = yaml_params[self.PARENT_PARAMETER_KEY][self.PARSIMONY_PARAMETER_KEY][ - self.GLPK_PATH_PARAMETER - ] - except KeyError: - logger.warning("glpk_path set to default of {}".format(self.DEFAULT_GLPK_PATH)) try: self.missed_cleavages = yaml_params[self.PARENT_PARAMETER_KEY][self.DIGEST_PARAMETER_KEY][ self.MISSED_CLEAV_PARAMETER @@ -355,7 +342,7 @@ def convert_to_object(self): self.INFERENCE_TYPE_PARAMETER ] except KeyError: - logger.warning("inference_Type set to default of {}".format(self.DEFAULT_INFERENCE_TYPE)) + logger.warning("inference_type set to default of {}".format(self.DEFAULT_INFERENCE_TYPE)) try: self.tag = yaml_params[self.PARENT_PARAMETER_KEY][self.GENERAL_PARAMETER_KEY][self.TAG_PARAMETER] @@ -406,10 +393,10 @@ def convert_to_object(self): def validate_parameters(self): """ - Class method to validate all parameters + Class method to validate all parameters. Returns: - None + None: """ # Run all of the parameter validations @@ -429,7 +416,7 @@ def validate_parameters(self): def _validate_digest_type(self): """ - Internal ProteinInferenceParameter method to validate the digest type + Internal ProteinInferenceParameter method to validate the digest type. """ # Make sure we have a valid digest type if self.digest_type in PyteomicsDigest.LIST_OF_DIGEST_TYPES: @@ -443,7 +430,7 @@ def _validate_digest_type(self): def _validate_export_type(self): """ - Internal ProteinInferenceParameter method to validate the export type + Internal ProteinInferenceParameter method to validate the export type. 
""" # Make sure we have a valid export type if self.export in Export.EXPORT_TYPES: @@ -458,7 +445,7 @@ def _validate_export_type(self): def _validate_floats(self): """ - Internal ProteinInferenceParameter method to validate floats + Internal ProteinInferenceParameter method to validate floats. """ # Validate that FDR, cleavages, and restrict values are all floats and or ints if they need to be @@ -534,7 +521,7 @@ def _validate_floats(self): def _validate_bools(self): """ - Internal ProteinInferenceParameter method to validate the bools + Internal ProteinInferenceParameter method to validate the bools. """ # Make sure picker is a bool if type(self.picker) == bool: @@ -549,7 +536,7 @@ def _validate_bools(self): def _validate_score_method(self): """ - Internal ProteinInferenceParameter method to validate the score method + Internal ProteinInferenceParameter method to validate the score method. """ # Make sure we have the score method defined in code to use... if self.protein_score in Score.SCORE_METHODS: @@ -564,7 +551,7 @@ def _validate_score_method(self): def _validate_score_type(self): """ - Internal ProteinInferenceParameter method to validate the score type + Internal ProteinInferenceParameter method to validate the score type. """ # Make sure score type is multiplicative or additive if self.psm_score_type in Score.SCORE_TYPES: @@ -579,7 +566,7 @@ def _validate_score_type(self): def _validate_score_combination(self): """ - Internal ProteinInferenceParameter method to validate combination of score method and score type + Internal ProteinInferenceParameter method to validate combination of score method and score type. """ # Check to see if combination of score (column), method(multiplicative log, additive), # and score type (multiplicative/additive) is possible... 
@@ -608,7 +595,7 @@ def _validate_score_combination(self): def _validate_inference_type(self): """ - Internal ProteinInferenceParameter method to validate the inference type + Internal ProteinInferenceParameter method to validate the inference type. """ # Check if its parsimony, exclusion, inclusion, none if self.inference_type in Inference.INFERENCE_TYPES: @@ -622,7 +609,7 @@ def _validate_inference_type(self): def _validate_grouping_type(self): """ - Internal ProteinInferenceParameter method to validate the grouping type + Internal ProteinInferenceParameter method to validate the grouping type. """ # Check if its parsimony, exclusion, inclusion, none if self.grouping_type in Inference.GROUPING_TYPES: @@ -641,7 +628,7 @@ def _validate_grouping_type(self): def _validate_max_id(self): """ - Internal ProteinInferenceParameter method to validate the max peptide centric id + Internal ProteinInferenceParameter method to validate the max peptide centric id. """ # Check if max_identifiers_peptide_centric param is an INT if type(self.max_identifiers_peptide_centric) == int: @@ -658,9 +645,9 @@ def _validate_max_id(self): def _validate_lp_solver(self): """ - Internal ProteinInferenceParameter method to validate the lp solver + Internal ProteinInferenceParameter method to validate the lp solver. """ - # Check if its pulp, glpk, or None + # Check if its pulp or None if self.lp_solver in Inference.LP_SOLVERS: logger.info("Using LP Solver '{}'".format(self.lp_solver)) else: @@ -676,7 +663,7 @@ def _validate_lp_solver(self): def _validate_parsimony_shared_peptides(self): """ - Internal ProteinInferenceParameter method to validate the shared peptides parameter + Internal ProteinInferenceParameter method to validate the shared peptides parameter. 
""" # Check if its all, best, or none if self.shared_peptides in Inference.SHARED_PEPTIDE_TYPES: @@ -694,7 +681,7 @@ def _validate_parsimony_shared_peptides(self): def _validate_identifiers(self): """ Internal ProteinInferenceParameter method to validate the decoy symbol, isoform symbol, - and reviewed identifier symbol + and reviewed identifier symbol. """ if type(self.decoy_symbol) == str: @@ -731,7 +718,7 @@ def _validate_identifiers(self): def _validate_parameter_shape(self, yaml_params): """ Internal ProteinInferenceParameter method to validate shape of the parameter file by checking to make sure - that all necessary main parameter fields are defined + that all necessary main parameter fields are defined. """ if self.PARENT_PARAMETER_KEY in yaml_params.keys(): logger.info("Main Parameter Key is Present") @@ -866,7 +853,7 @@ def override_q_restrict(self, data): ProteinInferenceParameter method to override restrict_q if the input data does not contain q values. Args: - data (pyproteininference.datastore.DataStore): Data Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. """ data_has_q = data.input_has_q() @@ -882,7 +869,7 @@ def override_pep_restrict(self, data): ProteinInferenceParameter method to override restrict_pep if the input data does not contain pep values. Args: - data (pyproteininference.datastore.DataStore): Data Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. """ data_has_pep = data.input_has_pep() @@ -901,7 +888,7 @@ def override_custom_restrict(self, data): the input data does not contain custom score values. Args: - data (pyproteininference.datastore.DataStore): Data Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. 
""" data_has_custom = data.input_has_custom() @@ -917,10 +904,10 @@ def override_custom_restrict(self, data): def fix_parameters_from_datastore(self, data): """ ProteinInferenceParameter method to override restriction values in the - parameter file if those scores do not exist in the input files + parameter file if those scores do not exist in the input files. Args: - data (pyproteininference.datastore.DataStore): Data Object + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. """ @@ -930,39 +917,31 @@ def fix_parameters_from_datastore(self, data): def _fix_none_parameters(self): """ - Internal ProteinInferenceParameter method to fix parameters that have been defined as None - These get read in as strings with YAML reader and need to be converted to None type + Internal ProteinInferenceParameter method to fix parameters that have been defined as None. + These get read in as strings with YAML reader and need to be converted to None type. """ self._fix_grouping_type() - self._fix_glpk_path() self._fix_lp_solver() self._fix_shared_peptides() def _fix_grouping_type(self): """ - Internal ProteinInferenceParameter method to override grouping type for None value + Internal ProteinInferenceParameter method to override grouping type for None value. """ if self.grouping_type in ["None", "none", None]: self.grouping_type = None - def _fix_glpk_path(self): - """ - Internal ProteinInferenceParameter method to override glpk_path for None value - """ - if self.glpk_path in ["None", "none", None]: - self.glpk_path = None - def _fix_lp_solver(self): """ - Internal ProteinInferenceParameter method to override lp_solver for None value + Internal ProteinInferenceParameter method to override lp_solver for None value. 
""" if self.lp_solver in ["None", "none", None]: self.lp_solver = None def _fix_shared_peptides(self): """ - Internal ProteinInferenceParameter method to override shared_peptides for None value + Internal ProteinInferenceParameter method to override shared_peptides for None value. """ if self.shared_peptides in ["None", "none", None]: self.shared_peptides = None diff --git a/pyproteininference/physical.py b/pyproteininference/physical.py index 228824f..eefa87e 100644 --- a/pyproteininference/physical.py +++ b/pyproteininference/physical.py @@ -5,23 +5,23 @@ class Protein(object): """ The following class is a representation of a Protein that stores characteristics/attributes of a protein for the entire analysis. - We use __slots__ to predefine the attributes the Protein Object can have - This is done to speed up runtime of the PI algorithm + We use __slots__ to predefine the attributes the Protein Object can have. + This is done to speed up runtime of the PI algorithm. Attributes: - identifier (str): String identifier for the Protein object + identifier (str): String identifier for the Protein object. score (float): Float that represents the protein score as output from - :py:class:`pyproteininference.scoring.Score` methods - psms (list): List of :py:class:`pyproteininference.physical.Psm` objects - group_identification (set): Set of group Identifiers that the protein belongs to (int) - reviewed (bool): True/False on if the identifier is reviewed - unreviewed (bool): True/False on if the identifier is reviewed - peptides (list): List of non flanking peptide sequences - peptide_scores (list): List of Psm scores associated with the protein - picked (bool): True/False if the protein passes the picker algo. True if passes. 
False if does not pass - num_peptides (int): Number of peptides that map to the given Protein - unique_peptides (list): List of peptide strings that are unique to this protein across the analysis - num_unique_peptides (int): Number of unique peptides + [Score object][pyproteininference.scoring.Score] methods. + psms (list): List of [Psm][pyproteininference.physical.Psm] objects. + group_identification (set): Set of group Identifiers that the protein belongs to (int). + reviewed (bool): True/False on if the identifier is reviewed. + unreviewed (bool): True/False on if the identifier is unreviewed. + peptides (list): List of non flanking peptide sequences. + peptide_scores (list): List of Psm scores associated with the protein. + picked (bool): True/False if the protein passes the picker algo. True if passes. False if does not pass. + num_peptides (int): Number of peptides that map to the given Protein. + unique_peptides (list): List of peptide strings that are unique to this protein across the analysis. + num_unique_peptides (int): Number of unique peptides. raw_peptides (list): List of raw peptides. Includes flanking AA and Mods. """ @@ -44,10 +44,10 @@ class Protein(object): def __init__(self, identifier): """ - Initialization method for Protein object + Initialization method for Protein object. Args: - identifier (str): String identifier for the Protein object + identifier (str): String identifier for the Protein object. Example: >>> protein = pyproteininference.physical.Protein(identifier = "PRKDC_HUMAN|P78527") @@ -69,10 +69,10 @@ def __init__(self, identifier): def get_psm_scores(self): """ - Retrieves psm scores for a given protein + Retrieves psm scores for a given protein. Returns: - list: List of psm scores for the given protein + list: List of psm scores for the given protein. 
""" score_list = [x.main_score for x in self.psms] @@ -80,10 +80,10 @@ def get_psm_scores(self): def get_psm_identifiers(self): """ - Retrieves a list of Psm identifiers + Retrieves a list of Psm identifiers. Returns: - list: List of Psm identifiers + list: List of Psm identifiers. """ psms = [x.identifier for x in self.psms] @@ -91,10 +91,10 @@ def get_psm_identifiers(self): def get_stripped_psm_identifiers(self): """ - Retrieves a list of Psm identifiers that have had mods removed and flanking AAs removed + Retrieves a list of Psm identifiers that have had mods removed and flanking AAs removed. Returns: - list: List of Psm identifiers that have no mods or flanking AAs + list: List of Psm identifiers that have no mods or flanking AAs. """ psms = [x.stripped_peptide for x in self.psms] @@ -102,10 +102,10 @@ def get_stripped_psm_identifiers(self): def get_unique_peptide_identifiers(self): """ - Retrieves the unique set of peptides for a protein + Retrieves the unique set of peptides for a protein. Returns: - set: Set of peptide strings + set: Set of peptide strings. """ unique_peptides = set(self.get_psm_identifiers()) @@ -113,10 +113,10 @@ def get_unique_peptide_identifiers(self): def get_unique_stripped_peptide_identifiers(self): """ - Retrieves the unique set of peptides for a protein that are stripped + Retrieves the unique set of peptides for a protein that are stripped. Returns: - set: Set of peptide strings that are stripped of mods and flanking AAs + set: Set of peptide strings that are stripped of mods and flanking AAs. """ stripped_peptide_identifiers = set(self.get_stripped_psm_identifiers()) @@ -124,10 +124,10 @@ def get_unique_stripped_peptide_identifiers(self): def get_num_psms(self): """ - Retrieves the number of Psms + Retrieves the number of Psms. Returns: - int: Number of Psms + int: Number of Psms. 
""" num_psms = len(self.get_psm_identifiers()) @@ -135,10 +135,10 @@ def get_num_psms(self): def get_num_peptides(self): """ - Retrieves the number of peptides + Retrieves the number of peptides. Returns: - int: Number of peptides + int: Number of peptides. """ num_peptides = len(self.get_unique_peptide_identifiers()) @@ -146,10 +146,10 @@ def get_num_peptides(self): def get_psm_ids(self): """ - Retrieves the Psm Ids + Retrieves the Psm Ids. Returns: - list: List of Psm Ids + list: List of Psm Ids. """ psm_ids = [x.psm_id for x in self.psms] @@ -159,23 +159,23 @@ def get_psm_ids(self): class Psm(object): """ The following class is a physical Psm class that stores characteristics of a psm for the entire analysis. - We use __slots__ to predefine the attributes the Psm Object can have - This is done to speed up runtime of the PI algorithm + We use __slots__ to predefine the attributes the Psm Object can have. + This is done to speed up runtime of the PI algorithm. Attributes: - identifier (str): Peptide Identifier: IE ""K.DLIDEGH#AATQLVNQLHDVVVENNLSDK.Q" - percscore (float): Percolator Score from input file if it exists - qvalue (float): Q value from input file if it exists - pepvalue (float): Pep value from input file if it exists - possible_proteins (list): List of protein strings that the Psm maps to based on the digest + identifier (str): Peptide Identifier: IE "K.DLIDEGH#AATQLVNQLHDVVVENNLSDK.Q". + percscore (float): Percolator Score from input file if it exists. + qvalue (float): Q value from input file if it exists. + pepvalue (float): Pep value from input file if it exists. + possible_proteins (list): List of protein strings that the Psm maps to based on the digest. psm_id (str): String that represents a global identifier for the Psm. Should come from input files. - custom_score (float): Score that comes from a custom column in the input files + custom_score (float): Score that comes from a custom column in the input files. 
main_score (float): The Psm score to be used as the scoring variable for protein scoring. can be - percscore,qvalue,pepvalue, or custom_score + percscore,qvalue,pepvalue, or custom_score. stripped_peptide (str): This is the identifier attribute that has had mods removed and flanking AAs - removed IE: DLIDEGHAATQLVNQLHDVVVENNLSDK + removed IE: DLIDEGHAATQLVNQLHDVVVENNLSDK. non_flanking_peptide (str): This is the identifier attribute that has had flanking AAs - removed IE: DLIDEGH#AATQLVNQLHDVVVENNLSDK. #NOTE Mods are still present here + removed IE: DLIDEGH#AATQLVNQLHDVVVENNLSDK. #NOTE Mods are still present here. """ @@ -204,11 +204,11 @@ class Psm(object): def __init__(self, identifier): """ - Initialization method for the Psm object - This method also initializes the :attr:`stripped_peptide` and :attr:`non_flanking_peptide` attributes + Initialization method for the Psm object. + This method also initializes the `stripped_peptide` and `non_flanking_peptide` attributes. Args: - identifier (str): Peptide Identifier: IE ""K.DLIDEGH#AATQLVNQLHDVVVENNLSDK.Q" + identifier (str): Peptide Identifier: IE "K.DLIDEGH#AATQLVNQLHDVVVENNLSDK.Q". Example: >>> psm = pyproteininference.physical.Psm(identifier = "K.DLIDEGHAATQLVNQLHDVVVENNLSDK.Q") @@ -242,13 +242,13 @@ def __init__(self, identifier): @classmethod def remove_peptide_mods(cls, peptide_string): """ - This class method takes a string and uses a :attr:`MOD_REGEX` to remove mods from peptide strings + This class method takes a string and uses a `MOD_REGEX` to remove mods from peptide strings. Args: - peptide_string (str): Peptide string to have mods removed from + peptide_string (str): Peptide string to have mods removed from. Returns: - str: a peptide string with mods removed + str: a peptide string with mods removed. 
""" stripped_peptide = cls.MOD_REGEX.sub("", peptide_string) @@ -259,15 +259,15 @@ def split_peptide(cls, peptide_string, delimiter="."): """ This class method takes a peptide string with flanking AAs and removes them from the peptide string. This method uses string splitting and if the method produces a faulty peptide the method - :meth:`split_peptide_pro` will be called + [split_peptide_pro][pyproteininference.physical.Psm.split_peptide_pro] will be called. Args: - peptide_string (str): Peptide string to have mods removed from + peptide_string (str): Peptide string to have mods removed from. delimiter (str): a string to indicate what separates a leading/trailing (flanking) AA from the - peptide sequence + peptide sequence. Returns: - str: a peptide string with flanking AAs removed + str: a peptide string with flanking AAs removed. """ peptide_split = peptide_string.split(delimiter) @@ -288,18 +288,18 @@ def split_peptide(cls, peptide_string, delimiter="."): @classmethod def split_peptide_pro(cls, peptide_string, delimiter="."): """ - This class method takes a peptide string with flanking AAs and removes them from the peptide string - This is a specialized method of :meth:`split_peptide` that uses regex identifiers to replace flanking AAs as - opposed to string splitting + This class method takes a peptide string with flanking AAs and removes them from the peptide string. + This is a specialized method of [split_peptide][pyproteininference.physical.Psm.split_peptide] that uses + regex identifiers to replace flanking AAs as opposed to string splitting. Args: - peptide_string (str): Peptide string to have mods removed from + peptide_string (str): Peptide string to have mods removed from. delimiter (str): a string to indicate what separates a leading/trailing (flanking) AA from the peptide - sequence + sequence. Returns: - str: a peptide string with flanking AAs removed + str: a peptide string with flanking AAs removed. 
""" @@ -323,7 +323,7 @@ def assign_main_score(self, score): Args: score (str): This is a string representation of the Psm attribute that will get assigned to the main_score - variable + variable. """ # Assign a main score based on user input @@ -337,14 +337,14 @@ class ProteinGroup(object): """ The following class is a physical Protein Group class that stores characteristics of a Protein Group for the entire analysis. - We use __slots__ to predefine the attributes the Psm Object can have - This is done to speed up runtime of the PI algorithm + We use __slots__ to predefine the attributes the Psm Object can have. + This is done to speed up runtime of the PI algorithm. Attributes: - number_id (int): unique Integer to represent a group - proteins (list): List of :py:class:`pyproteininference.physical.Protein` objects + number_id (int): unique Integer to represent a group. + proteins (list): List of [Protein][pyproteininference.physical.Protein] objects. q_value (float): Q value for the protein group that is calculated with method - :py:meth:`pyproteininference.datastore.DataStore.calculate_q_values` + [calculate_q_values][pyproteininference.datastore.DataStore.calculate_q_values]. """ @@ -352,10 +352,10 @@ class ProteinGroup(object): def __init__(self, number_id): """ - Initialization method for ProteinGroup object + Initialization method for ProteinGroup object. Args: - number_id (int): unique Integer to represent a group + number_id (int): unique Integer to represent a group. Example: >>> pg = pyproteininference.physical.ProteinGroup(number_id = 1) diff --git a/pyproteininference/pipeline.py b/pyproteininference/pipeline.py index adc4b6e..eb13d2c 100644 --- a/pyproteininference/pipeline.py +++ b/pyproteininference/pipeline.py @@ -18,25 +18,24 @@ class ProteinInferencePipeline(object): """ This is the main Protein Inference class which houses the logic of the entire data analysis pipeline. 
- Logic is executed in the :py:meth:`pyproteininference.pipeline.ProteinInferencePipeline.execute` method + Logic is executed in the [execute][pyproteininference.pipeline.ProteinInferencePipeline.execute] method. Attributes: - parameter_file (str): Path to Protein Inference Yaml Parameter File - database_file (str): Path to Fasta database used in proteomics search - target_files (str/list): Path to Target Psm File (Or a list of files) - decoy_files (str/list): Path to Decoy Psm File (Or a list of files) - combined_files (str/list): Path to Combined Psm File (Or a list of files) - target_directory (str): Path to Directory containing Target Psm Files - decoy_directory (str): Path to Directory containing Decoy Psm Files - combined_directory (str): Path to Directory containing Combined Psm Files - output_directory (str): Path to Directory where output will be written - output_filename (str): Path to Filename where output will be written. Will override output_directory - id_splitting (bool): True/False on whether to split protein IDs in the digest. Leave as False unless you - know what you are doing + parameter_file (str): Path to Protein Inference Yaml Parameter File. + database_file (str): Path to Fasta database used in proteomics search. + target_files (str/list): Path to Target Psm File (Or a list of files). + decoy_files (str/list): Path to Decoy Psm File (Or a list of files). + combined_files (str/list): Path to Combined Psm File (Or a list of files). + target_directory (str): Path to Directory containing Target Psm Files. + decoy_directory (str): Path to Directory containing Decoy Psm Files. + combined_directory (str): Path to Directory containing Combined Psm Files. + output_directory (str): Path to Directory where output will be written. + output_filename (str): Path to Filename where output will be written. Will override output_directory. + id_splitting (bool): True/False on whether to split protein IDs in the digest. Advanced usage only. 
append_alt_from_db (bool): True/False on whether to append alternative proteins from the DB digestion in - Reader class - data (pyproteininference.datastore.DataStore): Data Class - digest (pyproteininference.in_silico_digest.Digest): Digest Class + Reader class. + data (DataStore): [DataStore Object][pyproteininference.datastore.DataStore]. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. """ @@ -58,20 +57,19 @@ def __init__( """ Args: - parameter_file (str): Path to Protein Inference Yaml Parameter File - database_file (str): Path to Fasta database used in proteomics search - target_files (str/list): Path to Target Psm File (Or a list of files) - decoy_files (str/list): Path to Decoy Psm File (Or a list of files) - combined_files (str/list): Path to Combined Psm File (Or a list of files) - target_directory (str): Path to Directory containing Target Psm Files - decoy_directory (str): Path to Directory containing Decoy Psm Files - combined_directory (str): Path to Directory containing Combined Psm Files - output_filename (str): Path to Filename where output will be written. Will override output_directory - output_directory (str): Path to Directory where output will be written - id_splitting (bool): True/False on whether to split protein IDs in the digest. Leave as False unless you - know what you are doing + parameter_file (str): Path to Protein Inference Yaml Parameter File. + database_file (str): Path to Fasta database used in proteomics search. + target_files (str/list): Path to Target Psm File (Or a list of files). + decoy_files (str/list): Path to Decoy Psm File (Or a list of files). + combined_files (str/list): Path to Combined Psm File (Or a list of files). + target_directory (str): Path to Directory containing Target Psm Files. + decoy_directory (str): Path to Directory containing Decoy Psm Files. + combined_directory (str): Path to Directory containing Combined Psm Files. 
+ output_filename (str): Path to Filename where output will be written. Will override output_directory. + output_directory (str): Path to Directory where output will be written. + id_splitting (bool): True/False on whether to split protein IDs in the digest. Advanced usage only. append_alt_from_db (bool): True/False on whether to append alternative proteins from the DB digestion in - Reader class + Reader class. Returns: object: @@ -118,23 +116,23 @@ def __init__( def execute(self): """ This method is the main driver of the data analysis for the protein inference package. - This method calls other classes and methods that make up the protein inference pipeline + This method calls other classes and methods that make up the protein inference pipeline. This includes but is not limited to: - This method sets the data :py:class:`pyproteininference.datastore.DataStore` and digest - :py:class:`pyproteininference.in_silico_digest.Digest` objects. - - 1. Parameter file management - 2. Digesting Fasta Database (Optional) - 3. Reading in input Psm Files - 4. Initializing the :py:class:`pyproteininference.datastore.DataStore` object - 5. Restricting Psms - 6. Creating Protein objects/scoring input - 7. Scoring Proteins - 8. Running Protein Picker - 9. Running Inference Methods/Grouping - 10. Calculating Q Values - 11. Exporting Proteins to filesystem + This method sets the data [DataStore Object][pyproteininference.datastore.DataStore] and digest + [Digest Object][pyproteininference.in_silico_digest.Digest]. + + 1. Parameter file management. + 2. Digesting Fasta Database (Optional). + 3. Reading in input Psm Files. + 4. Initializing the [DataStore Object][pyproteininference.datastore.DataStore]. + 5. Restricting Psms. + 6. Creating Protein objects/scoring input. + 7. Scoring Proteins. + 8. Running Protein Picker. + 9. Running Inference Methods/Grouping. + 10. Calculating Q Values. + 11. Exporting Proteins to filesystem. 
Example: >>> pipeline = pyproteininference.pipeline.ProteinInferencePipeline( @@ -261,14 +259,14 @@ def _validate_input(self): One of the following combinations must be selected as input. No more and no less: - 1. either one or multiple target_files and decoy_files, - 2. either one or multiple combined_files that include target and decoy data + 1. either one or multiple target_files and decoy_files. + 2. either one or multiple combined_files that include target and decoy data. 3. a directory that contains target files (target_directory) as well as a directory that contains decoy files - (decoy_directory) - 4. a directory that contains combined target/decoy files (combined_directory) + (decoy_directory). + 4. a directory that contains combined target/decoy files (combined_directory). Raises: - ValueError: ValueError will occur if an improper combination of + ValueError: ValueError will occur if an improper combination of input is supplied. """ if ( self.target_files @@ -322,7 +320,7 @@ def _transform_directory_to_files(self): """ This internal method takes files that are in the target_directory, decoy_directory, or combined_directory and reassigns these files to the target_files, decoy_files, and combined_files to be used in - :py:class:`pyproteininference.reader.Reader` object + [Reader][pyproteininference.reader.Reader] object. """ if self.target_directory and self.decoy_directory: logger.info("Transforming target_directory and decoy_directory into files") @@ -352,7 +350,7 @@ def _transform_directory_to_files(self): def _set_output_directory(self): """ Internal method for setting the output directory. - If the output_directory argument is not supplied the output directory is set as the cwd + If the output_directory argument is not supplied the output directory is set as the cwd.
""" if not self.output_directory: self.output_directory = os.getcwd() @@ -361,7 +359,7 @@ def _set_output_directory(self): def _log_append_alt_from_db(self): """ - Internal method for logging whether the user sets alternative protein append to True or False + Internal method for logging whether the user sets alternative protein append to True or False. """ if self.append_alt_from_db: logger.info("Append Alternative Proteins from Database set to True") @@ -370,7 +368,7 @@ def _log_append_alt_from_db(self): def _log_id_splitting(self): """ - Internal method for logging whether the user sets ID splitting to True or False + Internal method for logging whether the user sets ID splitting to True or False. """ if self.id_splitting: logger.info("ID Splitting for Database Digestion set to True") diff --git a/pyproteininference/reader.py b/pyproteininference/reader.py index dfa6863..87385ff 100644 --- a/pyproteininference/reader.py +++ b/pyproteininference/reader.py @@ -21,13 +21,13 @@ class Reader(object): """ - Main Reader Class which is parent to all reader subclasses + Main Reader Class which is parent to all reader subclasses. Attributes: - target_file (str/list): Path to Target PSM result files - decoy_file (str/list): Path to Decoy PSM result files - combined_files (str/list): Path to Combined PSM result files - directory (str): Path to directory containing combined PSM result files + target_file (str/list): Path to Target PSM result files. + decoy_file (str/list): Path to Decoy PSM result files. + combined_files (str/list): Path to Combined PSM result files. + directory (str): Path to directory containing combined PSM result files. 
""" @@ -37,10 +37,10 @@ def __init__(self, target_file=None, decoy_file=None, combined_files=None, direc """ Args: - target_file (str/list): Path to Target PSM result files - decoy_file (str/list): Path to Decoy PSM result files - combined_files (str/list): Path to Combined PSM result files - directory (str): Path to directory containing combined PSM result files + target_file (str/list): Path to Target PSM result files. + decoy_file (str/list): Path to Decoy PSM result files. + combined_files (str/list): Path to Combined PSM result files. + directory (str): Path to directory containing combined PSM result files. """ self.target_file = target_file @@ -50,22 +50,23 @@ def __init__(self, target_file=None, decoy_file=None, combined_files=None, direc def get_alternative_proteins_from_input(self, row): """ - Method to get the alternative proteins from the input files + Method to get the alternative proteins from the input files. """ if None in row.keys(): try: row["alternative_proteins"] = row.pop(None) + # Sort the alternative proteins - when they are read in they become unsorted + row["alternative_proteins"] = sorted(row["alternative_proteins"]) except KeyError: row["alternative_proteins"] = [] else: - logger.warning("Alternative Proteins not found in the input files. Make sure format is proper.") row["alternative_proteins"] = [] return row def _validate_input(self): """ - Internal method to validate the input to Reader + Internal method to validate the input to Reader. """ if self.target_file and self.decoy_file and not self.combined_files and not self.directory: @@ -93,20 +94,20 @@ def _fix_alternative_proteins( ): """ Internal method to fix the alternative proteins variable for a given - :py:class:`pyproteininference.physical.Psm` object + [Psm][pyproteininference.physical.Psm] object. 
Args: append_alt_from_db (bool): Whether or not to append alternative proteins found in the database that are - not in the input files - identifiers_sorted (list): List of sorted Protein Strings for the given Psm - max_proteins (int): Maximum number of proteins that a :py:class:`pyproteininference.physical.Psm` - is allowed to map to - psm: (pyproteininference.physical.Psm): Psm object of interest - parameter_file_object: (pyproteininference.parameters.ProteinInferenceParameter): - Protein Inference Parameter Object + not in the input files. + identifiers_sorted (list): List of sorted Protein Strings for the given Psm. + max_proteins (int): Maximum number of proteins that a [Psm][pyproteininference.physical.Psm] + is allowed to map to. + psm: (Psm): [Psm][pyproteininference.physical.Psm] object of interest. + parameter_file_object: (ProteinInferenceParameter): + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter]. Returns: - pyproteininference.physical.Psm: Psm with alternative proteins fixed + pyproteininference.physical.Psm: [Psm][pyproteininference.physical.Psm] with alternative proteins fixed. """ # If we are appending alternative proteins from the db @@ -133,7 +134,7 @@ def _fix_alternative_proteins( def _check_initial_database_overlap(self, initial_possible_proteins, initial_protein_peptide_map): """ Internal method that checks to make sure there is at least some overlap between proteins in the input files - And the proteins in the database digestion + And the proteins in the database digestion. """ if len(initial_protein_peptide_map.keys()) > 0: @@ -165,27 +166,28 @@ def _check_initial_database_overlap(self, initial_possible_proteins, initial_pro class PercolatorReader(Reader): """ The following class takes a percolator target file and a percolator decoy file - or combined files/directory and creates standard :py:class:`pyproteininference.physical.Psm` objects. 
- This reader class is used as input for :py:class:`pyproteininference.datastore.DataStore` + or combined files/directory and creates standard [Psm][pyproteininference.physical.Psm] objects. + This reader class is used as input for [DataStore object][pyproteininference.datastore.DataStore]. Percolator Output is formatted as follows: - with each entry being tabbed delimited (Comma separated showed below) - PSMId score q-value posterior_error_prob peptide proteinIds - 116108.15139.15139.6.dta 3.44016 0.000479928 7.60258e-10 K.MVVSMTLGLHPWIANIDDTQYLAAK.R CNDP1_HUMAN|Q96KN2 - B4E180_HUMAN|B4E180 A8K1K1_HUMAN|A8K1K1 J3KRP0_HUMAN|J3KRP0 + with each entry being tab delimited. + + | PSMId | score | q-value | posterior_error_prob | peptide | proteinIds | | | | # noqa E501 W605 + |-------------------------------|----------|-------------|-----------------------|--------------------------------|---------------------|----------------------|----------------------|-------------------------| # noqa E501 W605 + | 116108.15139.15139.6.dta | 3.44016 | 0.000479928 | 7.60258e-10 | K.MVVSMTLGLHPWIANIDDTQYLAAK.R | CNDP1_HUMAN\|Q96KN2 | B4E180_HUMAN\|B4E180 | A8K1K1_HUMAN\|A8K1K1 | J3KRP0_HUMAN\|J3KRP0 | # noqa E501 W605 Attributes: - target_file (str/list): Path to Target PSM result files - decoy_file (str/list): Path to Decoy PSM result files - combined_files (str/list): Path to Combined PSM result files - directory (str): Path to directory containing combined PSM result files - psmid_index (int): Index of the PSMId from the input files - perc_score_index (int): Index of the Percolator score from the input files - q_value_index (int): Index of the q-value from the input files - posterior_error_prob_index (int): Index of the posterior error probability from the input files - peptide_index (int): Index of the peptides from the input files - proteinIDs_index (int): Index of the proteins from the input files - psms (list): List of :py:class:`pyproteininference.physical.Psm` objects + 
target_file (str/list): Path to Target PSM result files. + decoy_file (str/list): Path to Decoy PSM result files. + combined_files (str/list): Path to Combined PSM result files. + directory (str): Path to directory containing combined PSM result files. + PSMID_INDEX (int): Index of the PSMId from the input files. + PERC_SCORE_INDEX (int): Index of the Percolator score from the input files. + Q_VALUE_INDEX (int): Index of the q-value from the input files. + POSTERIOR_ERROR_PROB_INDEX (int): Index of the posterior error probability from the input files. + PEPTIDE_INDEX (int): Index of the peptides from the input files. + PROTEINIDS_INDEX (int): Index of the proteins from the input files. + psms (list): List of [Psm][pyproteininference.physical.Psm] objects. """ @@ -209,17 +211,18 @@ def __init__( """ Args: - digest (pyproteininference.in_silico_digest.Digest): - parameter_file_object (pyproteininference.parameters.ProteinInferenceParameter): + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + parameter_file_object (ProteinInferenceParameter): + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter]. append_alt_from_db (bool): Whether or not to append alternative proteins found in the database that - are not in the input files - target_file (str/list): Path to Target PSM result files - decoy_file (str/list): Path to Decoy PSM result files - combined_files (str/list): Path to Combined PSM result files - directory (str): Path to directory containing combined PSM result files + are not in the input files. + target_file (str/list): Path to Target PSM result files. + decoy_file (str/list): Path to Decoy PSM result files. + combined_files (str/list): Path to Combined PSM result files. + directory (str): Path to directory containing combined PSM result files. Returns: - object: + Reader: [Reader][pyproteininference.reader.Reader] object. 
Example: >>> pyproteininference.reader.PercolatorReader(target_file = "example_target.txt", @@ -244,11 +247,11 @@ def __init__( def read_psms(self): """ Method to read psms from the input files and to transform them into a list of - :py:class:`pyproteininference.physical.Psm` objects + [Psm][pyproteininference.physical.Psm] objects. - This method sets the :attr:`psms` variable. Which is a list of Psm objets + This method sets the `psms` variable. Which is a list of Psm objects. - This method must be ran before initializing :py:class:`pyproteininference.datastore.DataStore` + This method must be run before initializing [DataStore object][pyproteininference.datastore.DataStore]. Example: >>> reader = pyproteininference.reader.PercolatorReader(target_file = "example_target.txt", @@ -380,14 +383,8 @@ def read_psms(self): if self.parameter_file_object.inference_type == Inference.FIRST_PROTEIN: poss_proteins = [psm_info[self.PROTEINIDS_INDEX]] else: - poss_proteins = list( - set( - psm_info[ - self.PROTEINIDS_INDEX : self.PROTEINIDS_INDEX # noqa E203 - + self.MAX_ALLOWED_ALTERNATIVE_PROTEINS - ] - ) - ) + poss_proteins = sorted(list(set(psm_info[self.PROTEINIDS_INDEX :]))) # noqa E203 + poss_proteins = poss_proteins[: self.MAX_ALLOWED_ALTERNATIVE_PROTEINS] combined_psm_result_rows.possible_proteins = poss_proteins # Restrict to 50 total possible proteins... combined_psm_result_rows.psm_id = psm_info[self.PSMID_INDEX] input_poss_prots = copy.copy(poss_proteins) @@ -403,8 +400,8 @@ def read_psms(self): # Add the other possible_proteins from insilicodigest here... try: - current_alt_proteins = list( - peptide_to_protein_dictionary[current_peptide] + current_alt_proteins = sorted( + list(peptide_to_protein_dictionary[current_peptide]) ) # This peptide needs to be scrubbed of Mods...
except KeyError: current_alt_proteins = [] @@ -460,16 +457,17 @@ def read_psms(self): class ProteologicPostSearchReader(Reader): """ - This class is used to read from post processing proteologic logical object + This class is used to read from post processing proteologic logical object. Attributes: - proteologic_object (list): List of proteologic post search objects - search_id (int): Search ID or Search IDs associated with the data - postsearch_id: PostSearch ID or PostSearch IDs associated with the data - digest (pyproteininference.in_silico_digest.Digest): - parameter_file_object (pyproteininference.parameters.ProteinInferenceParameter): + proteologic_object (list): List of proteologic post search objects. + search_id (int): Search ID or Search IDs associated with the data. + postsearch_id (int): PostSearch ID or PostSearch IDs associated with the data. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + parameter_file_object (ProteinInferenceParameter): + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] object. append_alt_from_db (bool): Whether or not to append alternative proteins found in the database - that are not in the input files + that are not in the input files. """ @@ -485,13 +483,14 @@ def __init__( """ Args: - proteologic_object (list): List of proteologic post search objects - search_id (int): Search ID or Search IDs associated with the data - postsearch_id: PostSearch ID or PostSearch IDs associated with the data - digest (pyproteininference.in_silico_digest.Digest): - parameter_file_object (pyproteininference.parameters.ProteinInferenceParameter): + proteologic_object (list): List of proteologic post search objects. + search_id (int): Search ID or Search IDs associated with the data. + postsearch_id: PostSearch ID or PostSearch IDs associated with the data. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. 
+ parameter_file_object (ProteinInferenceParameter): + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] object. append_alt_from_db (bool): Whether or not to append alternative proteins found in the database - that are not in the input files + that are not in the input files. Returns: @@ -511,11 +510,11 @@ def read_psms(self): """ Method to read psms from the input files and to transform them into a list of - :py:class:`pyproteininference.physical.Psm` objects + [Psm][pyproteininference.physical.Psm] objects. - This method sets the :attr:`psms` variable. Which is a list of Psm objets + This method sets the `psms` variable. Which is a list of Psm objects. - This method must be ran before initializing :py:class:`pyproteininference.datastore.DataStore` + This method must be run before initializing [DataStore object][pyproteininference.datastore.DataStore]. """ logger.info("Reading in data from Proteologic...") @@ -569,8 +568,8 @@ def read_psms(self): # Add the other possible_proteins from insilicodigest here... try: - current_alt_proteins = list( - peptide_to_protein_dictionary[current_peptide] + current_alt_proteins = sorted( + list(peptide_to_protein_dictionary[current_peptide]) ) # This peptide needs to be scrubbed of Mods... except KeyError: current_alt_proteins = [] @@ -619,28 +618,30 @@ class GenericReader(Reader): """ The following class takes a percolator like target file and a percolator like decoy file - and creates standard :py:class:`pyproteininference.physical.Psm` objects. + and creates standard [Psm][pyproteininference.physical.Psm] objects.
Percolator Like Output is formatted as follows: - with each entry being tabbed delimited (Comma separated showed below) - PSMId score q-value posterior_error_prob peptide proteinIds - 116108.15139.15139.6.dta 3.44016 0.000479928 7.60258e-10 K.MVVSMTLGLHPWIANIDDTQYLAAK.R CNDP1_HUMAN|Q96KN2 - B4E180_HUMAN|B4E180 A8K1K1_HUMAN|A8K1K1 J3KRP0_HUMAN|J3KRP0 + with each entry being tab delimited. + + | PSMId | score | q-value | posterior_error_prob | peptide | proteinIds | | | | # noqa E501 W605 + |-------------------------------|----------|-------------|-----------------------|--------------------------------|---------------------|----------------------|----------------------|-------------------------| # noqa E501 W605 + | 116108.15139.15139.6.dta | 3.44016 | 0.000479928 | 7.60258e-10 | K.MVVSMTLGLHPWIANIDDTQYLAAK.R | CNDP1_HUMAN\|Q96KN2 | B4E180_HUMAN\|B4E180 | A8K1K1_HUMAN\|A8K1K1 | J3KRP0_HUMAN\|J3KRP0 | # noqa E501 W605 - Custom columns can be added and used as scoring input. Please see README.md for more information + Custom columns can be added and used as scoring input. Please see package documentation for more information. Attributes: - target_file (str/list): Path to Target PSM result files - decoy_file (str/list): Path to Decoy PSM result files - combined_files (str/list): Path to Combined PSM result files - directory (str): Path to directory containing combined PSM result files - psms (list): List of :py:class:`pyproteininference.physical.Psm` objects - load_custom_score (bool): True/False on whether or not to load a custom score. Depends on scoring_variable - scoring_variable (str): String to indicate which column in the input file is to be used as the scoring input - digest (pyproteininference.in_silico_digest.Digest): - parameter_file_object (pyproteininference.parameters.ProteinInferenceParameter): + target_file (str/list): Path to Target PSM result files. + decoy_file (str/list): Path to Decoy PSM result files. 
+ combined_files (str/list): Path to Combined PSM result files. + directory (str): Path to directory containing combined PSM result files. + psms (list): List of [Psm][pyproteininference.physical.Psm] objects. + load_custom_score (bool): True/False on whether or not to load a custom score. Depends on scoring_variable. + scoring_variable (str): String to indicate which column in the input file is to be used as the scoring input. + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + parameter_file_object (ProteinInferenceParameter): + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] object append_alt_from_db (bool): Whether or not to append alternative proteins found in the database that - are not in the input files + are not in the input files. @@ -667,17 +668,18 @@ def __init__( """ Args: - digest (pyproteininference.in_silico_digest.Digest): - parameter_file_object (pyproteininference.parameters.ProteinInferenceParameter): + digest (Digest): [Digest Object][pyproteininference.in_silico_digest.Digest]. + parameter_file_object (ProteinInferenceParameter): + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] object. append_alt_from_db (bool): Whether or not to append alternative proteins found in the database that - are not in the input files - target_file (str/list): Path to Target PSM result files - decoy_file (str/list): Path to Decoy PSM result files - combined_files (str/list): Path to Combined PSM result files - directory (str): Path to directory containing combined PSM result files + are not in the input files. + target_file (str/list): Path to Target PSM result files. + decoy_file (str/list): Path to Decoy PSM result files. + combined_files (str/list): Path to Combined PSM result files. + directory (str): Path to directory containing combined PSM result files. Returns: - object: + Reader: [Reader][pyproteininference.reader.Reader] object. 
Example: >>> pyproteininference.reader.GenericReader(target_file = "example_target.txt", @@ -724,11 +726,11 @@ def __init__( def read_psms(self): """ Method to read psms from the input files and to transform them into a list of - :py:class:`pyproteininference.physical.Psm` objects + [Psm][pyproteininference.physical.Psm] objects. - This method sets the :attr:`psms` variable. Which is a list of Psm objets + This method sets the `psms` variable. Which is a list of Psm objects. - This method must be ran before initializing :py:class:`pyproteininference.datastore.DataStore` + This method must be run before initializing [DataStore object][pyproteininference.datastore.DataStore]. Example: >>> reader = pyproteininference.reader.GenericReader(target_file = "example_target.txt", @@ -873,7 +875,19 @@ def read_psms(self): peptide_to_protein_dictionary = self.digest.peptide_to_protein_dictionary initial_poss_prots = [] - logger.info("Length of PSM Data: {}".format(len(all_psms))) + logger.info("Number of PSMs in the input data: {}".format(len(all_psms))) + psms_with_alternative_proteins = self._find_psms_with_alternative_proteins(raw_psms=all_psms) + logger.info( + "Number of PSMs that have alternative proteins in the input data {}".format( + len(psms_with_alternative_proteins) + ) + ) + if len(psms_with_alternative_proteins) == 0: + logger.warning( + "No PSMs in the input have alternative proteins. " + "Make sure your input is properly formatted. " + "Alternative Proteins will be retrieved from the fasta database" + ) for psm_info in all_psms: current_peptide = psm_info[self.PEPTIDE] # Define the Psm...
@@ -902,7 +916,7 @@ def read_psms(self): psm.possible_proteins = psm.possible_proteins + [x for x in psm_info[self.ALTERNATIVE_PROTEINS] if x] # Remove potential Repeats if self.parameter_file_object.inference_type != Inference.FIRST_PROTEIN: - psm.possible_proteins = list(set(psm.possible_proteins)) + psm.possible_proteins = sorted(list(set(psm.possible_proteins))) input_poss_prots = copy.copy(psm.possible_proteins) @@ -919,7 +933,7 @@ def read_psms(self): current_peptide = stripped_peptide # Add the other possible_proteins from insilicodigest here... try: - current_alt_proteins = list(peptide_to_protein_dictionary[current_peptide]) + current_alt_proteins = sorted(list(peptide_to_protein_dictionary[current_peptide])) except KeyError: current_alt_proteins = [] logger.debug( @@ -964,3 +978,9 @@ def read_psms(self): logger.info("Length of PSM Data: {}".format(len(self.psms))) logger.info("Finished GenericReader.read_psms...") + + def _find_psms_with_alternative_proteins(self, raw_psms): + + psms_with_alternative_proteins = [x for x in raw_psms if x["alternative_proteins"]] + + return psms_with_alternative_proteins diff --git a/pyproteininference/scoring.py b/pyproteininference/scoring.py index cfea83a..ab6f7ec 100644 --- a/pyproteininference/scoring.py +++ b/pyproteininference/scoring.py @@ -18,27 +18,28 @@ class Score(object): """ Score class that contains methods to do a variety of scoring methods on the - :py:class:`pyproteininference.physical.Psm` objects - contained inside of :py:class:`pyproteininference.physical.Protein` objects + [Psm][pyproteininference.physical.Psm] objects + contained inside of [Protein][pyproteininference.physical.Protein] objects. Methods in the class loop over each Protein object and creates a protein "score" variable using the Psm object scores. - Methods score all proteins from :attr:`scoring_input` from :py:class:`pyproteininference.datastore.DataStore`. 
+ Methods score all proteins from `scoring_input` from [DataStore object][pyproteininference.datastore.DataStore]. The PSM score that is used is determined from - :py:meth:`pyproteininference.datastore.DataStore.create_scoring_input` + [create_scoring_input][pyproteininference.datastore.DataStore.create_scoring_input]. - Each scoring method will set the following attributes for :py:class:`pyproteininference.datastore.DataStore` + Each scoring method will set the following attributes for + the [DataStore object][pyproteininference.datastore.DataStore]. - 1. attr:`score_method`; This is the full name of the score method - 2. attr:`short_score_method`; This is the short name of the score method - 3. attr:`scored_proteins`; This is a list of :py:class:`pyproteininference.physical.Protein` objects - that have been scored + 1. `score_method`; This is the full name of the score method. + 2. `short_score_method`; This is the short name of the score method. + 3. `scored_proteins`; This is a list of [Protein][pyproteininference.physical.Protein] objects + that have been scored. Attributes: - pre_score_data (list): This is a list of :py:class:`pyproteininference.physical.Protein` objects that - contain :py:class:`pyproteininference.physical.Psm` objects - data (pyproteininference.datastore.DataStore): Data Object + pre_score_data (list): This is a list of [Protein][pyproteininference.physical.Protein] objects + that contain [Psm][pyproteininference.physical.Psm] objects. + data (DataStore): [DataStore][pyproteininference.datastore.DataStore] object. """ @@ -89,14 +90,14 @@ class Score(object): def __init__(self, data): """ - Initialization method for the Score class + Initialization method for the Score class. Args: - data (pyproteininference.datastore.DataStore): Data class object + data (DataStore): [DataStore][pyproteininference.datastore.DataStore] object. 
Raises: - ValueError: If the variable :attr:`scoring_input` for :py:class:`pyproteininference.datastore.DataStore` - is Empty "[]" or does not exist "None" + ValueError: If the variable `scoring_input` for the [DataStore][pyproteininference.datastore.DataStore] + object is Empty "[]" or does not exist "None". Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -112,14 +113,14 @@ def __init__(self, data): def score_psms(self, score_method="multiplicative_log"): """ - This method dispatches to the actual scoring method given a string input that is defined in - :py:class:`pyproteininference.parameters.ProteinInferenceParameter` + This method dispatches to the actual scoring method given a string input that is defined in the + [ProteinInferenceParameter][pyproteininference.parameters.ProteinInferenceParameter] object. Args: score_method (str): This is a string that represents which scoring method to call. Raises: - ValueError: Will Error out if the score_method is not present in the constant :attr:`SCORE_METHODS` + ValueError: Will Error out if the score_method is not present in the constant `SCORE_METHODS`. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -153,7 +154,7 @@ def score_psms(self, score_method="multiplicative_log"): def best_peptide_per_protein(self): """ This method uses a best peptide per protein scoring scheme. - The top scoring Psm for each protein is selected as the overall Protein object score + The top scoring Psm for each protein is selected as the overall Protein object score. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -180,7 +181,7 @@ def best_peptide_per_protein(self): def fishers_method(self): """ - This method uses a fishers method scoring scheme + This method uses a fishers method scoring scheme. \ Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -207,7 +208,7 @@ def multiplicative_log(self): """ This method uses a Multiplicative Log scoring scheme. 
The selected Psm score from all the peptides per protein are multiplied together and we take -Log(X) - of the multiplied Peptide scores + of the multiplied Peptide scores. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -218,7 +219,6 @@ def multiplicative_log(self): all_scores = [] logger.info("Scoring Proteins with Multiplicative Log Method") - logger.info("Using Generators") for protein in self.pre_score_data: # We create a generator of val_list... val_list = protein.get_psm_scores() @@ -243,7 +243,7 @@ def down_weighted_multiplicative_log(self): This method uses a Multiplicative Log scoring scheme. The selected PSM score from all the peptides per protein are multiplied together and then this number is divided by the set PSM scores mean raised to the number of peptides for that protein - then we take -Log(X) of the following value + then we take -Log(X) of the following value. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -282,7 +282,7 @@ def top_two_combied(self): """ This method uses a Top Two scoring scheme. The top two scores for each protein are multiplied together and we take -Log(X) of the multiplied value. - If a protein only has 1 score/peptide, then we only do -Log(X) of the 1 peptide score + If a protein only has 1 score/peptide, then we only do -Log(X) of the 1 peptide score. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -317,10 +317,10 @@ def down_weighted_v2(self): This method uses a Downweighted Multiplicative Log scoring scheme. Each peptide is iteratively downweighted by raising the peptide QValue or PepValue to the following power (1/(1+index_number)). - Where index_number is the peptide number per protein... - Each score for a protein provides less and less weight iteratively + Where index_number is the peptide number per protein. + Each score for a protein provides less and less weight iteratively. 
- We also take -Log(X) of the final score here + We also take -Log(X) of the final score here. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -353,11 +353,11 @@ def iterative_down_weighted_log(self): """ This method uses a Downweighted Multiplicative Log scoring scheme. Each peptide is iteratively downweighted by multiplying the peptide QValue or PepValue to - the following (1+index_number). - Where index_number is the peptide number per protein... - Each score for a protein provides less and less weight iteratively + the following (1+index_number). + Where index_number is the peptide number per protein. + Each score for a protein provides less and less weight iteratively. - We also take -Log(X) of the final score here + We also take -Log(X) of the final score here. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -394,7 +394,7 @@ def geometric_mean_log(self): """ This method uses a Geometric Mean scoring scheme. - We also take -Log(X) of the final score here + We also take -Log(X) of the final score here. Examples: >>> score = pyproteininference.scoring.Score(data=data) @@ -427,7 +427,7 @@ def geometric_mean_log(self): def iterative_down_weighted_v2(self): """ The following method is an experimental method essentially used for future development of potential scoring - schemes + schemes. 
""" all_scores = [] diff --git a/requirements.txt b/requirements.txt index 5f2e968..be4512e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy>=1.19.2 -pyteomics>=3.5.1 -pulp>=2.4 -PyYAML>=5.3 -matplotlib>=3.3.4 \ No newline at end of file +numpy>=1.19.2,<2.0.0 +pyteomics>=3.5.1,<4.0.0 +pulp>=2.6,<3.0.0 +PyYAML>=5.3,<6.0.0 +matplotlib>=3.3.4,<4.0.0 \ No newline at end of file diff --git a/scripts/protein_inference_cli.py b/scripts/protein_inference_cli.py index d348e1a..0d78f67 100644 --- a/scripts/protein_inference_cli.py +++ b/scripts/protein_inference_cli.py @@ -1,3 +1,5 @@ +#!/usr/bin/python + import argparse import pyproteininference @@ -5,7 +7,7 @@ def main(): """ - Script function for running the execute method of the ProteinInferencePipeline class + Script function for running the execute method of the ProteinInferencePipeline class. """ parser = argparse.ArgumentParser(description="Protein Inference") diff --git a/scripts/protein_inference_heuristic_cli.py b/scripts/protein_inference_heuristic_cli.py index cd3130e..bae10cb 100644 --- a/scripts/protein_inference_heuristic_cli.py +++ b/scripts/protein_inference_heuristic_cli.py @@ -1,3 +1,5 @@ +#!/usr/bin/python + import argparse import pyproteininference @@ -5,7 +7,7 @@ def main(): """ - Script function for running the execute method of the ProteinInference HeuristicPipeline class + Script function for running the execute method of the ProteinInference HeuristicPipeline class. """ parser = argparse.ArgumentParser(description="Protein Inference Heuristic") @@ -125,23 +127,23 @@ def main(): ) parser.add_argument( "-r", - "--roc_plot_filepath", - dest="roc_plot_filepath", + "--pdf_filename", + dest="pdf_filename", required=False, - help="PDF Filepath to write the ROC plot to after Heuristic Scoring. " - "If not set, writes the file with filename roc_plot.pdf to directory set in -o. " + help="PDF Filepath to write the Heuristic plot to after Heuristic Scoring. 
" + "If not set, writes the file with filename heuristic_plot.pdf to directory set in -o. " "If -o is not set, will write the file to current working directory.", metavar="FILE", ) parser.add_argument( "-m", - "--fdr_max", - dest="fdr_max", + "--fdr_threshold", + dest="fdr_threshold", required=False, - help="The maximum FDR to display in the ROC plot. Defaults to 0.1 if not set.", + help="The FDR threshold to use in the Heuristic Method. Defaults to 0.05 if not set.", metavar="FLOAT", type=float, - default=0.1, + default=0.05, ) parser.add_argument( "-u", @@ -172,11 +174,10 @@ def main(): output_filename=args.output_filename, append_alt_from_db=not args.skip_append_alt, # Need to reverse the Boolean here id_splitting=args.id_splitting, - roc_plot_filepath=args.roc_plot_filepath, - fdr_max=args.fdr_max, + pdf_filename=args.pdf_filename, output_type=args.output_type, ) - pipeline.execute() + pipeline.execute(fdr_threshold=args.fdr_threshold) if __name__ == "__main__": diff --git a/setup.cfg b/setup.cfg index 744506a..4c1a568 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,45 +1,24 @@ [metadata] name = pyproteininference author = Trent Hinkle -author-email = hinklet@gene.com -summary = Python Package for running custom Protein Inference on tab-formatted tandem MS/MS search results -description-file = README.md -home-page = http://gene.com -version = 0.9.3 +author_email = hinklet@gene.com +summary = Python Package for running custom protein inference algorithms on tab-formatted tandem MS/MS search results. 
+description_file = README.md +home_page = https://github.com/thinkle12/pyproteininference +version = 1.0.0 license = Apache-2 -#classifier = -# Development Status :: 4 - Beta -# Environment :: Console -# Environment :: OpenStack -# Intended Audience :: Developers -# Intended Audience :: Information Technology -# License :: OSI Approved :: Apache Software License -# Operating System :: OS Independent -# Programming Language :: Python keywords = - bluecopper + proteininference [files] packages = pyproteininference -#data_files = -# etc/bluecopper/conf.d = peptide_glossary.ini.dist -[entry_points] -#paste.app_factory = -# main = peptide_glossary:main -#bluecopper_module = -# {cookiecutter.app_name}} = peptide_glossary.bluecopper_module:module_factory [pbr] -#autodoc_tree_index_modules = True -#autodoc_index_modules = True +autodoc_tree_index_modules = True +autodoc_index_modules = True [devpi:upload] no-vcs = 1 with-docs = 1 -#[build_sphinx] -#all_files = 1 -#build-dir = doc/build -#source-dir = doc/source - [flake8] max-line-length = 120 exclude = .git,.tox,.venv,doc,migrations \ No newline at end of file diff --git a/setup.py b/setup.py index 991e1d4..950ba7b 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,32 @@ from setuptools import setup import glob +with open("README.md", "r") as fh: + long_description = fh.read() + setup( setup_requires=["pbr>=1.8", "setuptools>=17.1"], pbr=True, scripts=glob.glob("scripts/*.py"), name="pyproteininference", - version="0.9.3", - url="", - license="", - author="hinklet", + version="1.0.0", + url="https://github.com/thinkle12/pyproteininference", + license="Apache-2", + author="Trent Hinkle", author_email="hinklet@gene.com", - description="Python Package for running custom Protein Inference on tab-formatted tandem MS/MS search results", + description="Python Package for running protein inference algorithms on tab-formatted tandem MS/MS search results.", + keywords=['protein inference', 'proteomics', 'mass spectrometry'], + 
classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + ], + long_description=long_description, + long_description_content_type='text/markdown', ) diff --git a/tests/data/test_data_many_alternative_proteins.txt b/tests/data/test_data_many_alternative_proteins.txt new file mode 100644 index 0000000..64ad6ef --- /dev/null +++ b/tests/data/test_data_many_alternative_proteins.txt @@ -0,0 +1,3 @@ +PSMId score q-value posterior_error_prob peptide proteinIds +1 7 0.00032 3.50E-06 R.CQTCGYKFHEHCSTK.V Protein1 Protein2 Protein3 Protein4 Protein5 Protein6 Protein7 Protein8 Protein9 Protein10 Protein11 Protein12 Protein13 Protein14 Protein15 Protein16 Protein17 Protein18 Protein19 Protein20 Protein21 Protein22 Protein23 Protein24 Protein25 Protein26 Protein27 Protein28 Protein29 Protein30 Protein31 Protein32 Protein33 Protein34 Protein35 Protein36 Protein37 Protein38 Protein39 Protein40 Protein41 Protein42 Protein43 Protein44 Protein45 Protein46 Protein47 Protein48 Protein54 Protein50 Protein51 Protein52 Protein53 Protein49 Protein55 Protein56 Protein57 Protein58 Protein59 Protein60 Protein61 Protein62 Protein63 Protein64 Protein65 +27 0.93 0.03 9.00E-02 R.MEPTPVPFCGAK.L ##Protein65 ##Protein2 ##Protein3 ##Protein4 ##Protein5 ##Protein6 ##Protein7 ##Protein8 ##Protein9 ##Protein10 ##Protein11 ##Protein12 ##Protein13 ##Protein14 ##Protein15 ##Protein16 ##Protein17 ##Protein18 ##Protein19 ##Protein20 ##Protein21 ##Protein22 ##Protein23 ##Protein24 ##Protein25 ##Protein26 ##Protein27 ##Protein28 ##Protein29 ##Protein30 ##Protein31 ##Protein32 ##Protein33 ##Protein34 ##Protein35 ##Protein36 ##Protein37 ##Protein38 ##Protein39 
##Protein40 ##Protein41 ##Protein42 ##Protein43 ##Protein44 ##Protein45 ##Protein46 ##Protein47 ##Protein48 ##Protein49 ##Protein50 ##Protein51 ##Protein52 ##Protein53 ##Protein54 ##Protein55 ##Protein56 ##Protein57 ##Protein58 ##Protein59 ##Protein60 ##Protein61 ##Protein62 ##Protein63 ##Protein64 ##Protein1 \ No newline at end of file diff --git a/tests/data/test_params_additive_custom_score.yaml b/tests/data/test_params_additive_custom_score.yaml index be14264..811e6fa 100644 --- a/tests/data/test_params_additive_custom_score.yaml +++ b/tests/data/test_params_additive_custom_score.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: None - glpk_path: None shared_peptides: None peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_exclusion.yaml b/tests/data/test_params_exclusion.yaml index 6101bc4..215688f 100644 --- a/tests/data/test_params_exclusion.yaml +++ b/tests/data/test_params_exclusion.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: None - glpk_path: None shared_peptides: None peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_parsimony_glpk.yaml b/tests/data/test_params_heuristic.yaml similarity index 90% rename from tests/data/test_params_parsimony_glpk.yaml rename to tests/data/test_params_heuristic.yaml index 11b92e8..1911695 100644 --- a/tests/data/test_params_parsimony_glpk.yaml +++ b/tests/data/test_params_heuristic.yaml @@ -3,7 +3,7 @@ parameters: export: q_value fdr: 0.01 picker: True - tag: test_parsimony + tag: test_heuristic data_restriction: pep_restriction: .9 peptide_length_restriction: 7 @@ -24,8 +24,7 @@ parameters: digest_type: trypsin missed_cleavages: 3 parsimony: - lp_solver: glpk - glpk_path: glpsol + lp_solver: pulp shared_peptides: all peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_inclusion.yaml 
b/tests/data/test_params_inclusion.yaml index 4d5751c..c8e2a24 100644 --- a/tests/data/test_params_inclusion.yaml +++ b/tests/data/test_params_inclusion.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: None - glpk_path: glpsol shared_peptides: None peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_multiplicative_custom_score.yaml b/tests/data/test_params_multiplicative_custom_score.yaml index 666503e..12d0f8c 100644 --- a/tests/data/test_params_multiplicative_custom_score.yaml +++ b/tests/data/test_params_multiplicative_custom_score.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: None - glpk_path: None shared_peptides: None peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_no_inference.yaml b/tests/data/test_params_no_inference.yaml index 453bc5f..5183aca 100644 --- a/tests/data/test_params_no_inference.yaml +++ b/tests/data/test_params_no_inference.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: None - glpk_path: None shared_peptides: None peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_parsimony_pulp.yaml b/tests/data/test_params_parsimony_pulp.yaml index 3849092..1286b0c 100644 --- a/tests/data/test_params_parsimony_pulp.yaml +++ b/tests/data/test_params_parsimony_pulp.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: pulp - glpk_path: None shared_peptides: all peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_parsimony_pulp_none_types.yaml b/tests/data/test_params_parsimony_pulp_none_types.yaml index 15551e3..ece38db 100644 --- a/tests/data/test_params_parsimony_pulp_none_types.yaml +++ b/tests/data/test_params_parsimony_pulp_none_types.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: pulp - glpk_path: None shared_peptides: 
all peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_params_peptide_centric.yaml b/tests/data/test_params_peptide_centric.yaml index ca7ad4c..44ffad4 100644 --- a/tests/data/test_params_peptide_centric.yaml +++ b/tests/data/test_params_peptide_centric.yaml @@ -25,7 +25,6 @@ parameters: missed_cleavages: 3 parsimony: lp_solver: None - glpk_path: None shared_peptides: None peptide_centric: max_identifiers: 5 \ No newline at end of file diff --git a/tests/data/test_perc_data_target_no_alt_prot.txt b/tests/data/test_perc_data_target_no_alt_prot.txt index 00efb06..443e4bc 100644 --- a/tests/data/test_perc_data_target_no_alt_prot.txt +++ b/tests/data/test_perc_data_target_no_alt_prot.txt @@ -11,16 +11,16 @@ PSMId score q-value posterior_error_prob peptide proteinIds 10 5 0.001 1.56E-04 R.YCWMSTGLYIPGR.Q TCAF1_HUMAN|Q9Y4C2 11 4 0.002 2.23E-04 K.AEGGGGGGRPGAPAAGDGK.T HNRPU_HUMAN|Q00839 12 3 0.003 1.00E-03 R.LQAALDDEEAGGRPAMEPGNGSLDLGGDSAGR.S HNRPU_HUMAN|Q00839 -13 3 0.003 1.00E-03 K.CGVEVTQTK.V RPOC_SHIF8|Q0SY12 -14 3 0.003 1.00E-03 K.IALASPDMIR.S RPOC_SHIF8|Q0SY12 -15 3 0.003 1.00E-03 K.MGAEAIQALLK.S RPOC_SHIF8|Q0SY12 -16 3 0.003 1.00E-03 K.RVDYSGR.S RPOC_SHIF8|Q0SY12 -17 3 0.003 1.00E-03 K.VIDIWAAANDR.V RPOC_SHIF8|Q0SY12 -18 3 0.003 1.00E-03 R.EGLNVLQY#FISTHGAR.K RPOC_SHIF8|Q0SY12 -19 3 0.003 1.00E-03 R.FATSDLNDLYR.R RPOC_SHIF8|Q0SY12 -20 3 0.003 1.00E-03 R.IPQESGGTK.D RPOC_SHIF8|Q0SY12 -21 3 0.003 1.00E-03 R.LIPAGTGYAYHQDR.M RPOC_SHIF8|Q0SY12 -22 3 0.003 1.00E-03 R.NTLLHEQWCDLLEENSVDAVK.V RPOC_SHIF8|Q0SY12 -23 3 0.003 1.00E-03 R.VADLFEAR.R RPOC_SHIF8|Q0SY12 -24 3 0.003 1.00E-03 R.VTAEDVLKPGTADILVPR.N RPOC_SHIF8|Q0SY12 -25 0.1 0.8 9.50E-01 R.TVVNVR.N RAF1_HUMAN|P04049 RAF1_HUMAN|P04049 \ No newline at end of file +13 3 0.003 1.00E-03 K.CGVEVTQTK.V RPOC_SHIF8|Q0SY12 +14 3 0.003 1.00E-03 K.IALASPDMIR.S RPOC_SHIF8|Q0SY12 +15 3 0.003 1.00E-03 K.MGAEAIQALLK.S RPOC_SHIF8|Q0SY12 +16 3 0.003 1.00E-03 K.RVDYSGR.S RPOC_SHIF8|Q0SY12 +17 3 
0.003 1.00E-03 K.VIDIWAAANDR.V RPOC_SHIF8|Q0SY12 +18 3 0.003 1.00E-03 R.EGLNVLQY#FISTHGAR.K RPOC_SHIF8|Q0SY12 +19 3 0.003 1.00E-03 R.FATSDLNDLYR.R RPOC_SHIF8|Q0SY12 +20 3 0.003 1.00E-03 R.IPQESGGTK.D RPOC_SHIF8|Q0SY12 +21 3 0.003 1.00E-03 R.LIPAGTGYAYHQDR.M RPOC_SHIF8|Q0SY12 +22 3 0.003 1.00E-03 R.NTLLHEQWCDLLEENSVDAVK.V RPOC_SHIF8|Q0SY12 +23 3 0.003 1.00E-03 R.VADLFEAR.R RPOC_SHIF8|Q0SY12 +24 3 0.003 1.00E-03 R.VTAEDVLKPGTADILVPR.N RPOC_SHIF8|Q0SY12 +25 0.1 0.8 9.50E-01 R.TVVNVR.N RAF1_HUMAN|P04049 \ No newline at end of file diff --git a/tests/glpkinout/glpkin_test_parsimony.mod b/tests/glpkinout/glpkin_test_parsimony.mod deleted file mode 100644 index 9ed4dfd..0000000 --- a/tests/glpkinout/glpkin_test_parsimony.mod +++ /dev/null @@ -1,40 +0,0 @@ -/* sets */ -set PROTEINS; - - -/* decision variables: yi, i in {1,..,5}. yi = 1 -> protein i is selected */ -var y {i in PROTEINS} binary >=0; -/* objective function */ -minimize z: sum{i in PROTEINS} y[i]; - -/* Constraints */ -s.t. c1: y[6] >=1; -s.t. c2: y[3] >=1; -s.t. c3: y[2] >=1; -s.t. c4: y[3] >=1; -s.t. c5: y[3] >=1; -s.t. c6: y[2] >=1; -s.t. c7: y[0]+y[1] >=1; -s.t. c8: y[3] >=1; -s.t. c9: y[3] >=1; -s.t. c10: y[3] >=1; -s.t. c11: y[6]+y[7] >=1; -s.t. c12: y[4] >=1; -s.t. c13: y[5] >=1; -s.t. c14: y[3] >=1; -s.t. c15: y[3] >=1; -s.t. c16: y[0]+y[1]+y[2] >=1; -s.t. c17: y[3] >=1; -s.t. c18: y[0]+y[2] >=1; -s.t. c19: y[0] >=1; -s.t. c20: y[3] >=1; -s.t. c21: y[1]+y[2] >=1; -s.t. c22: y[3] >=1; -s.t. c23: y[3] >=1; -s.t. c24: y[2] >=1; -s.t. c25: y[4] >=1; - -data; -set PROTEINS := 5 0 6 1 7 2 3 4 ; - -end; \ No newline at end of file diff --git a/tests/glpkinout/glpkout_test_parsimony.sol b/tests/glpkinout/glpkout_test_parsimony.sol deleted file mode 100644 index 9475f76..0000000 --- a/tests/glpkinout/glpkout_test_parsimony.sol +++ /dev/null @@ -1,58 +0,0 @@ -Problem: glpkin_test_parsimony -Rows: 26 -Columns: 8 (8 integer, 8 binary) -Non-zeros: 39 -Status: INTEGER OPTIMAL -Objective: z = 6 (MINimum) - - No. 
Row name Activity Lower bound Upper bound ------- ------------ ------------- ------------- ------------- - 1 z 6 - 2 c1 1 1 - 3 c2 1 1 - 4 c3 1 1 - 5 c4 1 1 - 6 c5 1 1 - 7 c6 1 1 - 8 c7 1 1 - 9 c8 1 1 - 10 c9 1 1 - 11 c10 1 1 - 12 c11 1 1 - 13 c12 1 1 - 14 c13 1 1 - 15 c14 1 1 - 16 c15 1 1 - 17 c16 2 1 - 18 c17 1 1 - 19 c18 2 1 - 20 c19 1 1 - 21 c20 1 1 - 22 c21 1 1 - 23 c22 1 1 - 24 c23 1 1 - 25 c24 1 1 - 26 c25 1 1 - - No. Column name Activity Lower bound Upper bound ------- ------------ ------------- ------------- ------------- - 1 y[6] * 1 0 1 - 2 y[0] * 1 0 1 - 3 y[1] * 0 0 1 - 4 y[2] * 1 0 1 - 5 y[7] * 0 0 1 - 6 y[3] * 1 0 1 - 7 y[4] * 1 0 1 - 8 y[5] * 1 0 1 - -Integer feasibility conditions: - -KKT.PE: max.abs.err = 0.00e+00 on row 0 - max.rel.err = 0.00e+00 on row 0 - High quality - -KKT.PB: max.abs.err = 0.00e+00 on row 0 - max.rel.err = 0.00e+00 on row 0 - High quality - -End of output diff --git a/tests/glpkinout/glpkout_test_parsimony_no_grouping.sol b/tests/glpkinout/glpkout_test_parsimony_no_grouping.sol deleted file mode 100644 index 9475f76..0000000 --- a/tests/glpkinout/glpkout_test_parsimony_no_grouping.sol +++ /dev/null @@ -1,58 +0,0 @@ -Problem: glpkin_test_parsimony -Rows: 26 -Columns: 8 (8 integer, 8 binary) -Non-zeros: 39 -Status: INTEGER OPTIMAL -Objective: z = 6 (MINimum) - - No. Row name Activity Lower bound Upper bound ------- ------------ ------------- ------------- ------------- - 1 z 6 - 2 c1 1 1 - 3 c2 1 1 - 4 c3 1 1 - 5 c4 1 1 - 6 c5 1 1 - 7 c6 1 1 - 8 c7 1 1 - 9 c8 1 1 - 10 c9 1 1 - 11 c10 1 1 - 12 c11 1 1 - 13 c12 1 1 - 14 c13 1 1 - 15 c14 1 1 - 16 c15 1 1 - 17 c16 2 1 - 18 c17 1 1 - 19 c18 2 1 - 20 c19 1 1 - 21 c20 1 1 - 22 c21 1 1 - 23 c22 1 1 - 24 c23 1 1 - 25 c24 1 1 - 26 c25 1 1 - - No. 
Column name Activity Lower bound Upper bound ------- ------------ ------------- ------------- ------------- - 1 y[6] * 1 0 1 - 2 y[0] * 1 0 1 - 3 y[1] * 0 0 1 - 4 y[2] * 1 0 1 - 5 y[7] * 0 0 1 - 6 y[3] * 1 0 1 - 7 y[4] * 1 0 1 - 8 y[5] * 1 0 1 - -Integer feasibility conditions: - -KKT.PE: max.abs.err = 0.00e+00 on row 0 - max.rel.err = 0.00e+00 on row 0 - High quality - -KKT.PB: max.abs.err = 0.00e+00 on row 0 - max.rel.err = 0.00e+00 on row 0 - High quality - -End of output diff --git a/tests/glpkinout/glpkout_test_parsimony_subset_peptides.sol b/tests/glpkinout/glpkout_test_parsimony_subset_peptides.sol deleted file mode 100644 index 9475f76..0000000 --- a/tests/glpkinout/glpkout_test_parsimony_subset_peptides.sol +++ /dev/null @@ -1,58 +0,0 @@ -Problem: glpkin_test_parsimony -Rows: 26 -Columns: 8 (8 integer, 8 binary) -Non-zeros: 39 -Status: INTEGER OPTIMAL -Objective: z = 6 (MINimum) - - No. Row name Activity Lower bound Upper bound ------- ------------ ------------- ------------- ------------- - 1 z 6 - 2 c1 1 1 - 3 c2 1 1 - 4 c3 1 1 - 5 c4 1 1 - 6 c5 1 1 - 7 c6 1 1 - 8 c7 1 1 - 9 c8 1 1 - 10 c9 1 1 - 11 c10 1 1 - 12 c11 1 1 - 13 c12 1 1 - 14 c13 1 1 - 15 c14 1 1 - 16 c15 1 1 - 17 c16 2 1 - 18 c17 1 1 - 19 c18 2 1 - 20 c19 1 1 - 21 c20 1 1 - 22 c21 1 1 - 23 c22 1 1 - 24 c23 1 1 - 25 c24 1 1 - 26 c25 1 1 - - No. 
Column name Activity Lower bound Upper bound ------- ------------ ------------- ------------- ------------- - 1 y[6] * 1 0 1 - 2 y[0] * 1 0 1 - 3 y[1] * 0 0 1 - 4 y[2] * 1 0 1 - 5 y[7] * 0 0 1 - 6 y[3] * 1 0 1 - 7 y[4] * 1 0 1 - 8 y[5] * 1 0 1 - -Integer feasibility conditions: - -KKT.PE: max.abs.err = 0.00e+00 on row 0 - max.rel.err = 0.00e+00 on row 0 - High quality - -KKT.PB: max.abs.err = 0.00e+00 on row 0 - max.rel.err = 0.00e+00 on row 0 - High quality - -End of output diff --git a/tests/test_001_parsimony_pipeline_glpk.py b/tests/test_001_parsimony_pipeline_glpk.py deleted file mode 100644 index e7eee4b..0000000 --- a/tests/test_001_parsimony_pipeline_glpk.py +++ /dev/null @@ -1,924 +0,0 @@ -import csv -import os -import tempfile -from unittest import TestCase - -from pkg_resources import resource_filename - -import pyproteininference -from pyproteininference import in_silico_digest -from pyproteininference.parameters import ProteinInferenceParameter - -TEST_DATABASE = resource_filename("pyproteininference", "../tests/data/test_database.fasta") -TARGET_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_target.txt") -DECOY_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_decoy.txt") -PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_parsimony_glpk.yaml") -OUTPUT_DIR = tempfile.gettempdir() -# OUTPUT_DIR = resource_filename('pyproteininference', '../tests/output/') -for sub_dir in ["leads", "all", "peptides", "psms", "psm_ids"]: - if not os.path.exists(os.path.join(OUTPUT_DIR, sub_dir)): - os.makedirs(os.path.join(OUTPUT_DIR, sub_dir)) - -GLPKINOUT_PATH = resource_filename("pyproteininference", "../tests/glpkinout/") -SKIP_RUNNING_GLPK = True - -LEAD_OUTPUT_FILE = resource_filename( - "pyproteininference", - "../tests/output/leads/test_parsimony_q_value_leads_ml_posterior_error_prob.csv", -) -ALL_OUTPUT_FILE = resource_filename( - "pyproteininference", - 
"../tests/output/all/test_parsimony_q_value_all_ml_posterior_error_prob.csv", -) -PEPTIDE_OUTPUT_FILE = resource_filename( - "pyproteininference", - "../tests/output/peptides/test_parsimony_q_value_leads_peptides_ml_posterior_error_prob.csv", -) -PSM_OUTPUT_FILE = resource_filename( - "pyproteininference", - "../tests/output/psms/test_parsimony_q_value_leads_psms_ml_posterior_error_prob.csv", -) -PSM_ID_OUTPUT_FILE = resource_filename( - "pyproteininference", - "../tests/output/psm_ids/test_parsimony_q_value_leads_psm_ids_ml_posterior_error_prob.csv", -) - -LEAD_OUTPUT_FILE_SUBSET = resource_filename( - "pyproteininference", - "../tests/output/leads/test_parsimony_subset_peptides_q_value_leads_ml_posterior_error_prob.csv", -) -ALL_OUTPUT_FILE_SUBSET = resource_filename( - "pyproteininference", - "../tests/output/all/test_parsimony_subset_peptides_q_value_all_ml_posterior_error_prob.csv", -) -PEPTIDE_OUTPUT_FILE_SUBSET = resource_filename( - "pyproteininference", - "../tests/output/peptides/test_parsimony_subset_peptides_q_value_leads_peptides_ml_posterior_error_prob.csv", -) -PSM_OUTPUT_FILE_SUBSET = resource_filename( - "pyproteininference", - "../tests/output/psms/test_parsimony_subset_peptides_q_value_leads_psms_ml_posterior_error_prob.csv", -) -PSM_ID_OUTPUT_FILE_SUBSET = resource_filename( - "pyproteininference", - "../tests/output/psm_ids/test_parsimony_subset_peptides_q_value_leads_psm_ids_ml_posterior_error_prob.csv", -) - -LEAD_OUTPUT_FILE_NONE = resource_filename( - "pyproteininference", - "../tests/output/leads/test_parsimony_no_grouping_q_value_leads_ml_posterior_error_prob.csv", -) -ALL_OUTPUT_FILE_NONE = resource_filename( - "pyproteininference", - "../tests/output/all/test_parsimony_no_grouping_q_value_all_ml_posterior_error_prob.csv", -) -PEPTIDE_OUTPUT_FILE_NONE = resource_filename( - "pyproteininference", - "../tests/output/peptides/test_parsimony_no_grouping_q_value_leads_peptides_ml_posterior_error_prob.csv", -) -PSM_OUTPUT_FILE_NONE = 
resource_filename( - "pyproteininference", - "../tests/output/psms/test_parsimony_no_grouping_q_value_leads_psms_ml_posterior_error_prob.csv", -) -PSM_ID_OUTPUT_FILE_NONE = resource_filename( - "pyproteininference", - "../tests/output/psm_ids/test_parsimony_no_grouping_q_value_leads_psm_ids_ml_posterior_error_prob.csv", -) - -IDENTIFIER_INDEX = 0 -SCORE_INDEX = 1 -Q_VALUE_INDEX = 2 -GROUP_ID_INDEX = 5 -PEPTIDES_INDEX = 6 - - -class TestLoadParsimonyGlpkWorkflow(TestCase): - def test_workflow_parsimony_glpk(self): - - # STEP 1: Load parameter file # - # STEP 1: Load parameter file # - # STEP 1: Load parameter file # - protein_inference_parameters = ProteinInferenceParameter(yaml_param_filepath=PARAMETER_FILE) - - self.assertEqual(protein_inference_parameters.digest_type, "trypsin") - self.assertEqual(protein_inference_parameters.export, "q_value") - self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, "glpsol") - self.assertEqual(protein_inference_parameters.missed_cleavages, 3) - self.assertEqual(protein_inference_parameters.picker, True) - self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) - self.assertEqual(protein_inference_parameters.restrict_peptide_length, 7) - self.assertEqual(protein_inference_parameters.restrict_q, 0.9) - self.assertEqual(protein_inference_parameters.protein_score, "multiplicative_log") - self.assertEqual(protein_inference_parameters.psm_score, "posterior_error_prob") - self.assertEqual(protein_inference_parameters.psm_score_type, "multiplicative") - self.assertEqual(protein_inference_parameters.decoy_symbol, "##") - self.assertEqual(protein_inference_parameters.isoform_symbol, "-") - self.assertEqual(protein_inference_parameters.reviewed_identifier_symbol, "sp|") - self.assertEqual(protein_inference_parameters.inference_type, "parsimony") - self.assertEqual(protein_inference_parameters.tag, "test_parsimony") - 
self.assertEqual(protein_inference_parameters.grouping_type, "shared_peptides") - self.assertEqual(protein_inference_parameters.max_identifiers_peptide_centric, 5) - self.assertEqual(protein_inference_parameters.lp_solver, "glpk") - self.assertEqual(protein_inference_parameters.restrict_custom, None) - - # STEP 2: Start with running an In Silico Digestion # - # STEP 2: Start with running an In Silico Digestion # - # STEP 2: Start with running an In Silico Digestion # - digest = in_silico_digest.PyteomicsDigest( - database_path=TEST_DATABASE, - digest_type=protein_inference_parameters.digest_type, - missed_cleavages=protein_inference_parameters.missed_cleavages, - reviewed_identifier_symbol=protein_inference_parameters.reviewed_identifier_symbol, - max_peptide_length=protein_inference_parameters.restrict_peptide_length, - id_splitting=True, - ) - digest.digest_fasta_database() - - # STEP 3: Read PSM Data # - # STEP 3: Read PSM Data # - # STEP 3: Read PSM Data # - pep_and_prot_data = pyproteininference.reader.GenericReader( - target_file=TARGET_FILE, - decoy_file=DECOY_FILE, - parameter_file_object=protein_inference_parameters, - digest=digest, - append_alt_from_db=False, - ) - pep_and_prot_data.read_psms() - - self.assertEqual(len(pep_and_prot_data.psms), 27) - - # STEP 4: Initiate the datastore class # - # STEP 4: Initiate the datastore class # - # STEP 4: Initiate the datastore class # - data = pyproteininference.datastore.DataStore(pep_and_prot_data, digest=digest) - - # Step 5: Restrict the PSM data - # Step 5: Restrict the PSM data - # Step 5: Restrict the PSM data - data.restrict_psm_data() - - self.assertEqual(len(data.main_data_restricted), 26) - - # Step 6: Generate protein scoring input - # Step 6: Generate protein scoring input - # Step 6: Generate protein scoring input - data.create_scoring_input() - - # Step 7: Remove non unique peptides if running exclusion - # Step 7: Remove non unique peptides if running exclusion - # Step 7: Remove non unique 
peptides if running exclusion - if protein_inference_parameters.inference_type == pyproteininference.inference.Inference.EXCLUSION: - # This gets ran if we run exclusion... - data.exclude_non_distinguishing_peptides() - - # STEP 8: Score our PSMs given a score method - # STEP 8: Score our PSMs given a score method - # STEP 8: Score our PSMs given a score method - score = pyproteininference.scoring.Score(data=data) - score.score_psms(score_method=protein_inference_parameters.protein_score) - - # STEP 9: Run protein picker on the data - # STEP 9: Run protein picker on the data - # STEP 9: Run protein picker on the data - if protein_inference_parameters.picker: - data.protein_picker() - else: - pass - - # STEP 10: Apply Inference - # STEP 10: Apply Inference - # STEP 10: Apply Inference - inference_type = protein_inference_parameters.inference_type - - # For parsimony... Run GLPK setup, runner, grouper... - if inference_type == pyproteininference.inference.Inference.PARSIMONY: - group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) - - if inference_type == pyproteininference.inference.Inference.INCLUSION: - group = pyproteininference.inference.Inclusion(data=data, digest=digest) - group.infer_proteins() - - if inference_type == pyproteininference.inference.Inference.EXCLUSION: - group = pyproteininference.inference.Exclusion(data=data, digest=digest) - group.infer_proteins() - - # STEP 11: Run FDR and Q value Calculations - # STEP 11: Run FDR and Q value Calculations - # STEP 11: Run FDR and Q value Calculations - data.calculate_q_values() - - # STEP 12: Export to CSV - # STEP 12: Export to CSV - # STEP 12: Export to CSV - export_type = protein_inference_parameters.export - export = pyproteininference.export.Export(data=data) - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "leads"), export_type=export_type) - - lead_output = [] - with 
open(LEAD_OUTPUT_FILE, "r") as lead_output_file: - reader = csv.reader(lead_output_file, delimiter=",") - for row in reader: - lead_output.append(row) - - del lead_output[0] - - protein_groups = data.protein_group_objects - - for i in range(len(protein_groups)): - lead_protein = protein_groups[i].proteins[0] - self.assertEqual(lead_protein.identifier, lead_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(lead_protein.score, float(lead_output[i][SCORE_INDEX])) - self.assertEqual(protein_groups[i].q_value, float(lead_output[i][Q_VALUE_INDEX])) - self.assertEqual(protein_groups[i].number_id, int(lead_output[i][GROUP_ID_INDEX])) - self.assertEqual(lead_protein.peptides, set(lead_output[i][PEPTIDES_INDEX:])) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "all"), export_type="q_value_all") - - all_output = [] - with open(ALL_OUTPUT_FILE, "r") as all_output_file: - reader = csv.reader(all_output_file, delimiter=",") - for row in reader: - all_output.append(row) - - del all_output[0] - - all_output_new = [] - with open(export.filepath, "r") as all_output_file_new: - reader = csv.reader(all_output_file_new, delimiter=",") - for row in reader: - all_output_new.append(row) - - del all_output_new[0] - - for i in range(len(all_output)): - self.assertEqual(all_output_new[i][IDENTIFIER_INDEX], all_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(float(all_output_new[i][SCORE_INDEX]), float(all_output[i][SCORE_INDEX])) - self.assertEqual( - float(all_output_new[i][Q_VALUE_INDEX]), - float(all_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(all_output_new[i][GROUP_ID_INDEX]), - int(all_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(all_output_new[i][PEPTIDES_INDEX:]), - set(all_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "peptides"), export_type="peptides") - - peptide_output = [] - with open(PEPTIDE_OUTPUT_FILE, "r") as peptide_output_file: - reader = csv.reader(peptide_output_file, 
delimiter=",") - for row in reader: - peptide_output.append(row) - - del peptide_output[0] - - peptide_output_new = [] - with open(export.filepath, "r") as peptide_output_file_new: - reader = csv.reader(peptide_output_file_new, delimiter=",") - for row in reader: - peptide_output_new.append(row) - - del peptide_output_new[0] - - for i in range(len(peptide_output)): - self.assertEqual( - peptide_output_new[i][IDENTIFIER_INDEX], - peptide_output[i][IDENTIFIER_INDEX], - ) - self.assertAlmostEqual( - float(peptide_output_new[i][SCORE_INDEX]), - float(peptide_output[i][SCORE_INDEX]), - ) - self.assertEqual( - float(peptide_output_new[i][Q_VALUE_INDEX]), - float(peptide_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(peptide_output_new[i][GROUP_ID_INDEX]), - int(peptide_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(peptide_output_new[i][PEPTIDES_INDEX:]), - set(peptide_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "psms"), export_type="psms") - - psm_output = [] - with open(PSM_OUTPUT_FILE, "r") as psm_output_file: - reader = csv.reader(psm_output_file, delimiter=",") - for row in reader: - psm_output.append(row) - - del psm_output[0] - - psm_output_new = [] - with open(export.filepath, "r") as psm_output_file_new: - reader = csv.reader(psm_output_file_new, delimiter=",") - for row in reader: - psm_output_new.append(row) - - del psm_output_new[0] - - for i in range(len(psm_output)): - self.assertEqual(psm_output_new[i][IDENTIFIER_INDEX], psm_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(float(psm_output_new[i][SCORE_INDEX]), float(psm_output[i][SCORE_INDEX])) - self.assertEqual( - float(psm_output_new[i][Q_VALUE_INDEX]), - float(psm_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(psm_output_new[i][GROUP_ID_INDEX]), - int(psm_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(psm_output_new[i][PEPTIDES_INDEX:]), - set(psm_output[i][PEPTIDES_INDEX:]), - ) - - 
export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "psm_ids"), export_type="psm_ids") - - psm_id_output = [] - with open(PSM_ID_OUTPUT_FILE, "r") as psm_id_output_file: - reader = csv.reader(psm_id_output_file, delimiter=",") - for row in reader: - psm_id_output.append(row) - - del psm_id_output[0] - - psm_id_output_new = [] - with open(export.filepath, "r") as psm_id_output_file_new: - reader = csv.reader(psm_id_output_file_new, delimiter=",") - for row in reader: - psm_id_output_new.append(row) - - del psm_id_output_new[0] - - for i in range(len(psm_id_output)): - self.assertEqual( - psm_id_output_new[i][IDENTIFIER_INDEX], - psm_id_output[i][IDENTIFIER_INDEX], - ) - self.assertAlmostEqual( - float(psm_id_output_new[i][SCORE_INDEX]), - float(psm_id_output[i][SCORE_INDEX]), - ) - self.assertEqual( - float(psm_id_output_new[i][Q_VALUE_INDEX]), - float(psm_id_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(psm_id_output_new[i][GROUP_ID_INDEX]), - int(psm_id_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(psm_id_output_new[i][PEPTIDES_INDEX:]), - set(psm_id_output[i][PEPTIDES_INDEX:]), - ) - - def test_workflow_parsimony_glpk_subset_peptides(self): - # RUN AGAIN WITH DIFFERENT GROUPING TYPE - - # STEP 1: Load parameter file # - # STEP 1: Load parameter file # - # STEP 1: Load parameter file # - protein_inference_parameters = ProteinInferenceParameter(yaml_param_filepath=PARAMETER_FILE) - - protein_inference_parameters.grouping_type = "subset_peptides" - protein_inference_parameters.tag = "test_parsimony_subset_peptides" - # STEP 2: Start with running an In Silico Digestion # - # STEP 2: Start with running an In Silico Digestion # - # STEP 2: Start with running an In Silico Digestion # - digest = in_silico_digest.PyteomicsDigest( - database_path=TEST_DATABASE, - digest_type=protein_inference_parameters.digest_type, - missed_cleavages=protein_inference_parameters.missed_cleavages, - 
reviewed_identifier_symbol=protein_inference_parameters.reviewed_identifier_symbol, - max_peptide_length=protein_inference_parameters.restrict_peptide_length, - id_splitting=True, - ) - digest.digest_fasta_database() - - # STEP 3: Read PSM Data # - # STEP 3: Read PSM Data # - # STEP 3: Read PSM Data # - pep_and_prot_data = pyproteininference.reader.GenericReader( - target_file=TARGET_FILE, - decoy_file=DECOY_FILE, - parameter_file_object=protein_inference_parameters, - digest=digest, - append_alt_from_db=False, - ) - pep_and_prot_data.read_psms() - - self.assertEqual(len(pep_and_prot_data.psms), 27) - - # STEP 4: Initiate the datastore class # - # STEP 4: Initiate the datastore class # - # STEP 4: Initiate the datastore class # - data = pyproteininference.datastore.DataStore(pep_and_prot_data, digest=digest) - - # Step 5: Restrict the PSM data - # Step 5: Restrict the PSM data - # Step 5: Restrict the PSM data - data.restrict_psm_data() - - self.assertEqual(len(data.main_data_restricted), 26) - - # Step 6: Generate protein scoring input - # Step 6: Generate protein scoring input - # Step 6: Generate protein scoring input - data.create_scoring_input() - - # Step 7: Remove non unique peptides if running exclusion - # Step 7: Remove non unique peptides if running exclusion - # Step 7: Remove non unique peptides if running exclusion - if protein_inference_parameters.inference_type == pyproteininference.inference.Inference.EXCLUSION: - # This gets ran if we run exclusion... 
- data.exclude_non_distinguishing_peptides() - - # STEP 8: Score our PSMs given a score method - # STEP 8: Score our PSMs given a score method - # STEP 8: Score our PSMs given a score method - score = pyproteininference.scoring.Score(data=data) - score.score_psms(score_method=protein_inference_parameters.protein_score) - - # STEP 9: Run protein picker on the data - # STEP 9: Run protein picker on the data - # STEP 9: Run protein picker on the data - if protein_inference_parameters.picker: - data.protein_picker() - else: - pass - - # STEP 10: Apply Inference - # STEP 10: Apply Inference - # STEP 10: Apply Inference - inference_type = protein_inference_parameters.inference_type - - # For parsimony... Run GLPK setup, runner, grouper... - if inference_type == pyproteininference.inference.Inference.PARSIMONY: - group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) - - if inference_type == pyproteininference.inference.Inference.INCLUSION: - group = pyproteininference.inference.Inclusion(data=data, digest=digest) - group.infer_proteins() - - if inference_type == pyproteininference.inference.Inference.EXCLUSION: - group = pyproteininference.inference.Exclusion(data=data, digest=digest) - group.infer_proteins() - - # STEP 11: Run FDR and Q value Calculations - # STEP 11: Run FDR and Q value Calculations - # STEP 11: Run FDR and Q value Calculations - data.calculate_q_values() - - # STEP 12: Export to CSV - # STEP 12: Export to CSV - # STEP 12: Export to CSV - export_type = protein_inference_parameters.export - export = pyproteininference.export.Export(data=data) - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "leads"), export_type=export_type) - lead_output = [] - with open(LEAD_OUTPUT_FILE_SUBSET, "r") as lead_output_file: - reader = csv.reader(lead_output_file, delimiter=",") - for row in reader: - lead_output.append(row) - - del lead_output[0] - - 
protein_groups = data.protein_group_objects - - for i in range(len(protein_groups)): - lead_protein = protein_groups[i].proteins[0] - self.assertEqual(lead_protein.identifier, lead_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(lead_protein.score, float(lead_output[i][SCORE_INDEX])) - self.assertEqual(protein_groups[i].q_value, float(lead_output[i][Q_VALUE_INDEX])) - self.assertEqual(protein_groups[i].number_id, int(lead_output[i][GROUP_ID_INDEX])) - self.assertEqual(lead_protein.peptides, set(lead_output[i][PEPTIDES_INDEX:])) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "all"), export_type="q_value_all") - - all_output = [] - with open(ALL_OUTPUT_FILE_SUBSET, "r") as all_output_file: - reader = csv.reader(all_output_file, delimiter=",") - for row in reader: - all_output.append(row) - - del all_output[0] - - all_output_new = [] - with open(export.filepath, "r") as all_output_file_new: - reader = csv.reader(all_output_file_new, delimiter=",") - for row in reader: - all_output_new.append(row) - - del all_output_new[0] - - for i in range(len(all_output)): - self.assertEqual(all_output_new[i][IDENTIFIER_INDEX], all_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(float(all_output_new[i][SCORE_INDEX]), float(all_output[i][SCORE_INDEX])) - self.assertEqual( - float(all_output_new[i][Q_VALUE_INDEX]), - float(all_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(all_output_new[i][GROUP_ID_INDEX]), - int(all_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(all_output_new[i][PEPTIDES_INDEX:]), - set(all_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "peptides"), export_type="peptides") - - peptide_output = [] - with open(PEPTIDE_OUTPUT_FILE_SUBSET, "r") as peptide_output_file: - reader = csv.reader(peptide_output_file, delimiter=",") - for row in reader: - peptide_output.append(row) - - del peptide_output[0] - - peptide_output_new = [] - with open(export.filepath, "r") as 
peptide_output_file_new: - reader = csv.reader(peptide_output_file_new, delimiter=",") - for row in reader: - peptide_output_new.append(row) - - del peptide_output_new[0] - - for i in range(len(peptide_output)): - self.assertEqual( - peptide_output_new[i][IDENTIFIER_INDEX], - peptide_output[i][IDENTIFIER_INDEX], - ) - self.assertAlmostEqual( - float(peptide_output_new[i][SCORE_INDEX]), - float(peptide_output[i][SCORE_INDEX]), - ) - self.assertEqual( - float(peptide_output_new[i][Q_VALUE_INDEX]), - float(peptide_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(peptide_output_new[i][GROUP_ID_INDEX]), - int(peptide_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(peptide_output_new[i][PEPTIDES_INDEX:]), - set(peptide_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "psms"), export_type="psms") - - psm_output = [] - with open(PSM_OUTPUT_FILE_SUBSET, "r") as psm_output_file: - reader = csv.reader(psm_output_file, delimiter=",") - for row in reader: - psm_output.append(row) - - del psm_output[0] - - psm_output_new = [] - with open(export.filepath, "r") as psm_output_file_new: - reader = csv.reader(psm_output_file_new, delimiter=",") - for row in reader: - psm_output_new.append(row) - - del psm_output_new[0] - - for i in range(len(psm_output)): - self.assertEqual(psm_output_new[i][IDENTIFIER_INDEX], psm_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(float(psm_output_new[i][SCORE_INDEX]), float(psm_output[i][SCORE_INDEX])) - self.assertEqual( - float(psm_output_new[i][Q_VALUE_INDEX]), - float(psm_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(psm_output_new[i][GROUP_ID_INDEX]), - int(psm_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(psm_output_new[i][PEPTIDES_INDEX:]), - set(psm_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "psm_ids"), export_type="psm_ids") - - psm_id_output = [] - with open(PSM_ID_OUTPUT_FILE_SUBSET, "r") as 
psm_id_output_file: - reader = csv.reader(psm_id_output_file, delimiter=",") - for row in reader: - psm_id_output.append(row) - - del psm_id_output[0] - - psm_id_output_new = [] - with open(export.filepath, "r") as psm_id_output_file_new: - reader = csv.reader(psm_id_output_file_new, delimiter=",") - for row in reader: - psm_id_output_new.append(row) - - del psm_id_output_new[0] - - for i in range(len(psm_id_output)): - self.assertEqual( - psm_id_output_new[i][IDENTIFIER_INDEX], - psm_id_output[i][IDENTIFIER_INDEX], - ) - self.assertAlmostEqual( - float(psm_id_output_new[i][SCORE_INDEX]), - float(psm_id_output[i][SCORE_INDEX]), - ) - self.assertEqual( - float(psm_id_output_new[i][Q_VALUE_INDEX]), - float(psm_id_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(psm_id_output_new[i][GROUP_ID_INDEX]), - int(psm_id_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(psm_id_output_new[i][PEPTIDES_INDEX:]), - set(psm_id_output[i][PEPTIDES_INDEX:]), - ) - - def test_workflow_parsimony_glpk_no_grouping(self): - # NOW RUN WITH NO GROUPING - - # STEP 1: Load parameter file # - # STEP 1: Load parameter file # - # STEP 1: Load parameter file # - protein_inference_parameters = ProteinInferenceParameter(yaml_param_filepath=PARAMETER_FILE) - - protein_inference_parameters.grouping_type = None - protein_inference_parameters.tag = "test_parsimony_no_grouping" - # STEP 2: Start with running an In Silico Digestion # - # STEP 2: Start with running an In Silico Digestion # - # STEP 2: Start with running an In Silico Digestion # - digest = in_silico_digest.PyteomicsDigest( - database_path=TEST_DATABASE, - digest_type=protein_inference_parameters.digest_type, - missed_cleavages=protein_inference_parameters.missed_cleavages, - reviewed_identifier_symbol=protein_inference_parameters.reviewed_identifier_symbol, - max_peptide_length=protein_inference_parameters.restrict_peptide_length, - id_splitting=True, - ) - digest.digest_fasta_database() - - # STEP 3: Read PSM Data # - # 
STEP 3: Read PSM Data # - # STEP 3: Read PSM Data # - pep_and_prot_data = pyproteininference.reader.GenericReader( - target_file=TARGET_FILE, - decoy_file=DECOY_FILE, - parameter_file_object=protein_inference_parameters, - digest=digest, - append_alt_from_db=False, - ) - pep_and_prot_data.read_psms() - - self.assertEqual(len(pep_and_prot_data.psms), 27) - - # STEP 4: Initiate the datastore class # - # STEP 4: Initiate the datastore class # - # STEP 4: Initiate the datastore class # - data = pyproteininference.datastore.DataStore(pep_and_prot_data, digest=digest) - - # Step 5: Restrict the PSM data - # Step 5: Restrict the PSM data - # Step 5: Restrict the PSM data - data.restrict_psm_data() - - self.assertEqual(len(data.main_data_restricted), 26) - - # Step 6: Generate protein scoring input - # Step 6: Generate protein scoring input - # Step 6: Generate protein scoring input - data.create_scoring_input() - - # Step 7: Remove non unique peptides if running exclusion - # Step 7: Remove non unique peptides if running exclusion - # Step 7: Remove non unique peptides if running exclusion - if protein_inference_parameters.inference_type == pyproteininference.inference.Inference.EXCLUSION: - # This gets ran if we run exclusion... - data.exclude_non_distinguishing_peptides() - - # STEP 8: Score our PSMs given a score method - # STEP 8: Score our PSMs given a score method - # STEP 8: Score our PSMs given a score method - score = pyproteininference.scoring.Score(data=data) - score.score_psms(score_method=protein_inference_parameters.protein_score) - - # STEP 9: Run protein picker on the data - # STEP 9: Run protein picker on the data - # STEP 9: Run protein picker on the data - if protein_inference_parameters.picker: - data.protein_picker() - else: - pass - - # STEP 10: Apply Inference - # STEP 10: Apply Inference - # STEP 10: Apply Inference - inference_type = protein_inference_parameters.inference_type - - # For parsimony... Run GLPK setup, runner, grouper... 
- if inference_type == pyproteininference.inference.Inference.PARSIMONY: - group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) - - if inference_type == pyproteininference.inference.Inference.INCLUSION: - group = pyproteininference.inference.Inclusion(data=data, digest=digest) - group.infer_proteins() - - if inference_type == pyproteininference.inference.Inference.EXCLUSION: - group = pyproteininference.inference.Exclusion(data=data, digest=digest) - group.infer_proteins() - - # STEP 11: Run FDR and Q value Calculations - # STEP 11: Run FDR and Q value Calculations - # STEP 11: Run FDR and Q value Calculations - data.calculate_q_values() - - # STEP 12: Export to CSV - # STEP 12: Export to CSV - # STEP 12: Export to CSV - export_type = protein_inference_parameters.export - export = pyproteininference.export.Export(data=data) - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "leads"), export_type=export_type) - lead_output = [] - with open(LEAD_OUTPUT_FILE_NONE, "r") as lead_output_file: - reader = csv.reader(lead_output_file, delimiter=",") - for row in reader: - lead_output.append(row) - - del lead_output[0] - - protein_groups = data.protein_group_objects - - for i in range(len(protein_groups)): - lead_protein = protein_groups[i].proteins[0] - self.assertEqual(lead_protein.identifier, lead_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(lead_protein.score, float(lead_output[i][SCORE_INDEX])) - self.assertEqual(protein_groups[i].q_value, float(lead_output[i][Q_VALUE_INDEX])) - self.assertEqual(protein_groups[i].number_id, int(lead_output[i][GROUP_ID_INDEX])) - self.assertEqual(lead_protein.peptides, set(lead_output[i][PEPTIDES_INDEX:])) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "all"), export_type="q_value_all") - - all_output = [] - with open(ALL_OUTPUT_FILE_NONE, "r") as all_output_file: - reader = 
csv.reader(all_output_file, delimiter=",") - for row in reader: - all_output.append(row) - - del all_output[0] - - all_output_new = [] - with open(export.filepath, "r") as all_output_file_new: - reader = csv.reader(all_output_file_new, delimiter=",") - for row in reader: - all_output_new.append(row) - - del all_output_new[0] - - for i in range(len(all_output)): - self.assertEqual(all_output_new[i][IDENTIFIER_INDEX], all_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(float(all_output_new[i][SCORE_INDEX]), float(all_output[i][SCORE_INDEX])) - self.assertEqual( - float(all_output_new[i][Q_VALUE_INDEX]), - float(all_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(all_output_new[i][GROUP_ID_INDEX]), - int(all_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(all_output_new[i][PEPTIDES_INDEX:]), - set(all_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "peptides"), export_type="peptides") - - peptide_output = [] - with open(PEPTIDE_OUTPUT_FILE_NONE, "r") as peptide_output_file: - reader = csv.reader(peptide_output_file, delimiter=",") - for row in reader: - peptide_output.append(row) - - del peptide_output[0] - - peptide_output_new = [] - with open(export.filepath, "r") as peptide_output_file_new: - reader = csv.reader(peptide_output_file_new, delimiter=",") - for row in reader: - peptide_output_new.append(row) - - del peptide_output_new[0] - - for i in range(len(peptide_output)): - self.assertEqual( - peptide_output_new[i][IDENTIFIER_INDEX], - peptide_output[i][IDENTIFIER_INDEX], - ) - self.assertAlmostEqual( - float(peptide_output_new[i][SCORE_INDEX]), - float(peptide_output[i][SCORE_INDEX]), - ) - self.assertEqual( - float(peptide_output_new[i][Q_VALUE_INDEX]), - float(peptide_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(peptide_output_new[i][GROUP_ID_INDEX]), - int(peptide_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(peptide_output_new[i][PEPTIDES_INDEX:]), - 
set(peptide_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "psms"), export_type="psms") - - psm_output = [] - with open(PSM_OUTPUT_FILE_NONE, "r") as psm_output_file: - reader = csv.reader(psm_output_file, delimiter=",") - for row in reader: - psm_output.append(row) - - del psm_output[0] - - psm_output_new = [] - with open(export.filepath, "r") as psm_output_file_new: - reader = csv.reader(psm_output_file_new, delimiter=",") - for row in reader: - psm_output_new.append(row) - - del psm_output_new[0] - - for i in range(len(psm_output)): - self.assertEqual(psm_output_new[i][IDENTIFIER_INDEX], psm_output[i][IDENTIFIER_INDEX]) - self.assertAlmostEqual(float(psm_output_new[i][SCORE_INDEX]), float(psm_output[i][SCORE_INDEX])) - self.assertEqual( - float(psm_output_new[i][Q_VALUE_INDEX]), - float(psm_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(psm_output_new[i][GROUP_ID_INDEX]), - int(psm_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(psm_output_new[i][PEPTIDES_INDEX:]), - set(psm_output[i][PEPTIDES_INDEX:]), - ) - - export.export_to_csv(directory=os.path.join(OUTPUT_DIR, "psm_ids"), export_type="psm_ids") - - psm_id_output = [] - with open(PSM_ID_OUTPUT_FILE_NONE, "r") as psm_id_output_file: - reader = csv.reader(psm_id_output_file, delimiter=",") - for row in reader: - psm_id_output.append(row) - - del psm_id_output[0] - - psm_id_output_new = [] - with open(export.filepath, "r") as psm_id_output_file_new: - reader = csv.reader(psm_id_output_file_new, delimiter=",") - for row in reader: - psm_id_output_new.append(row) - - del psm_id_output_new[0] - - for i in range(len(psm_id_output)): - self.assertEqual( - psm_id_output_new[i][IDENTIFIER_INDEX], - psm_id_output[i][IDENTIFIER_INDEX], - ) - self.assertAlmostEqual( - float(psm_id_output_new[i][SCORE_INDEX]), - float(psm_id_output[i][SCORE_INDEX]), - ) - self.assertEqual( - float(psm_id_output_new[i][Q_VALUE_INDEX]), - 
float(psm_id_output[i][Q_VALUE_INDEX]), - ) - self.assertEqual( - int(psm_id_output_new[i][GROUP_ID_INDEX]), - int(psm_id_output[i][GROUP_ID_INDEX]), - ) - self.assertEqual( - set(psm_id_output_new[i][PEPTIDES_INDEX:]), - set(psm_id_output[i][PEPTIDES_INDEX:]), - ) diff --git a/tests/test_002_inclusion_pipeline.py b/tests/test_002_inclusion_pipeline.py index 92e94d7..fee86dd 100644 --- a/tests/test_002_inclusion_pipeline.py +++ b/tests/test_002_inclusion_pipeline.py @@ -58,7 +58,6 @@ def test_workflow_inclusion(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, "glpsol") self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -146,7 +145,6 @@ def test_workflow_inclusion(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) group.infer_proteins() diff --git a/tests/test_003_exclusion_pipeline.py b/tests/test_003_exclusion_pipeline.py index 27e6d79..cf06a24 100644 --- a/tests/test_003_exclusion_pipeline.py +++ b/tests/test_003_exclusion_pipeline.py @@ -58,7 +58,6 @@ def test_workflow_exclusion(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, None) self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -146,7 +145,6 @@ def test_workflow_exclusion(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) group.infer_proteins() diff --git a/tests/test_004_gfy_type_no_inference_pipeline.py b/tests/test_004_gfy_type_no_inference_pipeline.py index 8752406..c7a3003 100644 --- a/tests/test_004_gfy_type_no_inference_pipeline.py +++ b/tests/test_004_gfy_type_no_inference_pipeline.py @@ -58,7 +58,6 @@ def test_workflow_no_inference(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, None) self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -146,7 +145,6 @@ def test_workflow_no_inference(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) group.infer_proteins() diff --git a/tests/test_005_parsimony_pipeline_pulp.py b/tests/test_005_parsimony_pipeline_pulp.py index ae9cf93..66210c2 100644 --- a/tests/test_005_parsimony_pipeline_pulp.py +++ b/tests/test_005_parsimony_pipeline_pulp.py @@ -90,7 +90,6 @@ class TestLoadParsimonyPulpWorkflow(TestCase): - # @unittest.skip("Skipping Pulp Test, No CBC executable in build env") def test_workflow_parsimony_pulp(self): # STEP 1: Load parameter file # @@ -101,7 +100,6 @@ def test_workflow_parsimony_pulp(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, None) self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -189,10 +187,9 @@ def test_workflow_parsimony_pulp(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=None, skip_running_glpk=None) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) @@ -380,8 +377,7 @@ def test_workflow_parsimony_pulp(self): set(psm_id_output[i][PEPTIDES_INDEX:]), ) - # @unittest.skip("Skipping Pulp Subset Peptides Test, No CBC executable in build env") - def test_workflow_parsimony_glpk_subset_peptides(self): + def test_workflow_parsimony_pulp_subset_peptides(self): # STEP 1: Load parameter file # # STEP 1: Load parameter file # @@ -460,10 +456,9 @@ def test_workflow_parsimony_glpk_subset_peptides(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=None, skip_running_glpk=None) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) @@ -650,8 +645,7 @@ def test_workflow_parsimony_glpk_subset_peptides(self): set(psm_id_output[i][PEPTIDES_INDEX:]), ) - # @unittest.skip("Skipping Pulp No Grouping Test, No CBC executable in build env") - def test_workflow_parsimony_glpk_no_grouping(self): + def test_workflow_parsimony_pulp_no_grouping(self): # NOW RUN WITH NO GROUPING # STEP 1: Load parameter file # @@ -731,10 +725,9 @@ def test_workflow_parsimony_glpk_no_grouping(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=None, skip_running_glpk=None) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) diff --git a/tests/test_006_peptide_centric_pipeline.py b/tests/test_006_peptide_centric_pipeline.py index 86e5a64..616db66 100644 --- a/tests/test_006_peptide_centric_pipeline.py +++ b/tests/test_006_peptide_centric_pipeline.py @@ -19,9 +19,6 @@ if not os.path.exists(os.path.join(OUTPUT_DIR, sub_dir)): os.makedirs(os.path.join(OUTPUT_DIR, sub_dir)) -GLPKINOUT_PATH = resource_filename("pyproteininference", "../tests/glpkinout/") -SKIP_RUNNING_GLPK = True - LEAD_OUTPUT_FILE = resource_filename( "pyproteininference", "../tests/output/leads/test_peptide_centric_q_value_leads_ml_posterior_error_prob.csv", @@ -61,7 +58,6 @@ def test_workflow_peptide_centric(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, None) self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -149,10 +145,9 @@ def test_workflow_peptide_centric(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) diff --git a/tests/test_009_additive_custom_pipeline.py b/tests/test_009_additive_custom_pipeline.py index 07d0d6b..4b56056 100644 --- a/tests/test_009_additive_custom_pipeline.py +++ b/tests/test_009_additive_custom_pipeline.py @@ -130,10 +130,9 @@ def test_workflow_additive_custom(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=None, skip_running_glpk=None) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) diff --git a/tests/test_010_multiplicative_custom_pipeline.py b/tests/test_010_multiplicative_custom_pipeline.py index 52a7aea..1dfa366 100644 --- a/tests/test_010_multiplicative_custom_pipeline.py +++ b/tests/test_010_multiplicative_custom_pipeline.py @@ -134,10 +134,9 @@ def test_workflow_multiplicative_custom(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=None, skip_running_glpk=None) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) diff --git a/tests/test_011_test_exports.py b/tests/test_011_test_exports.py index b46e998..a5f384e 100644 --- a/tests/test_011_test_exports.py +++ b/tests/test_011_test_exports.py @@ -9,11 +9,9 @@ TEST_DATABASE = resource_filename("pyproteininference", "../tests/data/test_database.fasta") TARGET_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_target.txt") DECOY_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_decoy.txt") -PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_parsimony_glpk.yaml") +PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_parsimony_pulp.yaml") OUTPUT_DIR = tempfile.gettempdir() # OUTPUT_DIR = resource_filename('pyproteininference', '../tests/output/') -GLPKINOUT_PATH = resource_filename("pyproteininference", "../tests/glpkinout/") -SKIP_RUNNING_GLPK = True class TestExportTypes(TestCase): @@ -29,7 +27,6 @@ def test_export_types(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, "glpsol") self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -45,7 +42,7 @@ def test_export_types(self): self.assertEqual(protein_inference_parameters.tag, "test_parsimony") 
self.assertEqual(protein_inference_parameters.grouping_type, "shared_peptides") self.assertEqual(protein_inference_parameters.max_identifiers_peptide_centric, 5) - self.assertEqual(protein_inference_parameters.lp_solver, "glpk") + self.assertEqual(protein_inference_parameters.lp_solver, "pulp") # STEP 2: Start with running an In Silico Digestion # # STEP 2: Start with running an In Silico Digestion # @@ -117,10 +114,9 @@ def test_export_types(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) diff --git a/tests/test_013_test_datastore_methods.py b/tests/test_013_test_datastore_methods.py index 231e54e..870c80c 100644 --- a/tests/test_013_test_datastore_methods.py +++ b/tests/test_013_test_datastore_methods.py @@ -93,7 +93,6 @@ def test_datastore(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) group.infer_proteins() diff --git a/tests/test_017_missing_database_pipeline.py b/tests/test_017_missing_database_pipeline.py index 1e9d1d0..ab4fb59 100644 --- a/tests/test_017_missing_database_pipeline.py +++ b/tests/test_017_missing_database_pipeline.py @@ -217,7 +217,6 @@ def test_missing_database_pipeline(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... 
Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) group.infer_proteins() diff --git a/tests/test_020_parsimony_pipeline_pulp_none_type_params.py b/tests/test_020_parsimony_pipeline_pulp_none_type_params.py index d439f1b..3f82bac 100644 --- a/tests/test_020_parsimony_pipeline_pulp_none_type_params.py +++ b/tests/test_020_parsimony_pipeline_pulp_none_type_params.py @@ -19,9 +19,6 @@ if not os.path.exists(os.path.join(OUTPUT_DIR, sub_dir)): os.makedirs(os.path.join(OUTPUT_DIR, sub_dir)) -GLPKINOUT_PATH = resource_filename("pyproteininference", "../tests/glpkinout/") -SKIP_RUNNING_GLPK = True - LEAD_OUTPUT_FILE_NONE = resource_filename( "pyproteininference", "../tests/output/leads/test_parsimony_no_grouping_q_value_leads_ml_posterior_error_prob.csv", @@ -50,9 +47,9 @@ PEPTIDES_INDEX = 6 -class TestLoadParsimonyGlpkNoneTypeParamsWorkflow(TestCase): +class TestLoadParsimonyPulpNoneTypeParamsWorkflow(TestCase): # @unittest.skip("Skipping Pulp Test, No CBC executable in build env") - def test_workflow_parsimony_glpk_none_type(self): + def test_workflow_parsimony_pulp_none_type(self): # STEP 1: Load parameter file # # STEP 1: Load parameter file # @@ -62,7 +59,6 @@ def test_workflow_parsimony_glpk_none_type(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "q_value") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, None) self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -150,10 +146,9 @@ def test_workflow_parsimony_glpk_none_type(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... 
Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) diff --git a/tests/test_021_parsimony_shared_peptides_assignment.py b/tests/test_021_parsimony_shared_peptides_assignment.py index 926e740..0c002df 100644 --- a/tests/test_021_parsimony_shared_peptides_assignment.py +++ b/tests/test_021_parsimony_shared_peptides_assignment.py @@ -11,15 +11,13 @@ TEST_DATABASE = resource_filename("pyproteininference", "../tests/data/test_database.fasta") TARGET_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_target.txt") DECOY_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_decoy.txt") -PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_parsimony_glpk.yaml") +PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_parsimony_pulp.yaml") OUTPUT_DIR = tempfile.gettempdir() # OUTPUT_DIR = resource_filename('pyproteininference', '../tests/output/') for sub_dir in ["leads", "all", "peptides", "psms", "psm_ids"]: if not os.path.exists(os.path.join(OUTPUT_DIR, sub_dir)): os.makedirs(os.path.join(OUTPUT_DIR, sub_dir)) -GLPKINOUT_PATH = resource_filename("pyproteininference", "../tests/glpkinout/") -SKIP_RUNNING_GLPK = True IDENTIFIER_INDEX = 0 SCORE_INDEX = 1 @@ -28,8 +26,8 @@ PEPTIDES_INDEX = 6 -class TestLoadParsimonyGlpkWorkflowSharedPeptideReassignment(TestCase): - def test_workflow_parsimony_glpk_shared_peptides_best(self): +class TestLoadParsimonyPulpWorkflowSharedPeptideReassignment(TestCase): + def test_workflow_parsimony_pulp_shared_peptides_best(self): # STEP 1: Load parameter 
file # # STEP 1: Load parameter file # @@ -109,10 +107,9 @@ def test_workflow_parsimony_glpk_shared_peptides_best(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters.inference_type - # For parsimony... Run GLPK setup, runner, grouper... if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data, digest=digest) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data, digest=digest) @@ -133,7 +130,7 @@ def test_workflow_parsimony_glpk_shared_peptides_best(self): # Make sure the there are no duplicate psms in our lead proteins self.assertEqual(len(psm_id_list), len(set(psm_id_list))) - def test_workflow_parsimony_glpk_shared_peptides_all(self): + def test_workflow_parsimony_pulp_shared_peptides_all(self): # Do the same analysis but put shared peptides everywhere @@ -215,10 +212,9 @@ def test_workflow_parsimony_glpk_shared_peptides_all(self): # STEP 10: Apply Inference inference_type = protein_inference_parameters_all.inference_type - # For parsimony... Run GLPK setup, runner, grouper... 
if inference_type == pyproteininference.inference.Inference.PARSIMONY: group = pyproteininference.inference.Parsimony(data=data_all, digest=digest_all) - group.infer_proteins(glpkinout_directory=GLPKINOUT_PATH, skip_running_glpk=SKIP_RUNNING_GLPK) + group.infer_proteins() if inference_type == pyproteininference.inference.Inference.INCLUSION: group = pyproteininference.inference.Inclusion(data=data_all, digest=digest_all) diff --git a/tests/test_023_malformed_parameters.py b/tests/test_023_malformed_parameters.py index 8f5eaf6..064a73b 100644 --- a/tests/test_023_malformed_parameters.py +++ b/tests/test_023_malformed_parameters.py @@ -301,7 +301,6 @@ def test_empty_parameters(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "peptides") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, "glpsol") self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) @@ -328,7 +327,6 @@ def test_reading_slim_params(self): self.assertEqual(protein_inference_parameters.digest_type, "trypsin") self.assertEqual(protein_inference_parameters.export, "peptides") self.assertEqual(protein_inference_parameters.fdr, 0.01) - self.assertEqual(protein_inference_parameters.glpk_path, "glpsol") self.assertEqual(protein_inference_parameters.missed_cleavages, 3) self.assertEqual(protein_inference_parameters.picker, True) self.assertEqual(protein_inference_parameters.restrict_pep, 0.9) diff --git a/tests/test_025_heuristic_pipeline.py b/tests/test_025_heuristic_pipeline.py index 994fcfe..9c5d1b9 100644 --- a/tests/test_025_heuristic_pipeline.py +++ b/tests/test_025_heuristic_pipeline.py @@ -9,8 +9,9 @@ TEST_DATABASE = resource_filename("pyproteininference", "../tests/data/test_database.fasta") TARGET_FILE = 
resource_filename("pyproteininference", "../tests/data/test_perc_data_target.txt") DECOY_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_decoy.txt") -PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_parsimony_pulp.yaml") +PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_heuristic.yaml") OUTPUT_DIR = tempfile.gettempdir() +PDF_FILENAME = os.path.join(OUTPUT_DIR, "test_pyproteininference.pdf") # OUTPUT_DIR = resource_filename('pyproteininference', '../tests/output/') for sub_dir in ["leads", "all", "peptides", "psms", "psm_ids"]: if not os.path.exists(os.path.join(OUTPUT_DIR, sub_dir)): @@ -31,21 +32,31 @@ def test_workflow_heuristic_with_params(self): combined_directory=None, output_directory=OUTPUT_DIR, id_splitting=True, + pdf_filename=PDF_FILENAME, ) - hp.execute(skip_plot=True) + hp.execute() - self.assertEqual(hp.selected_method, "parsimony") + self.assertListEqual(hp.selected_methods, ["inclusion"]) - self.assertIsInstance(hp.selected_datastore, pyproteininference.datastore.DataStore) + self.assertIsInstance(hp.selected_datastores["inclusion"], pyproteininference.datastore.DataStore) - result1 = hp.determine_optimal_inference_method(empirical_threshold=0.5) - - self.assertEqual(result1, "exclusion") + result1 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.05, + upper_empirical_threshold=2, + lower_empirical_threshold=1, + pdf_filename=PDF_FILENAME, + ) - result2 = hp.determine_optimal_inference_method(empirical_threshold=1.2) + self.assertListEqual(result1, ["parsimony"]) - self.assertEqual(result2, "exclusion") + result2 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.1, + upper_empirical_threshold=0.5, + lower_empirical_threshold=0.25, + pdf_filename=PDF_FILENAME, + ) + self.assertListEqual(result2, ["inclusion"]) def test_workflow_heuristic_without_params(self): @@ -59,18 +70,119 @@ def 
test_workflow_heuristic_without_params(self): combined_directory=None, output_directory=OUTPUT_DIR, id_splitting=True, + pdf_filename=PDF_FILENAME, ) - hp.execute(skip_plot=True) + hp.execute() + + self.assertListEqual(hp.selected_methods, ["inclusion"]) + + self.assertIsInstance(hp.selected_datastores["inclusion"], pyproteininference.datastore.DataStore) + + result1 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.05, + upper_empirical_threshold=2, + lower_empirical_threshold=1, + pdf_filename=PDF_FILENAME, + ) + + self.assertListEqual(result1, ["parsimony"]) + + result2 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.1, + upper_empirical_threshold=0.5, + lower_empirical_threshold=0.25, + pdf_filename=PDF_FILENAME, + ) + + self.assertListEqual(result2, ["inclusion"]) + + def test_workflow_heuristic_optimal_export(self): + + hp = pyproteininference.heuristic.HeuristicPipeline( + parameter_file=PARAMETER_FILE, + database_file=TEST_DATABASE, + target_files=TARGET_FILE, + decoy_files=DECOY_FILE, + combined_files=None, + target_directory=None, + decoy_directory=None, + combined_directory=None, + output_directory=OUTPUT_DIR, + id_splitting=True, + pdf_filename=PDF_FILENAME, + output_type="optimal", + ) - self.assertEqual(hp.selected_method, "parsimony") + hp.execute() - self.assertIsInstance(hp.selected_datastore, pyproteininference.datastore.DataStore) + self.assertListEqual(hp.selected_methods, ["inclusion"]) - result1 = hp.determine_optimal_inference_method(empirical_threshold=0.5) + self.assertIsInstance(hp.selected_datastores["inclusion"], pyproteininference.datastore.DataStore) - self.assertEqual(result1, "exclusion") + result1 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.05, + upper_empirical_threshold=2, + lower_empirical_threshold=1, + pdf_filename=PDF_FILENAME, + ) - result2 = hp.determine_optimal_inference_method(empirical_threshold=1.2) + self.assertListEqual(result1, 
["parsimony"]) + + result3 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.1, + upper_empirical_threshold=0.5, + lower_empirical_threshold=0.25, + pdf_filename=PDF_FILENAME, + ) + self.assertListEqual(result3, ["inclusion"]) + + def test_workflow_heuristic_with_different_thresholds(self): + + hp = pyproteininference.heuristic.HeuristicPipeline( + parameter_file=PARAMETER_FILE, + database_file=TEST_DATABASE, + target_files=TARGET_FILE, + decoy_files=DECOY_FILE, + combined_files=None, + target_directory=None, + decoy_directory=None, + combined_directory=None, + output_directory=OUTPUT_DIR, + id_splitting=True, + pdf_filename=PDF_FILENAME, + ) + + hp.execute() + + # Both inclusion/exclusion passing high upper threshold but neither peptide-centric or + # parsimony passing lower threshold + result1 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.05, + upper_empirical_threshold=10, + lower_empirical_threshold=0.1, + pdf_filename=PDF_FILENAME, + ) + + self.assertListEqual(result1, ["inclusion", "exclusion"]) + + # Neither inclusion/exclusion passing low upper threshold but peptide-centric and + # parsimony both passing high lower threshold + result2 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.1, + upper_empirical_threshold=2, + lower_empirical_threshold=2, + pdf_filename=PDF_FILENAME, + ) + + self.assertListEqual(result2, ["parsimony", "peptide_centric"]) + + # No methods passing thresholds + result3 = hp.determine_optimal_inference_method( + false_discovery_rate_threshold=0.1, + upper_empirical_threshold=0.00001, + lower_empirical_threshold=0.000001, + pdf_filename=PDF_FILENAME, + ) - self.assertEqual(result2, "exclusion") + self.assertListEqual(result3, ["inclusion"]) diff --git a/tests/test_027_reading_alt_proteins_from_input.py b/tests/test_027_reading_alt_proteins_from_input.py index 54c6982..84ae237 100644 --- a/tests/test_027_reading_alt_proteins_from_input.py +++ 
b/tests/test_027_reading_alt_proteins_from_input.py @@ -9,6 +9,7 @@ TARGET_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_target.txt") DECOY_FILE = resource_filename("pyproteininference", "../tests/data/test_perc_data_decoy.txt") PARAMETER_FILE = resource_filename("pyproteininference", "../tests/data/test_params_inclusion.yaml") +COMBINED_FILE = resource_filename("pyproteininference", "../tests/data/test_data_many_alternative_proteins.txt") class TestAltProteinFomInput(TestCase): @@ -77,3 +78,173 @@ def test_read_alt_proteins(self): self.assertSetEqual(set(reader.psms[i].possible_proteins), set(expected_result[i])) self.assertEqual(len(reader.psms), 27) + + def test_read_alt_proteins_over_maximum(self): + + protein_inference_parameters = ProteinInferenceParameter(yaml_param_filepath=PARAMETER_FILE) + + digest = in_silico_digest.PyteomicsDigest( + database_path=None, + digest_type=protein_inference_parameters.digest_type, + missed_cleavages=protein_inference_parameters.missed_cleavages, + reviewed_identifier_symbol=protein_inference_parameters.reviewed_identifier_symbol, + max_peptide_length=protein_inference_parameters.restrict_peptide_length, + id_splitting=True, + ) + + # Try with generic reader + + reader = pyproteininference.reader.GenericReader( + combined_files=COMBINED_FILE, + parameter_file_object=protein_inference_parameters, + digest=digest, + append_alt_from_db=True, + ) + reader.read_psms() + + self.assertEqual(reader.MAX_ALLOWED_ALTERNATIVE_PROTEINS, 50) + + protein_list1 = [ + 'Protein1', + 'Protein10', + 'Protein11', + 'Protein12', + 'Protein13', + 'Protein14', + 'Protein15', + 'Protein16', + 'Protein17', + 'Protein18', + 'Protein19', + 'Protein2', + 'Protein20', + 'Protein21', + 'Protein22', + 'Protein23', + 'Protein24', + 'Protein25', + 'Protein26', + 'Protein27', + 'Protein28', + 'Protein29', + 'Protein3', + 'Protein30', + 'Protein31', + 'Protein32', + 'Protein33', + 'Protein34', + 'Protein35', + 'Protein36', + 
'Protein37', + 'Protein38', + 'Protein39', + 'Protein4', + 'Protein40', + 'Protein41', + 'Protein42', + 'Protein43', + 'Protein44', + 'Protein45', + 'Protein46', + 'Protein47', + 'Protein48', + 'Protein49', + 'Protein5', + 'Protein50', + 'Protein51', + 'Protein52', + 'Protein53', + 'Protein54', + ] + + protein_list2 = [ + '##Protein1', + '##Protein10', + '##Protein11', + '##Protein12', + '##Protein13', + '##Protein14', + '##Protein15', + '##Protein16', + '##Protein17', + '##Protein18', + '##Protein19', + '##Protein2', + '##Protein20', + '##Protein21', + '##Protein22', + '##Protein23', + '##Protein24', + '##Protein25', + '##Protein26', + '##Protein27', + '##Protein28', + '##Protein29', + '##Protein3', + '##Protein30', + '##Protein31', + '##Protein32', + '##Protein33', + '##Protein34', + '##Protein35', + '##Protein36', + '##Protein37', + '##Protein38', + '##Protein39', + '##Protein4', + '##Protein40', + '##Protein41', + '##Protein42', + '##Protein43', + '##Protein44', + '##Protein45', + '##Protein46', + '##Protein47', + '##Protein48', + '##Protein49', + '##Protein5', + '##Protein50', + '##Protein51', + '##Protein52', + '##Protein53', + '##Protein54', + ] + + self.assertListEqual( + reader.psms[0].possible_proteins, + protein_list1, + ) + self.assertListEqual( + reader.psms[1].possible_proteins, + protein_list2, + ) + + percdigest = in_silico_digest.PyteomicsDigest( + database_path=None, + digest_type=protein_inference_parameters.digest_type, + missed_cleavages=protein_inference_parameters.missed_cleavages, + reviewed_identifier_symbol=protein_inference_parameters.reviewed_identifier_symbol, + max_peptide_length=protein_inference_parameters.restrict_peptide_length, + id_splitting=True, + ) + + # Try with percolator reader + percreader = pyproteininference.reader.PercolatorReader( + combined_files=COMBINED_FILE, + parameter_file_object=protein_inference_parameters, + digest=percdigest, + append_alt_from_db=True, + ) + + percreader.read_psms() + + 
self.assertEqual(percreader.MAX_ALLOWED_ALTERNATIVE_PROTEINS, 50) + + self.assertListEqual( + percreader.psms[0].possible_proteins, + protein_list1, + ) + self.assertListEqual( + percreader.psms[1].possible_proteins, + protein_list2, + ) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..14e64ce --- /dev/null +++ b/tox.ini @@ -0,0 +1,63 @@ +# content of: tox.ini , put in same dir as setup.py +[tox] +envlist = format,lint,pytest +# envlist = format,flake8,pytest # Use this when COMPPROT-2993 is solved. It should be used by devs before commit. + +[testenv] +basepython = python3 +deps = -rrequirements.txt +envdir = {toxworkdir}/run +skip_install = true + +[testenv:black] +envdir = {toxworkdir}/black +deps = black==22.3.0 +commands = + {envpython} -m black {posargs} + +[testenv:flake8] +envdir = {toxworkdir}/flake +deps = flake8==3.9.2 +commands = + {envpython} -m flake8 {posargs} + +[testenv:pytest] +pip_pre = true +skip_install = false +envdir = {toxworkdir}/test +deps = + {[testenv]deps} + pytest==6.2.5 + pytest-cov==2.8.1 + pytest-custom-exit-code==0.3.0 +extras = tests +commands = + {envpython} -m pytest -s --junitxml=test_result_junit.xml --cov=pyproteininference --cov-report=term --cov-report=xml --cov-branch --suppress-no-test-exit-code {posargs} + +[testenv:format] +envdir = {[testenv:black]envdir} +deps = {[testenv:black]deps} +commands = {[testenv:black]commands} . 
--exclude=/(\.direnv|\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|venv|\.svn|_build|buck-out|build|dist|__pypackages__)/ --check + +[testenv:lint] +envdir = {[testenv:flake8]envdir} +deps = {[testenv:flake8]deps} +commands = {[testenv:flake8]commands} --format=pylint --exit-zero --output-file=flake8.out --tee + +[testenv:test] +pip_pre = true +skip_install = false +envdir = {[testenv:pytest]envdir} +deps = {[testenv:pytest]deps} +extras = {[testenv:pytest]extras} +commands = {[testenv:pytest]commands} tests/ + +[testenv:build-dist] +skipsdist = True +skip_install = True +deps = + -U + setuptools>=42 + wheel + setuptools_scm[toml]>=3.4 +commands = python setup.py sdist \ No newline at end of file