Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
akotlar committed Oct 10, 2024
1 parent e1bcdb6 commit b490501
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 41 deletions.
5 changes: 1 addition & 4 deletions python/python/bystro/proteomics/annotation_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,8 +951,6 @@ async def async_run_annotation_query(
query (dict[str, Any]): The OpenSearch query.
index_name (str): The name of the index.
fields (list[str] | None): Additional fields to include in the DataFrame, defaults to None.
cluster_opensearch_config (dict[str, Any] | None):
Configuration for the OpenSearch cluster,defaults to None.
bystro_api_auth (CachedAuth | None): Bystro API authentication, defaults to None.
additional_client_args (dict[str, Any] | None):
Additional arguments for the OpenSearch client, defaults to None.
Expand Down Expand Up @@ -1087,8 +1085,8 @@ async def async_get_annotation_result_from_query(
Args:
query_string (str): The query string to use for the search.
index_name (str): The name of the index to search.
bystro_api_auth (CachedAuth): The authentication for the Bystro API.
fields (list[str] | None): The fields to include in the results, defaults to None.
bystro_api_auth (CachedAuth | None): The authentication for the Bystro API, defaults to None.
additional_client_args (dict[str, Any] | None):
Additional arguments for the OpenSearch client, defaults to None.
structs_of_arrays (bool): Whether to return structs of arrays, defaults to True.
Expand Down Expand Up @@ -1189,7 +1187,6 @@ def get_annotation_result_from_query(
query_string,
index_name,
fields=fields,
cluster_opensearch_config=cluster_opensearch_config,
bystro_api_auth=bystro_api_auth,
additional_client_args=additional_client_args,
structs_of_arrays=structs_of_arrays,
Expand Down
123 changes: 86 additions & 37 deletions python/python/bystro/proteomics/tests/test_annotation_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import msgspec
import pytest

from bystro.api.auth import CachedAuth

from bystro.proteomics.annotation_interface import (
DOSAGE_GENERATED_COLUMN,
process_query_response,
Expand Down Expand Up @@ -104,6 +106,45 @@ async def close(self) -> None:
return


@pytest.mark.asyncio
async def test_legacy_get_annotation_results_from_query(mocker):
mocker.patch(
"bystro.proteomics.annotation_interface.AsyncOpenSearch",
return_value=MockAsyncOpenSearchLegacy(),
)
# inputRef doesn't exist in the legacy datasets, pre Q1-2024
mocker.patch("bystro.proteomics.annotation_interface.INPUT_REF_FIELD", "ref")
mocker.patch(
"bystro.proteomics.annotation_interface.ALWAYS_INCLUDED_FIELDS",
[
"chrom",
"pos",
"vcfPos",
"ref",
"alt",
"type",
"id",
],
)

query_string = "exonic (gnomad.genomes.af:<0.1 || gnomad.exomes.af:<0.1)"
index_name = "mock_index_name"

samples_and_genes_df = await async_get_annotation_result_from_query(
query_string,
index_name,
cluster_opensearch_config={
"connection": {
"nodes": ["http://localhost:9200"],
"request_timeout": 1200,
"use_ssl": False,
"verify_certs": False,
},
},
)
assert (1405, 52) == samples_and_genes_df.shape


@pytest.mark.asyncio
async def test_get_annotation_results_from_query_with_samples(mocker):
mocker.patch(
Expand Down Expand Up @@ -289,60 +330,68 @@ def test_process_response():
)


import asyncio

@pytest.mark.asyncio
async def test_join_annotation_result_to_fragpipe_dataset(mocker):
# Create a mock OpenSearch client
mock_opensearch_client = mocker.Mock()

# Wrap return values of async functions in asyncio.Future
mock_opensearch_client.create_point_in_time.return_value = asyncio.Future()
mock_opensearch_client.create_point_in_time.return_value.set_result({"pit_id": "mock_pit_id"})

mock_opensearch_client.close.return_value = asyncio.Future()
mock_opensearch_client.close.return_value.set_result(None)

mock_opensearch_client.delete_point_in_time.return_value = asyncio.Future()
mock_opensearch_client.delete_point_in_time.return_value.set_result(None)

# Wrap the search method's return value in a future to simulate async behavior
mock_opensearch_client.search.return_value = asyncio.Future()
mock_opensearch_client.search.return_value.set_result({"hits": {"hits": TEST_RESPONSES_WITH_SAMPLES}})

# Ensure indices.exists returns a future
mock_opensearch_client.indices.exists.return_value = asyncio.Future()
mock_opensearch_client.indices.exists.return_value.set_result(True)

# Patch the creation of the AsyncOpenSearch client to use the mock instead
mocker.patch("bystro.proteomics.annotation_interface.AsyncOpenSearch", return_value=mock_opensearch_client)

# Create and patch a mock CachedAuth instance
mock_bystro_api_auth = CachedAuth(email="test_user@gmail.com", access_token="123456", url="http://mockserver:9200")
mocker.patch("bystro.proteomics.annotation_interface.CachedAuth", return_value=mock_bystro_api_auth)

# Patch the function that creates the client to ensure it returns the mock client
mocker.patch(
"bystro.proteomics.annotation_interface.AsyncOpenSearch",
return_value=MockAsyncOpenSearch(TEST_RESPONSES_WITH_SAMPLES),
"bystro.proteomics.annotation_interface.get_async_proxied_opensearch_client",
return_value=mock_opensearch_client
)

# Mock the async_get_num_slices function to return a fixed number of slices
mocker.patch("bystro.proteomics.annotation_interface.async_get_num_slices", return_value=asyncio.Future())
mocker.patch("bystro.proteomics.annotation_interface.async_get_num_slices").return_value.set_result((1, None))

query_string = "exonic (gnomad.genomes.af:<0.1 || gnomad.exomes.af:<0.1)"
index_name = "foo"

# Run the test with the mocked client and auth
query_result_df = await async_get_annotation_result_from_query(
query_string,
index_name,
cluster_opensearch_config={
"connection": {
"nodes": ["http://localhost:9200"],
"request_timeout": 1200,
"use_ssl": False,
"verify_certs": False,
},
},
bystro_api_auth=mock_bystro_api_auth,
explode_field="refSeq.name2",
)

# Assert the shape of the returned dataframe
assert (582, 12) == query_result_df.shape

sample_ids = query_result_df[SAMPLE_GENERATED_COLUMN].unique()

abundance_file = str(Path(__file__).parent / "example_abundance_gene_MD.tsv")
experiment_file = str(Path(__file__).parent / "example_experiment_annotation_file.tsv")
tmt_dataset = load_tandem_mass_tag_dataset(abundance_file, experiment_file)

sample_names = list(tmt_dataset.annotation_df.index)[0 : sample_ids.shape[0]]

# replace the sample ids with the sample names
replacements = {sample_id: sample_name for sample_id, sample_name in zip(sample_ids, sample_names)}
query_result_df[SAMPLE_GENERATED_COLUMN] = query_result_df[SAMPLE_GENERATED_COLUMN].replace(
replacements
)

joined_df = join_annotation_result_to_proteomic_dataset(
query_result_df, tmt_dataset, proteomic_join_column=FRAGPIPE_GENE_GENE_NAME_COLUMN_RENAMED
)

assert (90, 17) == joined_df.shape

retained_fragpipe_columns = []
for name in FRAGPIPE_RENAMED_COLUMNS:
if name in [FRAGPIPE_SAMPLE_COLUMN, FRAGPIPE_GENE_GENE_NAME_COLUMN_RENAMED]:
continue
retained_fragpipe_columns.append(name)
# Further assertions to verify that the mock client was used and no real connections were made
mock_opensearch_client.search.assert_called_once()
mock_opensearch_client.create_point_in_time.assert_called_once()
mock_opensearch_client.delete_point_in_time.assert_called_once()
mock_opensearch_client.close.assert_called_once()

retained_fragpipe_columns.append(FRAGPIPE_SAMPLE_INTENSITY_COLUMN)
assert list(joined_df.columns) == list(query_result_df.columns) + retained_fragpipe_columns


@pytest.mark.asyncio
Expand Down

0 comments on commit b490501

Please sign in to comment.