Skip to content

Commit

Permalink
fix(pipeline.py): correct grouped Jaccard calculation
Browse files Browse the repository at this point in the history
The pipeline now calculates the Jaccard scores for all pairings, similar to the ENFC calculations. One file per comparison is output, and these are saved at outputs/jaccard.

BREAKING CHANGE: Previously, the pipeline only calculated the jaccard scores for a single, hard-coded pairing. This has been corrected. All users should update to continue to get the correct results.
  • Loading branch information
rbpatt2019 committed Feb 14, 2022
1 parent bfadda5 commit c2c100c
Showing 1 changed file with 71 additions and 63 deletions.
134 changes: 71 additions & 63 deletions lta/helpers/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Tuple
from typing import Dict

import pandas as pd

Expand All @@ -29,6 +29,8 @@ class Pipeline:
As Python is 0-indexed, passing ``11`` will read in rows ``0-10``.
level : str
Metadata location of experimental conditions.
control : str
Value within self.level that represents the control condition.
compartment : str
Metadata location of sample tissue compartment.
mode : str
Expand All @@ -45,6 +47,7 @@ class Pipeline:
output: Path
n_rows_metadata: int
level: str
control: str
compartment: str
mode: str
sample_id: str
Expand Down Expand Up @@ -117,36 +120,44 @@ def __post_init__(self) -> None:
for group, df in data.groupby(axis="columns", level=self.mode)
}
Path(self.output, "enfc").mkdir(exist_ok=True, parents=True)
Path(self.output, "jaccard").mkdir(exist_ok=True, parents=True)

def _calculate_enfc(self, order: Tuple[str, str] = None) -> Dict[str, pd.DataFrame]:
conditions = [
df.columns.get_level_values(self.level).unique()
for df in self.filtered.values()
]
self.conditions = [
val for mode in conditions for val in mode if val != self.control
]

def _calculate_enfc(self) -> Dict[str, Dict[str, pd.DataFrame]]:
"""Calculate error-normalised fold change.
Calculates the ENFC for each compartment across modes.
For fold change to be meaningful,
order must be specified.
This will report fold-change as
``order[0] / order[1]``.
Parameters
----------
order : Tuple[str, str]
The experimental group labels.
logfc fill be ``order[0] / order[1]``.
``condition / self.control`` for all conditions
except the control within ``self.level``.
Returns
-------
Dict[str, pd.DataFrame]
Key is mode, value is the ENFC data
Dict[str, Dict[str, pd.DataFrame]]
Top level key is the experimental condition,
mapped to a dictionary of modes and ENFC results
"""
logger.info("Calculating ENFC...")
enfc = {
mode: df.groupby(axis="columns", level=self.compartment).agg(
dh.enfc,
axis="columns",
level=self.level,
order=order,
)
for mode, df in self.filtered.items()
group: {
mode: df.groupby(axis="columns", level=self.compartment).agg(
dh.enfc,
axis="columns",
level=self.level,
order=(group, self.control),
)
for mode, df in self.filtered.items()
}
for group in self.conditions
}
return enfc

Expand Down Expand Up @@ -324,7 +335,7 @@ def _get_n_lipids(self, n: int) -> Dict[str, pd.DataFrame]:

def _jaccard(
self, data: Dict[str, pd.DataFrame], group: str
) -> Dict[str, pd.DataFrame]:
) -> Dict[str, Dict[str, pd.DataFrame]]:
"""Calculate jaccard similarity and p-values.
This takes a dictionary of data.
Expand All @@ -345,20 +356,25 @@ def _jaccard(
Returns
-------
Dict[str, pd.DataFrame]
Dict[str, Dict[str, pd.DataFrame]]
Top-level keys are the experimental conditions; inner keys are the compartment group and mode.
Values are the table of Jaccard similarity and p-values.
"""
logger.info(f"Calculating Jaccard similarity for {group}...")
jaccard = {
mode: lipids.groupby(axis="index", level="Category").apply(
lambda x: jac.bootstrap(x.iloc[:, 0], x.iloc[:, 1], self.n)
)
for mode, lipids in data.items()
group: {
mode: lipids.groupby(axis="index", level="Category").apply(
lambda x: jac.bootstrap(
x.loc[:, group], x.loc[:, self.control], self.n
)
)
for mode, lipids in data.items()
}
for group in self.conditions
}
return jaccard

def run(self, control: str) -> None:
def run(self) -> None:
"""Run the full LTA pipeline.
This:
Expand All @@ -369,38 +385,25 @@ def run(self, control: str) -> None:
#. Finds B-lipids (both picky and consistent) and Jaccard distances.
#. Finds N2-lipids and Jaccard distances.
#. Writes combined results.
Parameters
----------
control : str
The control group for fold change.
logfc fill be each group divided by ``control``.
"""
conditions = [
df.columns.get_level_values(self.level).unique()
for df in self.filtered.values()
]
conditions = [val for mode in conditions for val in mode]
conditions = [val for val in conditions if val != control]

for group in conditions:
self.enfc = self._calculate_enfc((group, control))
logger.debug("Generating ENFC summary files...")
enfc = pd.concat(self.enfc, axis="columns")
enfc.to_csv(
self.output / "enfc" / f"{group}_by_{control}_individual_lipids.csv"
logger.debug("Generating ENFC summary files...")
self.enfcs = self._calculate_enfc()
for group, data in self.enfcs.items():
pd.concat(data, axis="columns").to_csv(
self.output
/ "enfc"
/ f"{group}_by_{self.control}_individual_lipids.csv"
)

logger.debug("Generating class ENFC summary files...")
self.filtered = {
mode: df.groupby(axis="index", level="Category").sum()
for mode, df in self.filtered.items()
}
for group in conditions:
self.enfc = self._calculate_enfc((group, control))
logger.debug("Generating class ENFC summary files...")
enfc = pd.concat(self.enfc, axis="columns")
enfc.to_csv(
self.output / "enfc" / f"{group}_by_{control}_lipid_classes.csv"
self.enfcs = self._calculate_enfc()
for group, data in self.enfcs.items():
pd.concat(data, axis="columns").to_csv(
self.output / "enfc" / f"{group}_by_{self.control}_lipid_classes.csv"
)

self.a_lipids = self._get_a_lipids()
Expand Down Expand Up @@ -435,16 +438,21 @@ def run(self, control: str) -> None:
self.output / "switch_lipid_classes.csv"
)

logger.debug("Generating Jaccard distanse summary files...")
jaccard = pd.concat(
{
**self.a_jaccard,
**self.bc_jaccard,
**self.bp_jaccard,
**self.n2_jaccard,
**self.u_jaccard,
},
axis="columns",
)
jaccard.columns.names = ["type_compartment_mode", "Metrics"]
jaccard.to_csv(self.output / "jaccard_similarity.csv")
logger.debug("Generating Jaccard distance summary files...")
for group in self.conditions:
jaccard = pd.concat(
{
**self.a_jaccard[group],
**self.bc_jaccard[group],
**self.bp_jaccard[group],
**self.n2_jaccard[group],
**self.u_jaccard[group],
},
axis="columns",
)
jaccard.columns.names = ["type_compartment_mode", "Metrics"]
jaccard.to_csv(
self.output
/ "jaccard"
/ f"{group}_to_{self.control}_jaccard_similarity.csv"
)

0 comments on commit c2c100c

Please sign in to comment.