Skip to content

Commit

Permalink
Ruff formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
aeverall committed Dec 31, 2024
1 parent 0eccd77 commit f651f17
Show file tree
Hide file tree
Showing 28 changed files with 9,264 additions and 4,453 deletions.
80 changes: 65 additions & 15 deletions src/signatures/associations/genotype/samples.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,92 @@
# Import standard modules
import sys, numpy as np, pandas as pd
from signatures.associations.sampleCuration.pan_cancer_sample import getSamples
import sys

import numpy as np
import pandas as pd

if __name__=='__main__':
from signatures.associations.sampleCuration.pan_cancer_sample import getSamples

if __name__ == "__main__":
# filename to save covariates into
filename_output = sys.argv[1]

# GET DATA - include hypermutated samples
print("Load samples")
sample_df = getSamples(PC=True, signatures=True, primary=True, ethnicity=False, hyper=True, ER=False, MSI=False)
sample_df = getSamples(
PC=True,
signatures=True,
primary=True,
ethnicity=False,
hyper=True,
ER=False,
MSI=False,
)

# Remove samples where sex doesn't match organ
sample_df = sample_df[ ( ~(sample_df.tumour_tissue.map(lambda x: x in ["Breast","Ovary","Uterus"])&(sample_df.is_female==-1)) ) &\
( ~(sample_df.tumour_tissue.map(lambda x: x in ["Testis", "Prostate"])&(sample_df.is_female==1)) ) ]
sample_df = sample_df[
(
~(
sample_df.tumour_tissue.map(
lambda x: x in ["Breast", "Ovary", "Uterus"]
)
& (sample_df.is_female == -1)
)
)
& (
~(
sample_df.tumour_tissue.map(lambda x: x in ["Testis", "Prostate"])
& (sample_df.is_female == 1)
)
)
]
print("Removed male samples for Breast,Ovary,Uterus and female for Testis,Prostate")
print(f"Number of samples: {len(sample_df)}")

# Confounding variables
variables = ['log_age', 'is_female'] + [f"pc{i}" for i in range(1,4)] # logit_purity',
variables = ["log_age", "is_female"] + [
f"pc{i}" for i in range(1, 4)
] # logit_purity',

# Construct X matrix
print("Construct X")
X = pd.DataFrame({variable:sample_df[variable] for variable in variables+['tumour_tissue', 'tumour_group',
'participant_id', 'tumour_sample_platekey', 'germline_sample_platekey']})
X = pd.DataFrame(
{
variable: sample_df[variable]
for variable in variables
+ [
"tumour_tissue",
"tumour_group",
"participant_id",
"tumour_sample_platekey",
"germline_sample_platekey",
]
}
)

# Get non-nan values
[print(key, np.sum(~np.isnan(X[key]))) for key in X.columns if X[key].dtype in [float, int]]
cuts = np.prod(np.array([~np.isnan(X[key]) for key in X.columns if X[key].dtype in [float, int]]),
axis=0).astype(bool)
[
print(key, np.sum(~np.isnan(X[key])))
for key in X.columns
if X[key].dtype in [float, int]
]
cuts = np.prod(
np.array(
[~np.isnan(X[key]) for key in X.columns if X[key].dtype in [float, int]]
),
axis=0,
).astype(bool)
print(f"Non-nan: {np.sum(cuts)}")

# Apply cuts
X = X[cuts]

# Save
X['sample_id'] = X.participant_id.astype(str)+"_"+X.tumour_sample_platekey+"_"+X.germline_sample_platekey
X.rename(columns={"tumour_tissue":"group"}, inplace=True)
X["sample_id"] = (
X.participant_id.astype(str)
+ "_"
+ X.tumour_sample_platekey
+ "_"
+ X.germline_sample_platekey
)
X.rename(columns={"tumour_tissue": "group"}, inplace=True)
print(X.columns)
X[["sample_id", "group"] + variables].to_csv(filename_output, index=False, sep="\t")
198 changes: 152 additions & 46 deletions src/signatures/associations/genotype/samples_and_targets.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,204 @@
# Import standard modules
import sys, os, numpy as np, pandas as pd
import os
import re
from signatures.associations.sampleCuration.pan_cancer_sample import getSamples
from dotenv import load_dotenv
import sys
import warnings

warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
DATA_DIR = os.getenv('DATA_DIR')
from signatures.associations.sampleCuration.pan_cancer_sample import getSamples

warnings.filterwarnings("ignore")

if __name__=="__main__":
load_dotenv()
DATA_DIR = os.getenv("DATA_DIR")

if __name__ == "__main__":
samples_file, targets_file = sys.argv[1:]

# Get sample dataframe
print("Load samples")
sample_df = getSamples(hyper=True)
sample_df['sample_id'] = sample_df.participant_id.astype(str)+"_"+sample_df.tumour_sample_platekey+"_"+sample_df.germline_sample_platekey
sample_df["sample_id"] = (
sample_df.participant_id.astype(str)
+ "_"
+ sample_df.tumour_sample_platekey
+ "_"
+ sample_df.germline_sample_platekey
)

# Remove samples where sex doesn't match organ
sample_df = sample_df[ ( ~(sample_df.tumour_tissue.map(lambda x: x in ["Ovary","Uterus"])&(sample_df.is_female==-1)) ) &\
( ~(sample_df.tumour_tissue.map(lambda x: x in ["Testis", "Prostate"])&(sample_df.is_female==1)) ) ]
sample_df = sample_df[
(
~(
sample_df.tumour_tissue.map(lambda x: x in ["Ovary", "Uterus"])
& (sample_df.is_female == -1)
)
)
& (
~(
sample_df.tumour_tissue.map(lambda x: x in ["Testis", "Prostate"])
& (sample_df.is_female == 1)
)
)
]
print("Removed male samples for Ovary, Uterus and female for Testis, Prostate")
print(f"Number of samples: {len(sample_df)}")

# Get genotype grid
genotype_df = pd.read_csv(f"{DATA_DIR}/pancancer_mutationTimeR_updated_CNACalls_2023_09_16.tsv",
usecols=['participant_id', 'tumour_sample_platekey', 'germline_sample_platekey', 'is_WGD'], sep="\t") # 'ploidy',
genotype_df = pd.read_csv(
f"{DATA_DIR}/pancancer_mutationTimeR_updated_CNACalls_2023_09_16.tsv",
usecols=[
"participant_id",
"tumour_sample_platekey",
"germline_sample_platekey",
"is_WGD",
],
sep="\t",
) # 'ploidy',
# genotype_df['ploidy'] = (genotype_df.ploidy>2.5).astype(int)
genotype_df['is_WGD'] = genotype_df.is_WGD.astype(int)
genotype_df['sample_id'] = genotype_df.participant_id.astype(str)+"_"+genotype_df.tumour_sample_platekey+"_"+genotype_df.germline_sample_platekey
genotype_df = genotype_df.drop(np.intersect1d(genotype_df.keys(),
['participant_id', 'tumour_sample_platekey', 'germline_sample_platekey',
'day_sampling', 'sample_platekey', 'tumour_pseudo_id']), axis=1)
genotype_df["is_WGD"] = genotype_df.is_WGD.astype(int)
genotype_df["sample_id"] = (
genotype_df.participant_id.astype(str)
+ "_"
+ genotype_df.tumour_sample_platekey
+ "_"
+ genotype_df.germline_sample_platekey
)
genotype_df = genotype_df.drop(
np.intersect1d(
genotype_df.keys(),
[
"participant_id",
"tumour_sample_platekey",
"germline_sample_platekey",
"day_sampling",
"sample_platekey",
"tumour_pseudo_id",
],
),
axis=1,
)

# Get the number of chromothripsis and chromoplexy mutations
sv_dir = "/re_gecip/cancer_pan/rhoulston/analysisResults/SVClassification/"
sv_counts = pd.DataFrame()
for cohort in os.listdir(sv_dir):
if cohort!='README.md':
sv_counts = pd.concat((sv_counts,
pd.read_csv(f"{sv_dir}/{cohort}/summaryLevel/sampleClusterClassCounts.tsv", sep="\t")))
sv_counts['sample_id'] = sv_counts.participant_id.astype(str)+"_"+sv_counts.tumour_sample_platekey+"_"+sv_counts.germline_sample_platekey
if cohort != "README.md":
sv_counts = pd.concat(
(
sv_counts,
pd.read_csv(
f"{sv_dir}/{cohort}/summaryLevel/sampleClusterClassCounts.tsv",
sep="\t",
),
)
)
sv_counts["sample_id"] = (
sv_counts.participant_id.astype(str)
+ "_"
+ sv_counts.tumour_sample_platekey
+ "_"
+ sv_counts.germline_sample_platekey
)

# sv_counts = sv_counts[~pd.isnull(sv_counts['n_chromothripsis'])]
# cluster_types = ['n_chromoplexy', 'n_chromothripsis', 'n_tandem_duplication']
cluster_types = ['n_tandem_duplication']
cluster_types = ["n_tandem_duplication"]

for cluster_type in cluster_types:
sv_counts[re.sub("^n_", "is_", cluster_type)] = (sv_counts[cluster_type]>0).astype(int)
genotype_df = pd.merge(genotype_df,
sv_counts[['sample_id']+cluster_types+[re.sub("^n_", "is_", cluster_type) for cluster_type in cluster_types]],
on='sample_id', how='inner')
sv_counts[re.sub("^n_", "is_", cluster_type)] = (
sv_counts[cluster_type] > 0
).astype(int)
genotype_df = pd.merge(
genotype_df,
sv_counts[
["sample_id"]
+ cluster_types
+ [re.sub("^n_", "is_", cluster_type) for cluster_type in cluster_types]
],
on="sample_id",
how="inner",
)

# Get the number of Kataegis events
ktg_dir = f"{DATA_DIR}/kataegis_analysis/results/SigProfilerClusters_persample/summary/"
ktg_counts = pd.read_csv(f"{ktg_dir}/All_10983_clustered_events_table.tsv", sep="\t",usecols=['sample_id', 'kataegis'])
ktg_counts.rename({'kataegis':'n_kataegis'}, axis=1, inplace=True)
ktg_counts = ktg_counts[~pd.isnull(ktg_counts['n_kataegis'])]
cluster_types = ['n_kataegis']
ktg_dir = (
f"{DATA_DIR}/kataegis_analysis/results/SigProfilerClusters_persample/summary/"
)
ktg_counts = pd.read_csv(
f"{ktg_dir}/All_10983_clustered_events_table.tsv",
sep="\t",
usecols=["sample_id", "kataegis"],
)
ktg_counts.rename({"kataegis": "n_kataegis"}, axis=1, inplace=True)
ktg_counts = ktg_counts[~pd.isnull(ktg_counts["n_kataegis"])]
cluster_types = ["n_kataegis"]
for cluster_type in cluster_types:
ktg_counts[re.sub("^n_", "is_", cluster_type)] = (ktg_counts[cluster_type]>0).astype(int)
genotype_df = pd.merge(genotype_df,
ktg_counts[['sample_id']+cluster_types+[re.sub("^n_", "is_", cluster_type) for cluster_type in cluster_types]],
on='sample_id', how='inner')
ktg_counts[re.sub("^n_", "is_", cluster_type)] = (
ktg_counts[cluster_type] > 0
).astype(int)
genotype_df = pd.merge(
genotype_df,
ktg_counts[
["sample_id"]
+ cluster_types
+ [re.sub("^n_", "is_", cluster_type) for cluster_type in cluster_types]
],
on="sample_id",
how="inner",
)

# Sample list which is also in NCRAS
sample_list = np.intersect1d(genotype_df.sample_id, sample_df.sample_id)
sample_df = sample_df.set_index("sample_id").loc[sample_list]
genotype_df = genotype_df.set_index("sample_id").loc[sample_list]

# Confounding variables
variables = ['log_age', 'is_female'] + [f"pc{i}" for i in range(1,4)]
variables = ["log_age", "is_female"] + [f"pc{i}" for i in range(1, 4)]

# Construct X matrix
print("Construct X")
X = pd.DataFrame({variable:sample_df[variable] for variable in variables+['tumour_tissue', 'tumour_group',
'participant_id',
'tumour_sample_platekey',
'germline_sample_platekey']})
X = pd.DataFrame(
{
variable: sample_df[variable]
for variable in variables
+ [
"tumour_tissue",
"tumour_group",
"participant_id",
"tumour_sample_platekey",
"germline_sample_platekey",
]
}
)

# Get non-nan values
[print(key, np.sum(~np.isnan(X[key]))) for key in X.columns if X[key].dtype in [float, int]]
cuts = np.prod(np.array([~np.isnan(X[key]) for key in X.columns if X[key].dtype in [float, int]]),
axis=0).astype(bool)
[
print(key, np.sum(~np.isnan(X[key])))
for key in X.columns
if X[key].dtype in [float, int]
]
cuts = np.prod(
np.array(
[~np.isnan(X[key]) for key in X.columns if X[key].dtype in [float, int]]
),
axis=0,
).astype(bool)
print(f"Non-nan: {np.sum(cuts)}")

# Apply cuts
X = X[cuts]

# Save
X['sample_id'] = X.participant_id.astype(str)+"_"+X.tumour_sample_platekey+"_"+X.germline_sample_platekey
X.rename(columns={"tumour_group":"group"}, inplace=True)
X["sample_id"] = (
X.participant_id.astype(str)
+ "_"
+ X.tumour_sample_platekey
+ "_"
+ X.germline_sample_platekey
)
X.rename(columns={"tumour_group": "group"}, inplace=True)
print(X.columns)
X[["sample_id", "group"] + variables].to_csv(samples_file, index=False, sep="\t")

Expand Down
Loading

0 comments on commit f651f17

Please sign in to comment.