Commit

Add files via upload

Sh1384 authored Jul 20, 2023
1 parent 88a22ed commit 9bcddef
Showing 9 changed files with 99,471 additions and 0 deletions.
81,781 changes: 81,781 additions & 0 deletions Figure_1/TAD_DNA_Methylation.ipynb

2,282 changes: 2,282 additions & 0 deletions Figure_1/TAD_kmeans_Clustering.ipynb

1,833 changes: 1,833 additions & 0 deletions Figure_1/TAD_meQTL_Burden.ipynb

1,763 changes: 1,763 additions & 0 deletions Figure_2/Germline_meQTL_Distribution.ipynb

217 changes: 217 additions & 0 deletions Figure_2/shuffled_tad_consecutive_distribution.py
@@ -0,0 +1,217 @@
import pandas as pd
import numpy as np
import matplotlib.patches as mpatches
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy import stats
from scipy.interpolate import make_interp_spline
import seaborn as sns; sns.set_theme(color_codes=True)
import os
import matplotlib.pyplot as plt
import sklearn.cluster
import sklearn.metrics
import sklearn.datasets
import random
import warnings
import argparse
warnings.filterwarnings("ignore")
import matplotlib
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42
sns.set_style(style='white')

all_labels = dict()  # per-chromosome lists of "Label1-Label2" strings for consecutive non-boundary TADs (filled in main)

# Shuffle TAD labels within each chromosome while keeping boundary rows fixed
def shuffle_tad_labels(tads):
    rand_labels = tads.copy()
    labels = []
    for ch in all_labels:
        tad_ch = rand_labels[rand_labels["chr1"] == ch]
        # Shuffle the consecutive-pair label strings recorded for this chromosome
        rand_categories = all_labels[ch].copy()
        random.shuffle(rand_categories)
        random_labels = []
        for j in range(len(rand_categories)):
            cat = rand_categories[j]
            lst = cat.split("-")
            if j == len(rand_categories) - 1:
                # The last pair contributes both of its labels, so len(random_labels)
                # matches the number of non-boundary TADs on this chromosome
                random_labels.append(lst[0])
                random_labels.append(lst[1])
            else:
                random_labels.append(lst[0])
        ind = 0
        for i, row in tad_ch.iterrows():
            if "Boundary" in row["Label"]:
                labels.append("Boundary")
            else:
                labels.append(random_labels[ind])
                ind += 1
    rand_labels["Label"] = labels
    return rand_labels
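
# Illustrative example of the rebuild above (hypothetical labels, not from the data): if a
# chromosome's consecutive-pair strings are shuffled to
# ["Active-Active", "Active-Inactive", "Inactive-Active"], the per-TAD labels become
# ["Active", "Active", "Inactive", "Active"] (first element of each pair, plus both elements of the last pair).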


def count_clumped_meqtls(chrom, x1, x2, clumped_meqtls):
    # Count clumped meQTLs on a chromosome whose position falls within [x1, x2]
    common_chr = clumped_meqtls[clumped_meqtls["CHR"] == str(chrom)]
    snps = common_chr[(common_chr["BP"] >= x1) & (common_chr["BP"] <= x2)]
    return len(snps)

# Bin the genomic span between two TADs into 40 equal bins and compute the normalized
# meQTL burden (meQTL count scaled by 1000 / bin_size) in each bin
def consecutive_tad_distribution(tads, clumped_meqtls):
    active_active_normalized = []
    active_inactive_normalized = []
    inactive_active_normalized = []
    inactive_inactive_normalized = []
    num_bins = 40
    start_boundary = []  # bin index where the boundary region begins, per TAD pair
    end_boundary = []    # bin index where the second TAD begins, per TAD pair

    for i in range(num_bins):
        active_active_normalized.append([])
        active_inactive_normalized.append([])
        inactive_active_normalized.append([])
        inactive_inactive_normalized.append([])

    for ind in range(0, len(tads)):
        if ind + 3 >= len(tads):
            break
        if tads.iloc[ind]["chr1"] != tads.iloc[ind + 3]["chr1"]:
            continue
        if "Boundary" in tads.iloc[ind]["Label"]:
            continue
        bin_size = (tads.iloc[ind + 3]["x2"] - tads.iloc[ind]["x1"]) // num_bins
        temp = tads.iloc[ind]["Label"]
        label1 = tads.iloc[ind]["Label"]
        label2 = tads.iloc[ind + 3]["Label"]
        if label1 == "Mixed" or label2 == "Mixed":
            continue
        start = tads.iloc[ind]["x1"]
        inx = 0
        while start <= tads.iloc[ind + 3]["x2"] - bin_size:
            # Track the bin indices at which the boundary region and the second TAD begin
            if start >= tads.iloc[ind]["x2"] and start < tads.iloc[ind + 2]["x1"]:
                if "Boundary" not in temp:
                    start_boundary.append(inx)
                temp = "Boundary"
            elif start + bin_size <= tads.iloc[ind]["x2"]:
                temp = tads.iloc[ind]["Label"]
            elif start >= tads.iloc[ind + 3]["x1"]:
                if temp == "Boundary":
                    end_boundary.append(inx)
                temp = tads.iloc[ind + 3]["Label"]
            # Select the list matching this TAD pair's label combination
            target = None
            if label1 == "Active" and label2 == "Active":
                target = active_active_normalized
            elif label1 == "Active" and label2 == "Inactive":
                target = active_inactive_normalized
            elif label1 == "Inactive" and label2 == "Active":
                target = inactive_active_normalized
            elif label1 == "Inactive" and label2 == "Inactive":
                target = inactive_inactive_normalized
            if target is not None:
                # Normalized burden for this bin: meQTL count scaled by 1000 / bin_size
                burden = count_clumped_meqtls(tads.iloc[ind]["chr1"], start, start + bin_size, clumped_meqtls) * 1000 / bin_size
                if inx >= len(target):
                    target.append([])
                target[inx].append(burden)
            start = start + bin_size
            inx += 1
    return active_active_normalized, active_inactive_normalized, inactive_active_normalized, inactive_inactive_normalized, start_boundary, end_boundary
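
# Illustrative note on the normalization above (numbers are hypothetical, not from the data):
# with num_bins = 40, a TAD pair spanning 1,000 coordinate units gives bin_size = 1000 // 40 = 25;
# a bin containing 5 clumped meQTLs then contributes 5 * 1000 / 25 = 200.0 to its bin's list,
# i.e. the burden is expressed per 1,000 coordinate units.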



def main(args):
    common_TADs_meQTLs = pd.read_csv("../TAD_Data/common_TADs_proper_annot.csv")
    # Rescale TAD coordinates by a factor of 1,000
    new_x1 = []
    new_x2 = []
    for i, row in common_TADs_meQTLs.iterrows():
        new_x1.append(int(row["x1"]) / 1000)
        new_x2.append(int(row["x2"]) / 1000)
    common_TADs_meQTLs["x1"] = new_x1
    common_TADs_meQTLs["x2"] = new_x2
    common_TADs_meQTLs = common_TADs_meQTLs.sort_values(["chr1", "x1"])
    common_TADs_meQTLs = common_TADs_meQTLs.reset_index(drop=True)
    common_TADs_meQTLs_noboundary = common_TADs_meQTLs[~common_TADs_meQTLs["Label"].str.contains("Boundary")]
    common_TADs_meQTLs_noboundary = common_TADs_meQTLs_noboundary.reset_index(drop=True)
    # Record the label pair of every consecutive non-boundary TAD pair per chromosome;
    # these pairs are what shuffle_tad_labels() permutes
    for ch in list(pd.unique(common_TADs_meQTLs_noboundary["chr1"])):
        all_labels[ch] = []
    for i, row in common_TADs_meQTLs_noboundary.iterrows():
        if i < len(common_TADs_meQTLs_noboundary) - 1:
            if row["chr1"] == common_TADs_meQTLs_noboundary.iloc[i + 1]["chr1"]:
                all_labels[row["chr1"]].append(row["Label"] + "-" + common_TADs_meQTLs_noboundary.iloc[i + 1]["Label"])

    tad_lengths = []
    for i, row in common_TADs_meQTLs.iterrows():
        tad_lengths.append(row["x2"] - row["x1"])
    common_TADs_meQTLs["Length"] = tad_lengths
    clumped_meqtls = pd.read_csv("../meQTL_Data/all_meqtls.clumped", delim_whitespace=True)
    clumped_meqtls["CHR"] = clumped_meqtls["CHR"].astype(str)

    # Run the distribution analysis on randomly shuffled TAD labels in 100 trials
    active_active_normalized_total = []
    active_inactive_normalized_total = []
    inactive_active_normalized_total = []
    inactive_inactive_normalized_total = []
    start_boundary = 0
    end_boundary = 0
    num_trials = 100
    active_active_diff_shuffled = []
    active_inactive_diff_shuffled = []
    inactive_active_diff_shuffled = []
    inactive_inactive_diff_shuffled = []
    for trial in range(num_trials):
        random_tads = shuffle_tad_labels(common_TADs_meQTLs)
        active_active_normalized, active_inactive_normalized, inactive_active_normalized, inactive_inactive_normalized, start_boundary, end_boundary = consecutive_tad_distribution(random_tads, clumped_meqtls)

        # Difference in mean burden between the bin where the second TAD begins and the bin where the boundary begins
        active_active_diff_shuffled.append(np.mean(active_active_normalized[int(np.mean(end_boundary))]) - np.mean(active_active_normalized[int(np.mean(start_boundary))]))
        active_inactive_diff_shuffled.append(np.mean(active_inactive_normalized[int(np.mean(end_boundary))]) - np.mean(active_inactive_normalized[int(np.mean(start_boundary))]))
        inactive_active_diff_shuffled.append(np.mean(inactive_active_normalized[int(np.mean(end_boundary))]) - np.mean(inactive_active_normalized[int(np.mean(start_boundary))]))
        inactive_inactive_diff_shuffled.append(np.mean(inactive_inactive_normalized[int(np.mean(end_boundary))]) - np.mean(inactive_inactive_normalized[int(np.mean(start_boundary))]))

        active_active_normalized_total.append(active_active_normalized)
        active_inactive_normalized_total.append(active_inactive_normalized)
        inactive_active_normalized_total.append(inactive_active_normalized)
        inactive_inactive_normalized_total.append(inactive_inactive_normalized)

    # Run the distribution analysis on the true TAD labels
    active_active_normalized_true, active_inactive_normalized_true, inactive_active_normalized_true, inactive_inactive_normalized_true, start_boundary, end_boundary = consecutive_tad_distribution(common_TADs_meQTLs, clumped_meqtls)

    active_active_diff = np.mean(active_active_normalized_true[int(np.mean(end_boundary))]) - np.mean(active_active_normalized_true[int(np.mean(start_boundary))])
    active_inactive_diff = np.mean(active_inactive_normalized_true[int(np.mean(end_boundary))]) - np.mean(active_inactive_normalized_true[int(np.mean(start_boundary))])
    inactive_active_diff = np.mean(inactive_active_normalized_true[int(np.mean(end_boundary))]) - np.mean(inactive_active_normalized_true[int(np.mean(start_boundary))])
    inactive_inactive_diff = np.mean(inactive_inactive_normalized_true[int(np.mean(end_boundary))]) - np.mean(inactive_inactive_normalized_true[int(np.mean(start_boundary))])

    # Compare the difference in meQTL burden between the two boundaries of the actual and
    # the label-shuffled TADs, using a one-sample Student's t-test with the true difference
    # in burden as the population mean
    print("Active-Active T-Test: ", stats.ttest_1samp(active_active_diff_shuffled, popmean=active_active_diff))
    print("Active-Inactive T-Test: ", stats.ttest_1samp(active_inactive_diff_shuffled, popmean=active_inactive_diff))
    print("Inactive-Active T-Test: ", stats.ttest_1samp(inactive_active_diff_shuffled, popmean=inactive_active_diff))
    print("Inactive-Inactive T-Test: ", stats.ttest_1samp(inactive_inactive_diff_shuffled, popmean=inactive_inactive_diff))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    main(args)
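
As a quick, self-contained illustration of the comparison the script prints (synthetic numbers only, not the study's data), this is how a true boundary-to-boundary difference in burden can be tested against the distribution of differences obtained from label-shuffled TADs:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
shuffled_diffs = rng.normal(loc=0.0, scale=0.5, size=100)  # hypothetical diffs from 100 shuffled trials
true_diff = 1.8                                            # hypothetical diff with the real labels

# One-sample t-test: is the shuffled-label distribution centered on the true difference?
print(stats.ttest_1samp(shuffled_diffs, popmean=true_diff))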
0 comments on commit 9bcddef