# gwas_4_alz_matching.py
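"""Match Alzheimer's GWAS summary statistics against 1000 Genomes SAD CSVs.

Resolves rsIDs for the GWAS rows via the Kunkle 2019 summary statistics
(falling back to the gwas_1_matching results), joins them to the per-track
SAD scores in the 1000 Genomes CSVs, and writes one result_SAD<track>.csv per
requested track, followed by the GWAS rows that matched no variant.
"""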
import ast
import gc
import os
import shutil
import sys

import pandas as pd


def clear_directory(directory_path):
    """Remove a directory tree if it exists, then recreate it empty."""
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)
    os.makedirs(directory_path)


def match_csv_txt(gwas_directory, directory_1000_genomes, track_list):
    chunksize = 100000
    output_path = './gwas_4_alz_matching'
    clear_directory(output_path)
    gc.collect()

    # Load the GWAS summary statistics (tab-separated, no header row).
    text_data = pd.read_csv(
        gwas_directory, sep='\t',
        names=['chr', 'PosGRCh37', 'testedAllele', 'otherAllele', 'z', 'p', 'N'])
    print(f"GWAS rows loaded: {len(text_data)}")

    # Attach rsIDs by chromosome/position from the Kunkle 2019 summary statistics.
    sum_stats_2019 = pd.read_csv('gwas_4_alz_sum_stats_2019_Kunkle', sep='\t')
    text_with_snp_ids = pd.merge(
        text_data, sum_stats_2019[['SNP', 'CHR', 'BP']],
        left_on=['chr', 'PosGRCh37'], right_on=['CHR', 'BP'], how='left')
    print(f"Rows after Kunkle merge: {len(text_with_snp_ids)}")

    # Rows still missing an rsID are looked up in the gwas_1_matching results.
    gwas_1_matching = pd.read_csv('gwas_1_matching/result_SAD0.csv')
    lookup = gwas_1_matching[['chr', 'pos', 'snp']].drop_duplicates(subset=['chr', 'pos'])
    nan_mask = text_with_snp_ids['SNP'].isna()
    nan_snp_ids = pd.merge(
        text_with_snp_ids[nan_mask], lookup,
        left_on=['chr', 'PosGRCh37'], right_on=['chr', 'pos'], how='left')
    # pd.merge resets the index, so assign the looked-up IDs positionally;
    # deduplicating the lookup above keeps the row counts aligned.
    text_with_snp_ids.loc[nan_mask, 'SNP'] = nan_snp_ids['snp'].to_numpy()

    # Keep only rows with a resolved rsID; copy to avoid chained-assignment issues.
    text_data = text_with_snp_ids[text_with_snp_ids['SNP'].notna()].copy()
    text_data['SNP'] = text_data['SNP'].astype(str)
    print(f"Rows with a resolved rsID: {len(text_data)}")
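
    # text_data now holds one row per GWAS variant with a resolved rsID:
    # chr, PosGRCh37, testedAllele, otherAllele, z, p, N, plus SNP (rsID) and
    # the CHR/BP columns carried over from the Kunkle merge.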
    csv_columns_no_sad = ['alt', 'chr_x', 'pos', 'ref', 'snp']
    text_columns = ['chr_y', 'PosGRCh37', 'testedAllele', 'otherAllele',
                    'z', 'p', 'N', 'SNP']

    # Track which per-track output files already have a header row.
    headers_written = {track: False for track in track_list}

    # Process each CSV file in the 1000 Genomes directory.
    csv_files = [f for f in os.listdir(directory_1000_genomes)
                 if f.endswith('.csv') and f != 'targets.csv']
    total_files = len(csv_files)
    for i, csv_file in enumerate(csv_files):
        csv_file_path = os.path.join(directory_1000_genomes, csv_file)
        print(f"Processing file {i + 1}/{total_files}: {csv_file}")
        for chunk_index, chunk in enumerate(pd.read_csv(csv_file_path, chunksize=chunksize)):
            print(f"Processing chunk {chunk_index + 1} of {csv_file}")
            # Check for the join key before casting it.
            if 'snp' not in chunk.columns:
                print(f"'snp' column not found in chunk {chunk_index + 1} of {csv_file}")
                continue
            chunk['snp'] = chunk['snp'].astype(str)
            # Merge the entire chunk once with the GWAS data, then slice per track.
            merged_chunk = chunk.merge(text_data, how='left', left_on='snp', right_on='SNP')
            for track in track_list:
                sad_column = f'SAD{track}'
                if sad_column not in merged_chunk.columns:
                    print(f"{sad_column} not found in chunk {chunk_index + 1} of {csv_file}")
                    continue
                chunk_filtered_sad = merged_chunk[
                    [sad_column] + csv_columns_no_sad + text_columns].reset_index(drop=True)
                track_output_file_path = os.path.join(output_path, f'result_SAD{track}.csv')
                # Append the slice, writing the header only on the first write.
                chunk_filtered_sad.to_csv(track_output_file_path, index=False, mode='a',
                                          header=not headers_written[track])
                headers_written[track] = True
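
    # At this point each result_SAD<track>.csv holds one row per 1000 Genomes
    # variant with a matching rsID, with columns:
    #   SAD<track>, alt, chr_x, pos, ref, snp, chr_y, PosGRCh37,
    #   testedAllele, otherAllele, z, p, N, SNP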
    # Load the first result file to collect the SNP IDs that were matched.
    # (This assumes track 0 is included in track_list.)
    new_df = pd.read_csv(os.path.join(output_path, 'result_SAD0.csv'))
    matched_markernames = set(new_df['SNP'])

    # Keep the GWAS rows that matched no 1000 Genomes variant.
    remaining_text_data = text_data[~text_data['SNP'].isin(matched_markernames)].reset_index(drop=True)
    print(f"Unmatched GWAS rows: {len(remaining_text_data)}")

    # Append the unmatched rows to each per-track file, with nulls for the
    # CSV-derived columns. The append is positional (header=False), so 'chr'
    # lands in the existing 'chr_y' column.
    for track in track_list:
        track_output_file_path = os.path.join(output_path, f'result_SAD{track}.csv')
        remaining_text_data_with_nulls = remaining_text_data.copy()
        for col in [f'SAD{track}'] + csv_columns_no_sad:
            remaining_text_data_with_nulls[col] = pd.NA
        if not remaining_text_data_with_nulls.empty:
            remaining_text_data_with_nulls = remaining_text_data_with_nulls[
                [f'SAD{track}'] + csv_columns_no_sad
                + ['chr', 'PosGRCh37', 'testedAllele', 'otherAllele', 'z', 'p', 'N', 'SNP']]
            remaining_text_data_with_nulls.to_csv(track_output_file_path, index=False,
                                                  mode='a', header=False)
    print("Processing complete.")


def main(directory_gwas_study_text, directory_1000_genomes, track_list):
    # Convert the string representation of the track list, e.g. "[1,2,3]",
    # into an actual Python list.
    track_list = ast.literal_eval(track_list)
    match_csv_txt(directory_gwas_study_text, directory_1000_genomes, track_list)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python script.py <path for GWAS text> "
              "<directory for 1000 genomes csvs> <track list e.g. [1,2,3]>")
    else:
        directory_gwas_study_text = sys.argv[1]
        directory_1000_genomes = sys.argv[2]
        track_list = sys.argv[3]
        main(directory_gwas_study_text, directory_1000_genomes, track_list)
    sys.exit()
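
# Example invocation (hypothetical paths; adjust to your layout):
#   python gwas_4_alz_matching.py ./alz_sumstats.tsv ./genomes_csvs "[0,1,2]"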