# dudeML_withoutSplit.py

from __future__ import division
import sys
import argparse

wgsim_path = "wgsim"
bedtools_path = "bedtools"
samtools_path = "samtools"

def rounder(x,y):
	"""Return x rounded to a whole multiple of y.

	The number of multiples is chosen with the built-in round(), so tie
	handling follows whatever round() does on the running interpreter.
	"""
	multiple_count = int(round(x / float(y)))
	return multiple_count * y

class SmartFormatter(argparse.HelpFormatter):
	"""Help formatter that honours explicit line breaks in help text starting with 'R|'."""

	def _split_lines(self, text, width):
		"""Split help text into display lines.

		Text prefixed with 'R|' is split on its own newlines (prefix removed);
		anything else is wrapped by the default HelpFormatter behaviour.
		"""
		raw_marker = 'R|'
		if not text.startswith(raw_marker):
			# No marker: defer to the standard argparse wrapping.
			return argparse.HelpFormatter._split_lines(self, text, width)
		return text[len(raw_marker):].splitlines()


# Top-level parser plus one sub-parser per mode. Each sub-parser later calls
# set_defaults(mode=...) so the dispatch chain can tell which mode was chosen.
parser=argparse.ArgumentParser(description='Predict CNVs using dudeML')
# Title shown above the list of modes in --help (closing parenthesis was missing).
parser._positionals.title = 'possible modes (enter \'python3 dudeML.py modeName -h\' for modeName\'s help message)'
subparsers = parser.add_subparsers(help='sub-command help')
parser_1 = subparsers.add_parser('predict', help='Predict CNVs in sample based on training classifier including ploidy or frequency of CNV.')
parser_2 = subparsers.add_parser('classify', help='Train a classifier based on a provided training set.')
parser_3 = subparsers.add_parser('winStat', help='Calculate average coverage of windows for a number of bases, given the window size, relative to the chromosomes average coverage.')
parser_4 = subparsers.add_parser('winStatExtra', help='Find averaged coverage of windows, based on previously estimated median coverage.')
parser_5 = subparsers.add_parser('fvecSample', help='Format sample/test file to create sets of windows to analyse as a features vector.')
parser_6 = subparsers.add_parser('fvecTrain', help='Format training file to ID windows with structural variants and create sets of windows to train as a features vector.')
parser_7 = subparsers.add_parser('subTrain', help='Subsample training file for quicker training of the predictor, can subsample a fraction (0.0-1.0) or a number (1-N).')
parser_8 = subparsers.add_parser('simChr', help='Simulate chromosomes containing duplications and deletions using the output of simCNV.')
parser_9 = subparsers.add_parser('simCNV', help='Simulate coordinates of duplications and deletions for multiple chromosomes, which can be combined later.')
parser_10 = subparsers.add_parser('recreateTotal', help='Create the total file from known CNVs for CNV chromosome simulation.')
parser_11 = subparsers.add_parser('covSummary', help='Summarise coverage by chromosome in coverage bedfile.')
parser_12 = subparsers.add_parser('simReads', help='Following simChr, uses WGsim to simulate reads across chromosomes.')
parser_13 = subparsers.add_parser('summarize', help='For a predictions file of known duplications and deletions, finds the number of correctly and falsely identified CNVs.')
parser_14 = subparsers.add_parser('ROC', help='If CNVs are known, works out the rate of true/false positives for given dataset (generated in fvecTrain) and classifier (generated in classify).')
parser_15 = subparsers.add_parser('quantify', help='Quantify CNVs across multiple samples mapped to the same reference.')

# predict: apply a trained classifier (file, or folder of classifiers) to a feature-vector bed file.
parser_1.add_argument('-i','--INPUT',help='Input bed file, generated by winStat and fvecSample.', required=True)
parser_1.add_argument('-o','--OUTPUT',help='Output file in bed format containing predicted CNVs.', required=True)
parser_1.add_argument('-t','--TRAIN',help='Training file or folder, generated by classify function.', required=True)
parser_1.set_defaults(mode='predict')

# classify: train and save a classifier from an fvecTrain feature file.
# NOTE(review): the classify branch maps the "CNN" choice to sklearn's MLPClassifier,
# which is a multi-layer perceptron rather than a convolutional network — help text may be misleading.
parser_2.add_argument('-i','--INPUT',help='Input bed file, generated by fvecTrain.', required=True)
parser_2.add_argument('-o','--OUTPUT',help='Output training file in binary format.', required=True)
parser_2.add_argument('-m','--MODEL',help='Type of classifier used, can be set as follows: "CNN" - Convolutional Neural Network, "DTC" - Decision Tree Classifier, "ETC100" - Extra Trees Classifier (100 estimators), "ETC500" - Extra Trees Classifier (500 estimators), "RFC100" - Random Forest Classifier (100 estimators), "RFC500" - Random Forest Classifier (500 estimators).' ,choices=["CNN","DTC","ETC100","ETC500","RFC100","RFC500"],default="RFC100")
parser_2.set_defaults(mode='classify')

# winStat: per-window coverage statistics relative to each chromosome's median coverage.
# NOTE(review): the winStat branch passes INPUT to "bedtools genomecov -d -ibam" and opens
# CHROMOSOME as a file with one chromosome ID per line — the help texts below may be stale.
parser_3.add_argument('-i','--INPUT',help='Input bed file, generated by genomeCoverageBed.', required=True)
parser_3.add_argument('-o','--OUTPUT',help='Output bed file summarizing stats in windows.', required=True)
parser_3.add_argument("-w",'--WINDOW_SIZE',help="The window size chosen to detect CNVs across.",type=int, default=50)
parser_3.add_argument("-s",'--STEP_SIZE',help="The step size chosen to detect CNVs across.",type=int, default=50)
parser_3.add_argument("-sum","--SUMMARY",help="Summary of coverages file",type=str)
parser_3.add_argument("-chr",'--CHROMOSOME',help="Bedfile of chromosomes to estimate statistics over with start and end of chromosomes.",type=str)
parser_3.set_defaults(mode='winStat')

# winStatExtra: like winStat, but standardised by a coverage value supplied with -cov.
# COVERAGE has no type=, so it reaches the code as a string.
parser_4.add_argument('-i','--INPUT',help='Input bed file, generated by genomeCoverageBed.', required=True)
parser_4.add_argument('-o','--OUTPUT',help='Output bed file summarizing stats in windows.', required=True)
parser_4.add_argument('-cov','--COVERAGE',help='Coverage to standardize by.', required=True)
parser_4.add_argument("-w",'--WINDOW_SIZE',help="The window size chosen to detect CNVs across.",type=int, default=50)
parser_4.add_argument("-s",'--STEP_SIZE',help="The step size chosen to detect CNVs across.",type=int, default=50)
parser_4.add_argument("-chr",'--CHROMOSOME',help="List of chromosomes to estimate statistics for. Can be a single chromosome, a comma seperated list or a file, with a chromosome on each line.",type=str)
parser_4.set_defaults(mode='winStatExtra')

# fvecSample: build feature vectors (focal window plus WINDOWS neighbours on each side) for a sample.
parser_5.add_argument("-i",'--INPUT',help="Input file in bed format, containing stats on each window, generated by winStat.",required=True)
parser_5.add_argument("-o",'--OUTPUT',help="Output file in bed format, containing stats on focal window and surrounding windows.",required=True)
parser_5.add_argument("-TE",'--TE',help="Bed or GFF file containing repeat locations in genome.")
parser_5.add_argument("-id",'--ID',help="ID of sample analysed.",type=str,default="NA")
parser_5.add_argument("-d",'--DIRECTORY',help="Directory to write output files to.",type=str,default="")
parser_5.add_argument("-windows",'--WINDOWS',help="Number of windows around focal window to include.",type=int,default=5)
parser_5.add_argument("-w",'--WINDOW_SIZE',help="Window size (bp).",type=int,default=50)
parser_5.add_argument("-s",'--STEP_SIZE',help="Step size (bp).",type=int, default=50)
parser_5.add_argument("-c",'--CUTOFF',help="Ignore windows with a higher proportion of masked positions than the cut off.",type=float, default=0.01)
parser_5.set_defaults(mode='fvecSample')

# fvecTrain: as fvecSample, but windows are labelled using the known deletion/duplication bed files.
parser_6.add_argument("-i",'--INPUT',help="Input file in bed format, containing stats on each window, generated by winStat.",required=True)
parser_6.add_argument("-o",'--OUTPUT',help="Output file in bed format, containing stats on focal window and surrounding windows.",required=True)
parser_6.add_argument("-TE",'--TE',help="Bed or GFF file containing repeat locations in genome.")
parser_6.add_argument("-dels","--DELETION",help="Bed file containing known deletion locations.",required=True)
parser_6.add_argument("-dups",'--DUPLICATION',help="Bed file containing known duplication locations.",required=True)
parser_6.add_argument("-d",'--DIRECTORY',help="Directory to write output files to.",type=str,default="")
parser_6.add_argument("-windows",'--WINDOWS',help="Number of windows around focal window to include.",type=int,default=5)
parser_6.add_argument("-w",'--WINDOW_SIZE',help="Window size (bp).",type=int,default=50)
parser_6.add_argument("-s",'--STEP_SIZE',help="Step size (bp).",type=int, default=50)
parser_6.add_argument("-c",'--CUTOFF',help="Ignore windows with more masked positions than the cut off.",type=float, default=0.01)
parser_6.set_defaults(mode='fvecTrain')

# subTrain: subsample a training file; NUMBER is parsed as a float (count or fraction).
parser_7.add_argument("-i",'--INPUT',help="Input bed file containing training windows.",required=True)
parser_7.add_argument("-o",'--OUTPUT',help="Output subsampled bed file containing training windows",required=True)
parser_7.add_argument("-N","--NUMBER",help="Number of samples to extract (1+) or fraction to downsample to (0-0.99).",type=float,required=True)
parser_7.set_defaults(mode='subTrain')

# simChr: build CNV-containing chromosomes from a FASTA and a CNV bed file.
# '-cnvBed' has no long option, so its value is stored as args.cnvBed.
parser_8.add_argument('-fasta',"--FASTA",help='Fasta file containing chromosomes to simulate CNVs in.', required=True)
parser_8.add_argument('-cnvBed',help='Bed file containing loci for CNVs to simulate.', required=True)
parser_8.add_argument("-id",'--ID',help="ID to label output files.",type=str,default="NA")
parser_8.add_argument("-d",'--DIRECTORY',help="Directory to write output files to.",type=str,default="")
parser_8.set_defaults(mode='simChr')

# simCNV: simulate CNV coordinates. '-CNV', '-CNVsize', '-delLength' and '-dupLength'
# have no long option, so they are stored as args.CNV, args.CNVsize, args.delLength, args.dupLength.
# NOTE(review): the simCNV branch draws lengths with np.random.normal(mean length, CNVsize),
# i.e. CNVsize is used as a normal standard deviation, not a Poisson mean — help text may be stale.
parser_9.add_argument("-fasta","--FASTA", required=True,help="Fasta file containing chromosomes to simulate CNVs in.")
parser_9.add_argument("-CNV",help="Number of duplications and deletions to simulate per megabase.",type=int,default=50)
parser_9.add_argument("-CNVsize",help="Mean size of CNV, size determined in a poisson distribution.",type=int,default=1000)
parser_9.add_argument("-delLength",help="Mean length of deletions to simulate.",type=int,default=1000)
parser_9.add_argument("-dupLength",help="Mean length of duplications to simulate.",type=int,default=1000)
parser_9.add_argument("-N","--NUMBER",help="Ploidy of chromosomes to simulate CNVs on.",type=int,default=1)
parser_9.add_argument("-d",'--DIRECTORY',help="Directory to write output files to.",type=str,default="")
parser_9.add_argument("-c",'--CUTOFF',help="Ignore windows with a higher proportion of masked positions than the cut off.",type=float, default=0.01)
parser_9.add_argument("-TE",'--TE',help="Bed or GFF file containing repeat locations in genome.")
parser_9.set_defaults(mode='simCNV')

# recreateTotal: rebuild the combined windows file from known deletion/duplication beds.
parser_10.add_argument("-fasta","--FASTA",help="Fasta file containing chromosomes to simulate CNVs in.", required=True)
parser_10.add_argument("-dels","--DELETION",help="Bed file containing deletion loci.", required=True)
parser_10.add_argument("-dups",'--DUPLICATION',help="Bed file containing duplication loci", required=True)
parser_10.add_argument("-o",'--OUTPUT',help="Output file containing windows with and without CNVs.", required=True)
parser_10.add_argument("-d",'--DIRECTORY',help="Directory to write output files to.",type=str,default="")
parser_10.set_defaults(mode='recreateTotal')

# covSummary: per-chromosome coverage summary; CHROMOSOME and SUMMARY are optional (default None).
parser_11.add_argument("-i",'--INPUT',required=True,help="Bed file generated by genomeCoverageBed.")
parser_11.add_argument("-chr",'--CHROMOSOME',help="List of chromosomes to summarize.")
parser_11.add_argument("-sum","--SUMMARY",help="Summary file to output.")
parser_11.set_defaults(mode='covSummary')

def _str_to_bool(value):
	"""Convert a command-line string to a boolean for use as an argparse ``type=``.

	``type=bool`` treats every non-empty string (including "False" and "0")
	as True. Here the usual false spellings map to False; any other value
	stays True, as it was under ``type=bool``.
	"""
	return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n")

# simReads: simulate reads over a FASTA at a given coverage and read length.
parser_12.add_argument("-fasta","--FASTA",help="Fasta sequence to simulate reads for.",required=True)
parser_12.add_argument("-cov",'--COVERAGE',help="Coverage of sample to simulate reads for.",type=int,default=10)
parser_12.add_argument("-d",'--DIRECTORY',help="Directory to write output files to.",type=str,default="")
parser_12.add_argument("-id",'--ID',help="ID to label output files.",type=str,default="NA")
parser_12.add_argument("-RL",'--READ_LENGTH',help="Read Length (bp).",type=int,default=100)
parser_12.add_argument("-chr",'--CHROMOSOME',help="List of chromosomes to estimate statistics for.",type=str)
# nargs='?' with const=True lets "-se" be given alone as a flag, while
# "-se True" / "-se False" keep working (and "False" now really means False).
parser_12.add_argument("-se",'--SE',help="Simulate single end reads instead of paired end reads.",type=_str_to_bool,nargs='?',const=True,default=False)
parser_12.set_defaults(mode='simReads')

# summarize: compare a predictions file against optional known duplication/deletion beds.
parser_13.add_argument("-i",'--INPUT',help="Input file containing predicted CNVs, generated by predict function",required=True)
parser_13.add_argument("-o",'--OUTPUT',help="Summary bed file.",required=True)
parser_13.add_argument("-c",'--CUTOFF',help="Confidence cutoff, CNVs below this value are removed.",type=float,default=0.0)
parser_13.add_argument("-w",'--WINDOW_SIZE',help="Window size (bp).",type=int,default=50)
parser_13.add_argument("-dups",'--DUPLICATION',help="Bed file containing duplication loci.")
parser_13.add_argument("-dels","--DELETION",help="Bed file containing deletion loci.")
parser_13.add_argument("-id",'--ID',help="ID to label output files.",type=str,default="NA")
parser_13.set_defaults(mode='summarize')

# ROC: true/false positive rates for a labelled feature file and a trained classifier.
parser_14.add_argument("-i",'--INPUT',help="Input bed file, generated by fvecTrain.",required=True)
parser_14.add_argument("-o",'--OUTPUT',help="File containing false-positive and true-positive rates for duplications and deletions.",required=True)
parser_14.add_argument('-t','--TRAIN',help='Training file or folder, generated by classify function.', required=True)
parser_14.set_defaults(mode='ROC')

# quantify: combine CNV calls across several prediction files; the default
# confidence cutoff here is 0.5 (summarize uses 0.0).
parser_15.add_argument("-i",'--INPUT',help="List of prediction files to quantify CNVs over.",required=True)
parser_15.add_argument("-o",'--OUTPUT',help="File to output CNV windows to.",required=True)
parser_15.add_argument("-gff",'--GFF',help="GFF containing genes or other factor to identify if CNVs are present in each factor.")
parser_15.add_argument("-c",'--CUTOFF',help="Confidence cutoff, CNVs below this value are removed.",type=float,default=0.5)
parser_15.add_argument("-w",'--WINDOW_SIZE',help="Window size (bp).",type=int,default=50)
parser_15.set_defaults(mode='quantify')
# Disabled leftover: a 'readme' mode that was once attached to parser_14.
# parser_14.add_argument('-foo', '--foo', action='store_true')
# parser_14.set_defaults(mode='readme')

# Options on the top-level parser (the legacy "-f FUNCTION" way of choosing a mode).
parser.add_argument("-f",'--FUNCTION',help="The function which will be used within the script, the options are: predict, winStat, simCNV, simChr, fvecTrain, fvecSample, recreateTotal, covSummary, winStatExtra, subTrain,summarize",type=str)
parser.add_argument("-d",'--DIRECTORY',help="Path to export simulated files such as beds containing deletions & duplications or simulated fasta")
parser.add_argument("-id",'--ID',help="The sample ID",type=str, default="NA")
parser.add_argument("-i",'--INPUT',help="The input file across the various functions, may differ in format",type=str)
parser.add_argument("-o",'--OUTPUT',help="The output file across the various functions, may differ in format",type=str)
parser.add_argument('-quiet','--QUIET', help="If set, does not print any messages.", action='store_true')

# With no arguments at all, show the help text and exit with a non-zero status.
if len(sys.argv)==1:
	parser.print_help()
	sys.exit(1)
args = parser.parse_args()
# vars() returns the namespace's own dict, so changes here are visible on args too.
argsDict = vars(args)
# 'mode' is only set by a sub-command's set_defaults(); when the script is driven
# with "-f FUNCTION" and no sub-command, the key would be missing and every
# argsDict['mode'] lookup in the dispatch chain would raise KeyError.
argsDict.setdefault('mode', None)
function=args.FUNCTION

"""
Files used by the prediction step below:
- a training file (or folder of training files) holding the trained classifier(s),
- an input bed file with coverage statistics by window,
- an output bed file to write the predictions to.
"""

if argsDict['mode'] in ['predict'] or function == "predict":
	# Predict mode.
	#
	# Input (tab separated, no header): CHROMOSOME, START, END, ID, followed by the
	# feature columns. As written by fvecSample in this file, the features are four
	# statistics per window for the 2*WINDOWS+1 windows centred on the focal window,
	# so the focal window's first statistic is feature column (n_features - 4) // 2.
	#
	# Output (tab separated, no header):
	# CHROMOSOME  START  END  ID  COVERAGE  PREDICTED_CNV  PROBABILITY  PREDICTED_PLOIDY  PROBABILITY
	#
	# TRAIN is either one classifier file (with its companion FILE + "2" for ploidy),
	# or a folder of such pairs, in which case the calls are the per-window majority
	# vote and the probabilities are the fraction of classifiers agreeing with it.
	import os
	import pandas as pd
	import numpy as np
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.datasets import make_classification
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.neural_network import MLPClassifier
	from sklearn.ensemble import ExtraTreesClassifier
	# sklearn.externals.joblib no longer exists in current scikit-learn; prefer the
	# standalone package and fall back to the vendored copy on old installations.
	try:
		import joblib
	except ImportError:
		from sklearn.externals import joblib
	# Pin the output column order instead of relying on dict ordering.
	out_columns = ["chr","start","end","ID","coverage","CNV","CNVprob","ploidy","ploidyprob"]
	if os.path.isfile(args.TRAIN) == True:
		if args.QUIET == False:
			print("Classifying over a single training set")
		clf = joblib.load(args.TRAIN)
		clf2 = joblib.load(args.TRAIN + "2")
		test_in = pd.read_csv(args.INPUT,header=None,sep="\t")
		output = args.OUTPUT
		test_in2 = test_in.drop([0,1,2,3], axis=1)
		test_in2.columns = list(range(0,len(test_in2.columns)))
		# Integer division: the column labels are ints, so the key must be one too.
		focal_col = (len(test_in2.columns) - 4) // 2
		if args.QUIET == False:
			print("Classifying windows")
		test_in2_y = list(clf.predict(test_in2))
		test_in2_y2 = list(clf2.predict(test_in2))
		# Confidence of a call = highest class probability for that window.
		test_in2_yA = list(pd.DataFrame(clf.predict_proba(test_in2),columns=None).max(axis=1))
		test_in2_yA2 = list(pd.DataFrame(clf2.predict_proba(test_in2),columns=None).max(axis=1))
		out_df = pd.DataFrame({"chr":list(test_in[0]), "start":list(test_in[1]), "end":list(test_in[2]), "ID":list(test_in[3]), "coverage":list(test_in2[focal_col]) ,"CNV":test_in2_y,"CNVprob":test_in2_yA,"ploidy":test_in2_y2,"ploidyprob":test_in2_yA2}, columns=out_columns)
		out_df.to_csv(output,sep="\t",index =False,header=None)
	elif os.path.isdir(args.TRAIN) == True:
		if args.QUIET == False:
			print("Bootstrapping over multiple training sets")
		pathe = args.TRAIN
		if pathe.endswith("/") == False:
			pathe += "/"
		test_in = pd.read_csv(args.INPUT,header=None,sep="\t")
		output = args.OUTPUT
		test_in2 = test_in.drop([0,1,2,3], axis=1)
		test_in2.columns = list(range(0,len(test_in2.columns)))
		# Same focal-window column as the single-classifier path above.
		focal_col = (len(test_in2.columns) - 4) // 2
		cnv_votes = {}
		ploidy_votes = {}
		count = 0
		# A classifier is any file FILE in the folder that has a companion FILE + "2".
		for inf in sorted(os.listdir(pathe)):
			if os.path.isfile(pathe + inf) == True and os.path.isfile(pathe + inf + "2") == True:
				if args.QUIET == False:
					print("Processing classifier " + str(count+1))
				clf = joblib.load(pathe + inf)
				clf2 = joblib.load(pathe + inf + "2")
				cnv_votes[count] = list(clf.predict(test_in2))
				ploidy_votes[count] = list(clf2.predict(test_in2))
				count += 1
		if count == 0:
			sys.exit("No classifier pairs (FILE and FILE2) found in " + pathe)
		if args.QUIET == False:
			print("Estimating consensus states")
		# One column per classifier, one row per window.
		out_bs_1 = pd.DataFrame(cnv_votes)
		out_bs_2 = pd.DataFrame(ploidy_votes)
		cnv_consensus = out_bs_1.mode(axis=1)[0]
		ploidy_consensus = out_bs_2.mode(axis=1)[0]
		bs_1 = list(cnv_consensus)
		bs_2 = list(ploidy_consensus)
		# eq(..., axis=0) compares every classifier column with the per-row consensus;
		# a plain "frame == list" would line the list up against the columns instead.
		bs_1_prob = list(out_bs_1.eq(cnv_consensus, axis=0).sum(axis=1)/float(len(out_bs_1.columns)))
		bs_2_prob = list(out_bs_2.eq(ploidy_consensus, axis=0).sum(axis=1)/float(len(out_bs_2.columns)))
		out_df = pd.DataFrame({"chr":list(test_in[0]), "start":list(test_in[1]), "end":list(test_in[2]), "ID":list(test_in[3]), "coverage":list(test_in2[focal_col]) ,"CNV":bs_1,"CNVprob":bs_1_prob,"ploidy":bs_2,"ploidyprob":bs_2_prob}, columns=out_columns)
		out_df.to_csv(output,sep="\t",index =False,header=None)
	else:
		# Previously a wrong path produced no output and no message at all.
		sys.exit("Training file or folder not found: " + str(args.TRAIN))

elif argsDict['mode'] in ['classify'] or function == "classify":
	import pandas as pd
	import numpy as np
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.datasets import make_classification
	from sklearn.externals import joblib
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.neural_network import MLPClassifier
	from sklearn.ensemble import ExtraTreesClassifier
	models = {"RFC100":RandomForestClassifier(n_estimators=100), "RFC500":RandomForestClassifier(n_estimators=500), "CNN":MLPClassifier(), "ETC100":ExtraTreesClassifier(n_estimators=100), "ETC500":ExtraTreesClassifier(n_estimators=500), "DTC":DecisionTreeClassifier()}
	training_in = pd.read_csv(args.INPUT,header=None,sep="\t")
	X = training_in.drop(training_in[[0,1,2,3,4]], axis=1)
	X.columns = list(range(0,len(X.columns)))
	Y = list(training_in[3])
	clf = models[args.MODEL]
	clf.fit(X,Y)
	Y2 = list(map(str,list(training_in[4])))
	clf2 = RandomForestClassifier(n_estimators=100)
	clf2.fit(X,Y2)
	joblib.dump(clf, args.OUTPUT)
	joblib.dump(clf2, args.OUTPUT + "2")
	if args.QUIET == False:
		print("Classifier Trained")

elif argsDict['mode'] in ['winStat'] or function == "winStat":
	import pandas as pd
	import numpy as np
	import scipy.stats
	import os
	"""
	input is generated by genomeCoverageBed -d in the following format:
	CHR POS COVERAGE
	Following that, per chromosome, find the median coverage of covered bases.
	Can find median for all chromosomes or a specified set of them, one chromosome ID per line.
	"""
	os.system(bedtools_path + " genomecov -d -ibam " + args.INPUT + " > dudeml_temp_covsperbase.bed")
	if args.QUIET == False:
		print("Calculating median coverage")
	test = pd.read_table("dudeml_temp_covsperbase.bed",header=None)
	covs_median = {}
	splits_median = {}
	for line in open(args.CHROMOSOME):
		i = line.split()[0].rstrip()
		covs_median[i] = test[2][test[2] != 0][test[0] == i].median()
		print(i,covs_median[i])
	if args.SUMMARY is not None:
		out = open(args.SUMMARY,"w")
		for i in covs_median:
			out.write(i + "\t" + str(covs_median[i]) + "\n")
		out.close()
	if args.QUIET == False:
		print("Calculating relative median coverage per window")
	chr_stats = []
	count = 0
	"function takes in a pandas dataframe column and outputs a dataframe containing the start and end of window, as well as window coverage median and standard deviation"
	def rolling_with_step(chr,s, window, step):
		vert_idx_list = np.arange(1, s.size - window, step)
		hori_idx_list = np.arange(window)
		A, B = np.meshgrid(hori_idx_list, vert_idx_list)
		idx_array = A + B
		x_array = s.values[idx_array]
		idx = list(s.index[vert_idx_list + (int(window))])
		med = list(np.around(list(map(np.median, x_array)),4))
		intq = list(np.around(list(map(scipy.stats.iqr, x_array)),4))
		means = list(np.around(list(map(np.mean, x_array)),4))
		std = list(np.around(list(map(np.std, x_array)),4))
		return pd.DataFrame({"chr":chr,"start":vert_idx_list,"end":vert_idx_list + window,"med":med,"iqr":intq,"mean":means,"std":std})
	out_df = pd.DataFrame(columns=["chr","start","end","med","iqr","mean","std"])
	"""
	For each chromosome, divide each base by the chromosome median (or total median).
	Following that, finds the median and standard deviation for windows of a given size
	"""
	for i in covs_median:
		test_chrs = test[test[0] == i]
		test_chrs_3 = test_chrs[2]/covs_median[i]
		wins_step = rolling_with_step(i,test_chrs_3,args.WINDOW_SIZE-1,args.STEP_SIZE)
		if args.QUIET == False:
			print("Chromosome " + str(i) + " processed")
		out_df = pd.concat([out_df,wins_step])
	out_df['chr']=out_df['chr'].astype(str)
	out_df['start']=out_df['start'].astype(int)
	out_df['end']=out_df['end'].astype(int)
	out_df.to_csv(args.OUTPUT,sep="\t",index =False,columns=None,header=None)
	os.remove("dudeml_temp_covsperbase.bed")

elif argsDict['mode'] in ['simChr'] or function == "simChr":
	import pandas as pd
	import numpy as np
	pathOut = args.DIRECTORY
	if pathOut != "" and pathOut.endswith("/") == False:
		pathOut += "/"
	from Bio import SeqIO
	import os
	os.system("cp " + args.FASTA + " " + pathOut + args.ID + "_noCNV.fa")
	#os.system("maskFastaFromBed -fi " + args.FASTA + " -bed " + args.TE + " -fo " + pathOut + args.ID + "_noCNV.fa")
	chrs = []
	chr = {}
	chr2 = {}
	for r in SeqIO.parse(open(pathOut + args.ID + "_noCNV.fa"),"fasta"):
		chrs.append(r.id)
		chr[r.id] = str(r.seq)
		chr2[r.id] = ""
	for line in open(args.cnvBed):
		if line.split()[3].rstrip() == "normal":
			chr2[line.split()[0]] += chr[line.split()[0]][int(line.split()[1]):int(line.split()[2])]
		elif line.split()[3].rstrip() == "del":
			pass
		elif line.split()[3].rstrip() == "dup":
			if float(line.split()[-1].rstrip()) > 1.5:
				for v in range(0,int(line.split()[-1].rstrip())):
					chr2[line.split()[0]] += chr[line.split()[0]][int(line.split()[1]):int(line.split()[2])]
			else:
				chr2[line.split()[0]] += chr[line.split()[0]][int(line.split()[1]):int(line.split()[2])]
				chr2[line.split()[0]] += chr[line.split()[0]][int(line.split()[1]):int(line.split()[2])]
	for i in chrs:
		out = open(pathOut + i + "_" + args.ID + "_CNV.fa","w")
		out.write(">" + i + "\n" + chr2[i] + "\n")
	out.close()
	os.remove(pathOut + args.ID + "_noCNV.fa")

elif argsDict['mode'] in ['fvecTrain'] or function == "fvecTrain":
	import os
	import pandas as pd
	import numpy as np
	import math
	from shutil import copyfile
	pathOut = args.DIRECTORY
	if pathOut != "" and pathOut.endswith("/") == False:
		pathOut += "/"
	def roundup(x):
		return int(math.ceil(x / args.WINDOW_SIZE)) * args.WINDOW_SIZE
	def rounddown(x):
		return int(math.floor(x / args.WINDOW_SIZE)) * args.WINDOW_SIZE
	"""If ignoring TEs is required, due to their inherit weirdness with split reads/coverage, this removes windows with TE sequences."""
	if args.TE is not None:
		os.system(bedtools_path + " intersect -v -wa -a "+ args.INPUT + " -b " + args.TE + " -f " + str(args.CUTOFF) + " > "+ pathOut + "dudeml_temp.bed")
	elif args.TE is None:
		copyfile(args.INPUT, pathOut + "dudeml_temp.bed")
	del_cp = {}
	dup_cp = {}
	dup_temp_1 = open("dup_temp_1.bed","w")
	del_temp_1 = open("del_temp_1.bed","w")
	"""Reformat deletion and duplication windows to find overlapping windows with"""
	for line in open(args.DUPLICATION):
		line = line.rstrip()
		cp = str((float(line.split()[5])*float(line.split()[4])) + ((1-float(line.split()[4])) * 1))
		dup_temp_1.write("\t".join([line.split()[0],str(rounddown(int(line.split()[1]))),str(roundup(int(line.split()[2]))),cp]) + "\n")
	for line in open(args.DELETION):
		line = line.rstrip()
		cp = str((float(line.split()[5])*float(line.split()[4])) + ((1-float(line.split()[4])) * 1))
		del_temp_1.write("\t".join([line.split()[0],str(rounddown(int(line.split()[1]))),str(roundup(int(line.split()[2]))),cp]) + "\n")
	dup_temp_1.close()
	del_temp_1.close()
	os.system(bedtools_path + " makewindows -b dup_temp_1.bed -w " + str(args.WINDOW_SIZE) + " -s " + str(args.STEP_SIZE) + " -i src > dup_temp_2.bed")
	os.system(bedtools_path + " makewindows -b del_temp_1.bed -w " + str(args.WINDOW_SIZE) + " -s " + str(args.STEP_SIZE) + " -i src > del_temp_2.bed")
	for line in open("dup_temp_2.bed"):
		dup_cp[line.split()[0] + "\t" + str(int(line.split()[1]) + 1) + "\t" + line.split()[2]] = line.split()[3]
	for line in open("del_temp_2.bed"):
		del_cp[line.split()[0] + "\t" + str(int(line.split()[1]) + 1) + "\t" + line.split()[2]] = line.split()[3]
	out = open(pathOut + "dudeml_temp2.bed","w")
	for line in open(pathOut + "dudeml_temp.bed"):
		copy = "N"
		line = line.rstrip()
		liner = line.split()
		if line.split()[0] + "\t" + line.split()[1] + "\t" + str(int(line.split()[2])) in dup_cp:
			out.write("\t".join([liner[0],liner[1],liner[2],"dup",dup_cp[line.split()[0] + "\t" + line.split()[1] + "\t" + str(int(line.split()[2]))], "\t".join(line.split()[3:])]) + "\n")
		elif line.split()[0] + "\t" + line.split()[1] + "\t" + str(int(line.split()[2])) in del_cp:
			out.write("\t".join([liner[0],liner[1],liner[2],"del",del_cp[line.split()[0] + "\t" + line.split()[1] + "\t" + str(int(line.split()[2]))], "\t".join(line.split()[3:])]) + "\n")
		else:
			if len(liner) == 5 or len(liner) == 7 or len(liner) == 8:
				out.write("\t".join([liner[0],liner[1],liner[2],"N","1.0", "\t".join(line.split()[3:])]) + "\n")
	out.close()
	v=args.WINDOW_SIZE
	if args.STEP_SIZE is not None:
		v=int(args.STEP_SIZE)
	elif args.STEP_SIZE is None:
		v=int(args.WINDOW_SIZE)
	window_pos = [[0,1,2,3,4,5]] * ((2*args.WINDOWS) + 1)
	output = open(args.OUTPUT,"w")
	count = 0
	for line in open(pathOut + "dudeml_temp2.bed"):
		count += 1
		if count % 100000 == 0:
			if args.QUIET == False:
				print(int(count),"windows processed")
		window_pos += [window_pos.pop(0)]
		window_pos[(2*args.WINDOWS)] = line.rstrip().split()
		class_ud = "N"
		if len(list(set([item[0] for item in window_pos]))) == 1:
			if window_pos[args.WINDOWS][3] == "dup" or window_pos[args.WINDOWS][3] == "Dup":
				class_ud = "Dup"
			elif window_pos[args.WINDOWS][3] == "del" or window_pos[args.WINDOWS][3] == "Del":
				class_ud = "Del"
			cc = 0
			cv = 0
			for k in window_pos:
				if int(k[1]) == int(window_pos[args.WINDOWS][1]) - (v*(args.WINDOWS - cc)):
					cv += 1
				cc += 1
			if cv == len(window_pos):
				cq = [str(window_pos[args.WINDOWS][0]),str(window_pos[args.WINDOWS][1]), str(window_pos[args.WINDOWS][2]), class_ud,str(window_pos[args.WINDOWS][4])]
				for k in window_pos:
					cq.append(str(k[5]))
					cq.append(str(k[6]))
					cq.append(str(k[7]))
					cq.append(str(k[8]))
				output.write("\t".join(cq) + "\n")
	output.close()
	os.remove("dudeml_temp.bed")
	os.remove("dudeml_temp2.bed")
	os.remove("dup_temp_1.bed")
	os.remove("del_temp_1.bed")
	os.remove("dup_temp_2.bed")
	os.remove("del_temp_2.bed")

elif argsDict['mode'] in ['fvecSample'] or function == "fvecSample":
	import os
	import pandas as pd
	import numpy as np
	import gzip
	from shutil import copyfile
	pathOut = args.DIRECTORY
	if pathOut != "" and pathOut.endswith("/") == False:
		pathOut += "/"
	test = pd.read_csv(args.INPUT,header=None,sep="\t")
	if args.OUTPUT.endswith(".gz"):
		output = open(args.OUTPUT.rstrip(".gz"), 'w')
	else:
		output = open(args.OUTPUT,"w")
	if args.TE is not None:
		os.system(bedtools_path + " intersect -v -wa -a "+ args.INPUT + " -b " + args.TE + " -f " + str(args.CUTOFF) + " > "+ pathOut + "dudeml_temp.bed")
	elif args.TE is None:
		copyfile(args.INPUT, pathOut + "dudeml_temp.bed")
	v=args.WINDOW_SIZE
	if args.STEP_SIZE is not None:
		v=int(args.STEP_SIZE)
	elif args.STEP_SIZE is None:
		v=int(args.WINDOW_SIZE)
	window_pos = [[0,1,2,3,4,5]] * ((2*args.WINDOWS) + 1)
	count = 0
	for line in open(pathOut + "dudeml_temp.bed"):
		count += 1
		if count % 100000 == 0:
			if args.QUIET == False:
				print(int(count),"windows processed")
		window_pos += [window_pos.pop(0)]
		window_pos[(2*args.WINDOWS)] = line.rstrip().split()
		if len(list(set([item[0] for item in window_pos]))) == 1:
			cc = 0
			cv = 0
			for k in window_pos:
				if int(k[1]) == int(window_pos[args.WINDOWS][1]) - (v*(args.WINDOWS- cc)):
					cv += 1
				cc += 1
			if cv == len(window_pos):
				cq = [str(window_pos[args.WINDOWS][0]),str(window_pos[args.WINDOWS][1]), str(window_pos[args.WINDOWS][2]), str(args.ID)]
				for k in window_pos:
					cq.append(str(k[3]))
					cq.append(str(k[4]))
					cq.append(str(k[5]))
					cq.append(str(k[6]))
				output.write("\t".join(cq) + "\n")
	if args.OUTPUT.endswith(".gz"):
		os.system("gzip " + args.OUTPUT.rstrip(".gz"))
	os.remove(pathOut + "dudeml_temp.bed")

elif argsDict['mode'] in ['simCNV'] or function == "simCNV":
	import pandas as pd
	import numpy as np
	from Bio import SeqIO
	import random
	import os
	df_del = pd.DataFrame(columns = [1,2,3,4])
	df_dup = pd.DataFrame(columns = [1,2,3,4])
	pathOut = args.DIRECTORY
	if pathOut != "" and pathOut.endswith("/") == False:
		pathOut += "/"
	out = open(pathOut + "chrs.bed","w")
	if args.QUIET == False:
		print("Generating duplication and deletion coordinates")
	for r in SeqIO.parse(open(args.FASTA),"fasta"):
		out.write("\t".join([r.id,"1",str(len(str(r.seq)))]) + "\n")
		dup_lengths = []
		del_lengths = []
		cnv_count = round((len(str(r.seq))/1000000)*args.CNV)
		while len(dup_lengths) < cnv_count:
			x = round(np.random.normal(args.dupLength, args.CNVsize, 1)[0])
			if x > 50:
				dup_lengths.append(x)
		while len(del_lengths) < cnv_count:
			x = round(np.random.normal(args.delLength, args.CNVsize, 1)[0])
			if x > 50:
				del_lengths.append(x)
		dup_start = list(np.random.randint(len(str(r.seq)), size=(1, cnv_count))[0])
		del_start = list(np.random.randint(len(str(r.seq)), size=(1, cnv_count))[0])
		dup_ends = list(map(int,[a + b for a, b in zip(dup_start, dup_lengths)]))
		del_ends = list(map(int,[a + b for a, b in zip(del_start, del_lengths)]))
		dups = pd.DataFrame({1:[r.id]*cnv_count,2:dup_start,3:dup_ends,4:dup_lengths})
		dels = pd.DataFrame({1:[r.id]*cnv_count,2:del_start,3:del_ends,4:del_lengths})
		df_dup = df_dup.append(dups)
		df_del = df_del.append(dels)
	out.close()
	df_dup.to_csv(pathOut + "dup.bed",header=False,index=False,sep="\t")
	df_del.to_csv(pathOut + "del.bed",header=False,index=False,sep="\t")
	os.system(bedtools_path + " sort -i " + pathOut + "dup.bed | " + bedtools_path + " merge -i stdin > " + pathOut + "dup2.bed")
	os.system(bedtools_path + " sort -i " + pathOut + "del.bed | " + bedtools_path + " merge -i stdin > " + pathOut + "del2.bed")
	if args.TE is not None:
		os.system(bedtools_path + " intersect -v -wa -a "+ pathOut + "del2.bed -b " + args.TE + " -f " + str(args.CUTOFF) + " > "+ pathOut + "del3.bed")
		os.system(bedtools_path + " intersect -v -wa -a "+ pathOut + "dup2.bed -b " + args.TE + " -f " + str(args.CUTOFF) + " > "+ pathOut + "dup3.bed")
	elif args.TE is None:
		os.system("cp "+ pathOut + "del2.bed "+ pathOut + "del3.bed")
		os.system("cp "+ pathOut + "dup2.bed "+ pathOut + "dup3.bed")
	os.system(bedtools_path + " intersect -wa -v -a " + pathOut + "dup3.bed -b " + pathOut + "del3.bed > " + pathOut + "dup4.bed")
	os.system(bedtools_path + " intersect -wa -v -a " + pathOut + "del3.bed -b " + pathOut + "dup3.bed > " + pathOut + "del4.bed")
	no_chrs = list(range(1, int(args.NUMBER)+1))
	chr_freq = {}
	for i in no_chrs:
		chr_freq[i] = i/args.NUMBER
	no_chrs = list(range(1, int(args.NUMBER)+1))
	chr_freq = {}
	if args.QUIET == False:
		print("Generating duplication and deletion frequencies")
	for i in no_chrs:
		chr_freq[i] = round(i/args.NUMBER,3)
	for i in ["del","dup"]:
		out = open(pathOut + str(i) + "5.bed","w")
		for line in open(pathOut + i + "4.bed"):
			if i == "del":
				num = random.randint(1,args.NUMBER)
				out.write(line.rstrip() + "\tdel\t" + str(chr_freq[num]) + "\t0\n")
			elif i == "dup":
				num = random.randint(1,args.NUMBER)
				count = np.random.choice([2,3,4,5,6,7,8,9,10], 1, p=[0.5, 0.1, 0.1, 0.05, 0.05,0.05,0.05,0.05,0.05])[0]
				freqs = num/args.NUMBER
				cp = (count*freqs) + ((1-freqs) * 1)
				while cp == 1.0:
					num = random.randint(1,args.NUMBER)
					count = np.random.choice([2,3,4,5,6,7,8,9,10], 1, p=[0.5, 0.1, 0.1, 0.05, 0.05,0.05,0.05,0.05,0.05])[0]
				out.write(line.rstrip() + "\tdup\t" + str(chr_freq[num]) + "\t" + str(count) + "\n")
		out.close()
		for j in chr_freq:
			out = open(pathOut + i + "." + str(j) + ".bed","w")
			for line in open(pathOut + i + "5.bed"):
				if float(line.split()[4]) >= chr_freq[j]:
					out.write(line)
			out.close()
	if args.QUIET == False:
		print("Removing overlaps, generating total file")
	for i in no_chrs:
		print("Creating bedfiles for sample " + str(i))
		os.system("bedtools makewindows -b " + pathOut + "chrs.bed -w 5 > " + pathOut + "normal." + str(i) + ".bed")
		os.system(bedtools_path + " intersect -v -wa -a " + pathOut + "normal." + str(i) + ".bed -b " + pathOut + "dup." + str(i) + ".bed | " + bedtools_path + " intersect -v -wa -a stdin -b " + pathOut + "del." + str(i) + ".bed | " + bedtools_path + " sort -i stdin | " + bedtools_path + " merge -i stdin > " + pathOut + "normal2." + str(i) + ".bed")
		out = open(pathOut + "normal3." + str(i) + ".bed","w")
		for line in open(pathOut + "normal2." + str(i) + ".bed"):
			out.write(line.rstrip() + "\tnormal\t1\t1\n")
		out.close()
		os.system("cat " + pathOut + "normal3." + str(i) + ".bed " + pathOut + "dup." + str(i) + ".bed " + pathOut + "del." + str(i) + ".bed | " + bedtools_path + " sort -i stdin > " + pathOut + "total." + str(i) + ".bed")
		os.remove(pathOut + "normal3." + str(i) + ".bed")
		os.remove(pathOut + "normal2." + str(i) + ".bed")
		os.remove(pathOut + "normal." + str(i) + ".bed")
	os.remove(pathOut + "del.bed")
	os.remove(pathOut + "del2.bed")
	os.remove(pathOut + "del3.bed")
	os.remove(pathOut + "del4.bed")
	os.remove(pathOut + "del5.bed")
	os.remove(pathOut + "dup.bed")
	os.remove(pathOut + "dup2.bed")
	os.remove(pathOut + "dup3.bed")
	os.remove(pathOut + "dup4.bed")
	os.remove(pathOut + "dup5.bed")
	os.remove(pathOut + "chrs.bed")

elif argsDict['mode'] in ['recreateTotal'] or function == "recreateTotal":
	import pandas as pd
	import numpy as np
	from Bio import SeqIO
	import random
	import os
	out = open(pathOut + "chrs.bed","w")
	for r in SeqIO.parse(open(args.FASTA),"fasta"):
		out.write("\t".join([r.id,"1",str(len(str(r.seq)))]) + "\n")
	out.close()
	if args.QUIET == False:
		print("recreating bedfiles for sample")
	pathOut = args.DIRECTORY
	if pathOut != "" and pathOut.endswith("/") == False:
		pathOut += "/"
	os.system("bedtools makewindows -b " + pathOut + "chrs.bed -w 3 > " + pathOut + "normal.bed")
	os.system(bedtools_path + " intersect -v -wa -a " + pathOut + "normal." + str(i) + ".bed -b " + args.DUPLICATION  + " | " + bedtools_path + " intersect -v -wa -a stdin -b " + args.DELETION  + " | " + bedtools_path + " sort -i stdin | " + bedtools_path + " merge -i stdin > " + pathOut + "normal2.bed")
	out = open(pathOut + "normal3.bed","w")
	for line in open(pathOut + "normal2.bed"):
		out.write(line.rstrip() + "\tnormal\t1\t1\n")
	out.close()
	os.system("cat " + pathOut + "normal3.bed " + args.DUPLICATION  + " " + args.DELETION  + " | " + bedtools_path + " sort -i stdin > " + args.OUTPUT)
	os.remove(pathOut + "normal3.bed")
	os.remove(pathOut + "normal2.bed")
	os.remove(pathOut + "normal.bed")

elif argsDict['mode'] in ['covSummary'] or function == "covSummary":
	test = pd.read_csv(args.INPUT,header=None,sep="\t")
	covs_median = {}
	covs_std = {}
	covs_mean = {}
	if args.CHROMOSOME is None:
		chrs = list(test[0].unique())
		for i in chrs:
			test2 = test[2][test[2] != 0][test[0] == i]
			covs_median[i] = test2[2].median()
			covs_mean[i] = test2[2].mean()
			covs_std[i] = test2[2].std()
			print("\t".join(list(map(str,i,covs_median[i],covs_mean[i],covs_std[i]))))
	elif args.CHROMOSOME is not None:
		for line in open(args.CHROMOSOME):
			i = line.split()[0].rstrip()
			test2 = test[2][test[2] != 0][test[0] == i]
			covs_median[i] = test2[2].median()
			covs_mean[i] = test2[2].mean()
			covs_std[i] = test2[2].std()
			print(i,covs_median[i],covs_mean[i],covs_std[i])
		covs_median["total"] = test[2][test[2] != 0].median()
		covs_mean["total"] = test[2][test[2] != 0].mean()
		covs_std["total"] = test[2][test[2] != 0].std()
		if args.QUIET == False:
			print("total",covs_median["total"],covs_mean["total"],covs_std["total"])
	if(isset(args.SUMMARY)):
		out = open(args.SUMMARY,"w")
		for i in covs_median:
			if args.QUIET == False:
				print("\t".join(list(map(str,i,covs_median[i],covs_mean[i],covs_std[i]))))
			out.write("\t".join(list(map(str,i,covs_median[i],covs_mean[i],covs_std[i]))) + "\n")
		out.close()

elif argsDict['mode'] in ['winStatExtra']:
	import pandas as pd
	import numpy as np
	cov = float(args.COVERAGE)
	test = pd.read_csv(args.INPUT,header=None,sep="\t")
	v=100
	if args.STEP_SIZE is not None:
		v=int(args.STEP_SIZE)
	elif args.STEP_SIZE is None:
		v=int(args.WINDOW_SIZE)
	def rolling_with_step(chr,s, window, step):
		"""Summarise a Series in fixed-size windows taken every `step` positions.

		chr    -- label copied into the "chr" column of every output row.
		s      -- pandas Series of values; windows are taken by position.
		window -- number of consecutive values per window.
		step   -- distance between consecutive window starts.

		Returns a DataFrame with columns chr, start, end, med, std where
		start/end are positional offsets into `s` and med/std are the window
		median and (population) standard deviation rounded to 4 decimals.
		Window starts run over range(0, s.size - window, step), so a series
		no longer than `window` yields an empty frame.
		"""
		window = int(window)
		starts = np.arange(0, s.size - window, step)
		# Row k of idx_array holds the positions starts[k] .. starts[k]+window-1.
		idx_array = starts[:, None] + np.arange(window)[None, :]
		x_array = s.values[idx_array]
		# Reduce along the window axis instead of mapping over rows in Python.
		med = np.around(np.median(x_array, axis=1),4)
		std = np.around(np.std(x_array, axis=1),4)
		return pd.DataFrame({"chr":chr,"start":starts,"end":starts+window,"med":med,"std":std})
	# Chromosomes come either from the input itself or from the first
	# whitespace-separated field of each line in args.CHROMOSOME.
	if args.CHROMOSOME is None:
		chrs = list(test[0].unique())
	else:
		chrs = [line.split()[0].rstrip() for line in open(args.CHROMOSOME)]
	out_df = pd.DataFrame(columns=["chr","start","end","med","std"])
	for i in chrs:
		# Third input column divided by the supplied coverage, windowed per chromosome.
		scaled = test[test[0] == i][2]/cov
		out_df = pd.concat([out_df,rolling_with_step(i,scaled,args.WINDOW_SIZE,v)])
	out_df = out_df.replace(r'\\n','', regex=True)
	out_df.to_csv(args.OUTPUT,sep="\t",index =False,columns=None,header=None)

elif argsDict['mode'] in ['subTrain'] or function == "subTrain":
	import pandas as pd
	import numpy as np
	if args.NUMBER < 1.0:
		fract = float(args.NUMBER)
		test = pd.read_csv(args.INPUT,header=None,sep="\t")
		out_df = pd.DataFrame(columns=test.columns)
		dict_types = test[3].value_counts().to_dict()
		for i in dict_types:
			if dict_types[i] * fract < 10000.0:
				subwin = test[test[3] ==i]
				out_df = pd.concat([out_df,subwin])
			elif dict_types[i] * fract > 10000.0:
				subwin = test[test[3] ==i].sample(replace = True, frac = fract)
				out_df = pd.concat([out_df,subwin])
	elif args.NUMBER > 1:
		count = int(args.NUMBER)
		test = pd.read_csv(args.INPUT,header=None,sep="\t")
		out_df = pd.DataFrame(columns=test.columns)
		dict_types = test[3].value_counts().to_dict()
		for i in dict_types:
			subwin = test[test[3] ==i].sample(replace = True, n = count)
			out_df = pd.concat([out_df,subwin])
	out_df = out_df.round(3)
	out_df.to_csv(args.OUTPUT,sep="\t",index =False,columns=None,header=None)

elif argsDict['mode'] in ['simReads'] or function == "simReads":
	from Bio import SeqIO
	import os
	cov = args.COVERAGE
	pathOut = args.DIRECTORY
	if pathOut != "" and pathOut.endswith("/") == False:
		pathOut += "/"
	chr_lens = {}
	if args.SE == False:
		for r in SeqIO.parse(open(args.FASTA),"fasta"):
			chr_lens[r.id] = len(str(r.seq))
		if args.CHROMOSOME is not None:
			for line in open(args.CHROMOSOME,"r"):
				chr = line.split()[0].rstrip()
				reads = round(chr_lens[chr]/(2*int(args.READ_LENGTH)))*int(cov)
				os.system(wgsim_path + " -N " + str(reads) + " -1 " + str(args.READ_LENGTH) + " -2 " + str(args.READ_LENGTH) + " " + pathOut + chr + "_" + args.ID + "_CNV.fa " + pathOut + chr + "_1.fq " + pathOut + chr + "_2.fq > stdout")
			for line in open(args.CHROMOSOME,"r"):
				chr = line.split()[0].rstrip()
				os.system("cat " + pathOut + chr + "_1.fq >> " + pathOut + args.ID + "_" + str(args.COVERAGE) + "_1.fq")
				os.system("cat " + pathOut + chr + "_2.fq >> " + pathOut + args.ID + "_" + str(args.COVERAGE) + "_2.fq")
				os.remove(pathOut + chr + "_1.fq")
				os.remove(pathOut + chr + "_2.fq")
		elif args.CHROMOSOME is None:
			for chr in chr_lens:
				reads = round(chr_lens[chr]/(2*int(args.READ_LENGTH)))*int(cov)
				os.system(wgsim_path + " -N " + str(reads) + " -1 " + str(args.READ_LENGTH) + " -2 " + str(args.READ_LENGTH) + " " + pathOut + chr + "_" + args.ID + "_CNV.fa " + pathOut + chr + "_1.fq " + pathOut + chr + "_2.fq > stdout")
			for chr in chr_lens:
				os.system("cat " + pathOut + chr + "_1.fq >> " + pathOut + args.ID + "_" + str(args.COVERAGE) + "_1.fq")
				os.system("cat " + pathOut + chr + "_2.fq >> " + pathOut + args.ID + "_" + str(args.COVERAGE) + "_2.fq")
				os.remove(pathOut + chr + "_1.fq")
				os.remove(pathOut + chr + "_2.fq")
	elif args.SE == True:
		for r in SeqIO.parse(open(args.FASTA),"fasta"):
			chr_lens[r.id] = len(str(r.seq))
		if args.CHROMOSOME is not None:
			for line in open(args.CHROMOSOME,"r"):
				chr = line.split()[0].rstrip()
				reads = round(chr_lens[chr]/(int(args.READ_LENGTH)))*int(cov)
				os.system(wgsim_path + " -N " + str(reads) + " -1 " + str(args.READ_LENGTH) + " " + pathOut + chr + "_" + args.ID + "_CNV.fa " + pathOut + chr + ".fq /dev/null > stdout")
			for line in open(args.CHROMOSOME,"r"):
				chr = line.split()[0].rstrip()
				os.system("cat " + pathOut + chr + ".fq >> " + pathOut + args.ID + "_" + str(args.COVERAGE) + ".fq")
				os.remove(pathOut + chr + ".fq")
		elif args.CHROMOSOME is None:
			for chr in chr_lens:
				reads = round(chr_lens[chr]/(2*int(args.READ_LENGTH)))*int(cov)
				os.system(wgsim_path + " -N " + str(reads) + " -1 " + str(args.READ_LENGTH) + " " + pathOut + chr + "_" + args.ID + "_CNV.fa " + pathOut + chr + ".fq /dev/null > stdout")
			for chr in chr_lens:
				os.system("cat " + pathOut + chr + ".fq >> " + pathOut + args.ID + "_" + str(args.COVERAGE) + ".fq")
				os.remove(pathOut + chr + ".fq")

elif argsDict['mode'] in ['summarize'] or function == "summarize":
	import os
	import sys
	import math
	import shutil
	os.system("grep -w 'Del' " + args.INPUT + " | " + bedtools_path + " sort -i stdin | " + bedtools_path + " merge -c 4,6,7,8,9 -o distinct,mode,mode,mode,mode -d " + str(args.WINDOW_SIZE) + " -i stdin > del_temp_total.bed")
	os.system("grep -w 'Dup' " + args.INPUT + " | " + bedtools_path + " sort -i stdin | " + bedtools_path + " merge -c 4,6,7,8,9 -o distinct,mode,mode,mode,mode -d " + str(args.WINDOW_SIZE) + " -i stdin > dup_temp_total.bed")
	os.system("grep -v 'Dup' " + args.INPUT + " | grep -v 'Del' > non_temp_total.bed")
	if args.DELETION is not None and args.DUPLICATION is not None:
		os.system(bedtools_path + " intersect -wa -wb -a " + args.DELETION + " -b del_temp_total.bed > Del_temp_True-Positive.bed")
		os.system(bedtools_path + " intersect -wa -wb -a " + args.DUPLICATION + " -b dup_temp_total.bed > Dup_temp_True-Positive.bed")
		os.system(bedtools_path + " intersect -wa -v -a " + args.DELETION + " -b del_temp_total.bed > Del_temp_False-Negative.bed")
		os.system(bedtools_path + " intersect -wa -v -a " + args.DUPLICATION + " -b dup_temp_total.bed > Dup_temp_False-Negative.bed")
		os.system(bedtools_path + " intersect -wa -v -a del_temp_total.bed -b " + args.DELETION + " > Del_temp_False-Positive.bed")
		os.system(bedtools_path + " intersect -wa -v -a dup_temp_total.bed -b " + args.DUPLICATION + " > Dup_temp_False-Positive.bed")
		for i in ["Del","Dup"]:
			out = open(i + "_temp_False-Negative2.bed", "w")
			for line in open(i + "_temp_False-Negative.bed"):
				out.write("\t".join([line.split()[0],line.split()[1],line.split()[2],args.ID,i,"1.0","NA","1.0","False-Negative"]) + "\n")
			out.close()
			out = open(i + "_temp_False-Positive2.bed", "w")
			for line in open(i + "_temp_False-Positive.bed"):
				out.write(line.rstrip() + "\tFalse-Positive\n")
			out.close()
			os.system(bedtools_path + " sort -i " + i + "_temp_True-Positive.bed | " + bedtools_path + " merge -c 10,11,12,13,14 -o distinct,mode,mode,mode,mode -i stdin > " + i + "_temp_True-Positive2.bed")
			out = open(i + "_temp_True-Positive3.bed","w")
			for line in open(i + "_temp_True-Positive2.bed"):
				out.write(line.rstrip() + "\tTrue-Positive\n")
			out.close()
		os.system("cat Del_temp_True-Positive3.bed Dup_temp_True-Positive3.bed Dup_temp_False-Positive2.bed Del_temp_False-Positive2.bed Del_temp_False-Negative2.bed Dup_temp_False-Negative2.bed | " + bedtools_path + " sort -i stdin > total_sum_temp.bed")
		out = open(args.OUTPUT,"w")
		for line in open("total_sum_temp.bed"):
			if float(line.split()[5]) > args.CUTOFF:
				out.write(line)
		out.close()
		for k in ["dup_temp_total.bed","del_temp_total.bed","Dup_temp_True-Positive.bed","Del_temp_True-Positive.bed","Del_temp_False-Negative.bed","Dup_temp_False-Negative.bed","Del_temp_False-Positive.bed","Dup_temp_False-Positive.bed","Dup_temp_True-Positive2.bed","Del_temp_True-Positive2.bed","Del_temp_False-Negative2.bed","Dup_temp_False-Negative2.bed","Del_temp_False-Positive2.bed","Dup_temp_False-Positive2.bed","Dup_temp_True-Positive3.bed","Del_temp_True-Positive3.bed","total_sum_temp.bed"]:
			os.remove(k)
	elif args.DELETION is None and args.DUPLICATION is None:
		os.system("cat dup_temp_total.bed del_temp_total.bed | " + bedtools_path + " sort -i stdin > total_sum_temp.bed")
		out = open(args.OUTPUT,"w")
		for line in open("total_sum_temp.bed"):
			if float(line.split()[5]) > args.CUTOFF:
					out.write(line)
		out.close()
		os.remove("dup_temp_total.bed")
		os.remove("del_temp_total.bed")
		os.remove("total_sum_temp.bed")

if argsDict['mode'] in ['ROC'] or function == "ROC":
	# ROC mode: for each CNV class ("Del", "Dup") compute ROC points of the
	# classifier stored in args.TRAIN on the labelled windows of args.INPUT
	# (class label in the fourth column, features from the sixth column on)
	# and write the fpr/tpr pairs to args.OUTPUT.
	import pandas as pd
	import numpy as np
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.datasets import make_classification
	try:
		from sklearn.externals import joblib
	except ImportError:
		# sklearn.externals.joblib is gone from current scikit-learn releases.
		import joblib
	import os
	from itertools import cycle
	from sklearn import svm, datasets
	from sklearn.metrics import roc_curve, auc
	try:
		from scipy import interp
	except ImportError:
		# scipy.interp was an alias of numpy.interp before its removal.
		interp = np.interp
	from sklearn import metrics
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.neural_network import MLPClassifier
	from sklearn.ensemble import ExtraTreesClassifier
	training_in = pd.read_csv(args.INPUT,header=None,sep="\t")
	# NOTE(review): joblib.load unpickles the file; only load trusted models.
	clf = joblib.load(args.TRAIN)
	out_df = pd.DataFrame(columns=["type","fpr","tpr"])
	for i in ["Del","Dup"]:
		normal_rows = training_in[training_in[3] == "N" ]
		cnv_rows = training_in[training_in[3] == i]
		subset = pd.concat([normal_rows,cnv_rows])
		features = subset.drop(columns=[0,1,2,3,4])
		features.columns = list(range(0,len(features.columns)))
		# "N" windows are labelled 2 and windows of class i are labelled 1;
		# building the array directly avoids chained assignment on the frame.
		labels = np.where(subset[3] == "N", 2, 1)
		scores = np.array(clf.predict_proba(features)[:, 1])
		fpr, tpr, threshold = roc_curve(labels, scores, pos_label=2)
		sub_list = pd.DataFrame({"type":i,"fpr":list(fpr),"tpr":list(tpr)})
		out_df = pd.concat([out_df,sub_list])
	out_df.to_csv(args.OUTPUT,sep="\t",index =False)

if argsDict['mode'] in ['quantify'] or function == "quantify":
	import pandas as pd
	import os
	import shutil
	def myround(x, base=args.WINDOW_SIZE):
		"""Round x to the nearest multiple of base (the window size by default)."""
		multiples = round(x/base)
		return base * multiples
	def factor_counts_gff(row):
		"""Tally CNV classes across the samples of one gene row.

		row -- Series whose first four entries are chr, start, end and gene
		       and whose remaining entries are per-sample class labels.

		Returns [chr, start, end, gene, #N, #Del, #Dup].
		"""
		t = row.iloc[4:].value_counts()
		# Positional access via iloc: plain row[0] on a string-labelled Series
		# relies on a deprecated integer-key fallback.
		row_counts = [row.iloc[0], row.iloc[1], row.iloc[2], row.iloc[3]]
		for label in ["N","Del","Dup"]:
			row_counts.append(int(t[t.index == label].sum()))
		return(row_counts)
	def copy_counts_gff(row):
		"""Tally copy numbers across the samples of one gene row.

		row -- Series whose first four entries are chr, start, end and gene
		       and whose remaining entries are per-sample copy numbers.

		Returns [chr, start, end, gene] followed by the number of samples
		with copy number 0, 1, ..., 9 and finally with copy number >= 10.
		"""
		t = row.iloc[4:].value_counts()
		# Positional access via iloc: plain row[0] on a string-labelled Series
		# relies on a deprecated integer-key fallback.
		row_counts = [row.iloc[0], row.iloc[1], row.iloc[2], row.iloc[3]]
		for copies in range(10):
			row_counts.append(int(t[t.index == float(copies)].sum()))
		row_counts.append(int(t[t.index >= 10.0].sum()))
		return(row_counts)
	# GFF branch of quantify: for every prediction file listed in args.INPUT,
	# collapse the predicted windows onto the features of args.GFF, then count
	# per gene how many samples carry each CNV class / copy number.
	# Intermediate files (dudeml_temp*.bed, tempDir_bed/) are created in the
	# current working directory.
	if args.GFF is not None:
		comb_CN = pd.DataFrame(columns=["chr","start","end","gene"])
		comb_CP = pd.DataFrame(columns=["chr","start","end","gene"])
		# Running sample number, used as the column name when a file carries no strain value.
		count = 1
		for line in open(args.INPUT,"r"):
			print("processing " + line.rstrip())
			# Keep fields 1,4,5 and 13,15,16,17,18 of the GFF/prediction intersection.
			os.system(bedtools_path + """ intersect -wa -wb -a """ + args.GFF + """ -b """ + line.rstrip() + """ | awk -F "\t" '{print $1"\t"$4"\t"$5"\t"$13"\t"$15"\t"$16"\t"$17"\t"$18}' > dudeml_temp1.bed""")
			# Extract the text between "ID=" and the next ";" and cut it at "-mRNA-1".
			os.system(bedtools_path + """ intersect -wa -wb -a """ + args.GFF + """ -b """ + line.rstrip() + """ | awk -F "ID=" '{print $2}' | awk -F ";" '{print $1}' | awk -F "-mRNA-1" '{print $1}' > dudeml_temp2.bed""")
			# Append that identifier as a ninth column.
			os.system("paste dudeml_temp1.bed dudeml_temp2.bed > dudeml_temp3.bed")
			# NOTE(review): os.mkdir raises if tempDir_bed is left over from an interrupted run.
			os.mkdir('tempDir_bed')
			df = pd.read_csv("dudeml_temp3.bed",header = None,sep="\t")
			# One temporary file per identifier (column 8), so each gene is merged on its own.
			df_grouped = df.groupby(8)
			for index, group in df_grouped:
				group.to_csv("tempDir_bed/" + index,sep="\t",index =False,header=False)
			# Merge the windows of each gene into one record and append it to dudeml_temp4.bed.
			os.system("""for file in tempDir_bed/*; do """ + bedtools_path + """ sort -i ${file} | """ + bedtools_path + """ merge -i stdin -c 4,5,6,7,8,9 -o distinct,mode,median,mode,median,distinct >> dudeml_temp4.bed; done""")
			shutil.rmtree("tempDir_bed/")
			os.system(bedtools_path + " sort -i dudeml_temp4.bed > dudeml_temp5.bed")
			# dudeml_temp4.bed is appended to above, so it must be removed before the next input file.
			os.remove("dudeml_temp4.bed")
			df = pd.read_csv("dudeml_temp5.bed",header = None,sep="\t")
			df.columns = ["chr","start","end","strain","CNV","CNVprob","CP","CPprob","gene"]
			# Calls whose probability is below args.CUTOFF are reset to normal ("N").
			df.loc[(df['CNV'] == "Dup") & (df['CNVprob'] < args.CUTOFF), ['CNV']] = "N"
			df.loc[(df['CNV'] == "Del") & (df['CNVprob'] < args.CUTOFF), ['CNV']] = "N"
			# The coordinate columns are overwritten on every iteration.
			# NOTE(review): presumably every input file yields the same genes in the same order -- confirm.
			comb_CN['chr'] = df['chr']
			comb_CN['start'] = df['start']
			comb_CN['end'] = df['end']
			comb_CN['gene'] = df['gene']
			comb_CP['chr'] = df['chr']
			comb_CP['start'] = df['start']
			comb_CP['end'] = df['end']
			comb_CP['gene'] = df['gene']
			# The sample column is named after the strain of the first row, or the running count if that is missing.
			if pd.isnull(df['strain'][0]) == False:
				comb_CP[str(df['strain'][0])] = df["CP"]
				comb_CN[str(df['strain'][0])] = df["CNV"]
				count += 1
			elif pd.isnull(df['strain'][0]) == True:
				comb_CP[str(count)] = df["CP"]
				comb_CN[str(count)] = df["CNV"]
				count += 1
		# Per-sample tables, before tallying.
		comb_CP.to_csv(args.OUTPUT + ".copy_raw.txt",sep="\t",index =False)
		comb_CN.to_csv(args.OUTPUT + ".factor_raw.txt",sep="\t",index =False)
		print("Quantify CNVs in each window.")
		# Each row is reduced to a list of counts by the helper functions.
		comb_CP2 = comb_CP.apply(copy_counts_gff, axis=1)
		comb_CN2 = comb_CN.apply(factor_counts_gff, axis=1)
		comb_CP3 = pd.DataFrame(comb_CP2)
		comb_CN3 = pd.DataFrame(comb_CN2)
		comb_CP4 = pd.DataFrame()
		comb_CN4 = pd.DataFrame()
		# Expand the per-row lists into named columns.
		comb_CN4[["chr","start","end","gene","N","Del","Dup"]] = pd.DataFrame(comb_CN3[0].values.tolist(), index= comb_CN3.index)
		comb_CP4[["chr","start","end","gene","0.0","1.0","2.0","3.0","4.0","5.0","6.0","7.0","8.0","9.0",">=10.0"]] = pd.DataFrame(comb_CP3[0].values.tolist(), index= comb_CP3.index)
		comb_CP4.to_csv(args.OUTPUT + ".copy.txt",sep="\t",index =False)
		comb_CN4.to_csv(args.OUTPUT + ".factor.txt",sep="\t",index =False)
		# NOTE(review): these removals raise if args.INPUT listed no files, because the temp files were never created.
		os.remove("dudeml_temp1.bed")
		os.remove("dudeml_temp2.bed")
		os.remove("dudeml_temp3.bed")
		os.remove("dudeml_temp5.bed")
	elif args.GFF is None:
		def copy_counts(row):
			"""Tally copy numbers across the samples of one window row.

			row -- Series whose first three entries are chr, start and end
			       and whose remaining entries are per-sample copy numbers.

			Returns [chr, start, end] followed by the number of samples with
			copy number 0, 1, 2, 3, 4 and finally with copy number >= 5.
			"""
			# Samples start at position 3; slicing from 2 also tallied the
			# window's end coordinate as if it were a copy number.
			t = row.iloc[3:].value_counts()
			row_counts = [row.iloc[0], row.iloc[1], row.iloc[2]]
			for copies in range(5):
				row_counts.append(int(t[t.index == float(copies)].sum()))
			row_counts.append(int(t[t.index >= 5.0].sum()))
			return(row_counts)
		def factor_counts(row):
			"""Tally CNV classes across the samples of one window row.

			row -- Series whose first three entries are chr, start and end
			       and whose remaining entries are per-sample class labels.

			Returns [chr, start, end, #N, #Del, #Dup].
			"""
			# Samples start at position 3; position 2 is the end coordinate.
			t = row.iloc[3:].value_counts()
			row_counts = [row.iloc[0], row.iloc[1], row.iloc[2]]
			for label in ["N","Del","Dup"]:
				row_counts.append(int(t[t.index == label].sum()))
			return(row_counts)
		# Window-level quantification: gather the CNV class and copy number of
		# every sample listed in args.INPUT, tally them per window and keep
		# only windows where at least one sample has a deletion or duplication.
		comb_CN = pd.DataFrame(columns=["chr","start","end"])
		comb_CP = pd.DataFrame(columns=["chr","start","end"])
		count = 1
		with open(args.INPUT,"r") as sample_list:
			for line in sample_list:
				print("processing " + line.rstrip())
				df = pd.read_csv(line.rstrip(),header = None,sep="\t")
				df.columns = ["chr","start","end","strain","cov","CNV","CNVprob","CP","CPprob"]
				# Calls whose probability is below args.CUTOFF are reset to normal ("N").
				df.loc[df['CNV'].isin(["Dup","Del"]) & (df['CNVprob'] < args.CUTOFF), ['CNV']] = "N"
				for col in ["chr","start","end"]:
					comb_CN[col] = df[col]
					comb_CP[col] = df[col]
				# Name the sample column after its strain, or the running count if missing.
				strain = df['strain'][0]
				label = str(count) if pd.isnull(strain) else str(strain)
				comb_CP[label] = df["CP"]
				comb_CN[label] = df["CNV"]
				count += 1
		print("Quantify CNVs in each window.")
		comb_CP2 = comb_CP.apply(copy_counts, axis=1)
		comb_CN2 = comb_CN.apply(factor_counts, axis=1)
		# The copy-number table must be built from the copy-number tallies
		# (it was previously filled from the class tallies by mistake).
		comb_CP4 = pd.DataFrame(comb_CP2.values.tolist(), index=comb_CP2.index, columns=["chr","start","end","0","1.0","2.0","3.0","4.0",">=5.0"])
		comb_CN4 = pd.DataFrame(comb_CN2.values.tolist(), index=comb_CN2.index, columns=["chr","start","end","N","Del","Dup"])
		# Element-wise "|" is required for Series; compute the mask once so
		# both tables are filtered by the same unfiltered rows.
		has_cnv = (comb_CN4['Del'] != 0) | (comb_CN4['Dup'] != 0)
		comb_CP4 = comb_CP4.loc[has_cnv]
		comb_CN4 = comb_CN4.loc[has_cnv]
		comb_CP4.to_csv(args.OUTPUT + ".copy",sep="\t",index =False)
		comb_CN4.to_csv(args.OUTPUT + ".factor",sep="\t",index =False)