-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrim_adapters.py
62 lines (43 loc) · 2.13 KB
/
trim_adapters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
import sys
import os
import math
import subprocess
import argparse
import time
from collections import OrderedDict
def create_output_folders(dataset):
if not os.path.exists("./Trimmed"):
os.mkdir("./Trimmed")
if not os.path.exists("./Trimmed/%s" % dataset):
os.mkdir("./Trimmed/%s" % dataset)
if not os.path.exists("./Logs_trimmed"):
os.mkdir("./Logs_trimmed")
if not os.path.exists("./Logs_trimmed/%s" % dataset):
os.mkdir("./Logs_trimmed/%s" % dataset)
def trim_adapters(dataset, base_name):
zcat_cmd = "zcat Raw_data/%s/%s.fastq.gz" % (dataset, base_name)
fastx_cmd = "fastx_clipper -a AGATCGGAAGAGCACACGTCT -l 18 -c -n -v -Q33 >Trimmed/%s/%s_trimmed.fastq" % (dataset, base_name)
log_fh = open("./Logs_trimmed/%s/%s_trimmed.log" % (dataset, base_name), "w")
p1 = subprocess.Popen(zcat_cmd, shell = True, stdout = subprocess.PIPE)
p2 = subprocess.Popen(fastx_cmd, shell = True, stdin = p1.stdout, stderr = log_fh)
p2.wait()
log_fh.flush()
log_fh.close()
print("Finished trimming %s at " % base_name , time.ctime())
print("Compressing %s..." % base_name)
gzip_cmd = "gzip ./Trimmed/%s/%s_trimmed.fastq" % (dataset, base_name)
p3 = subprocess.Popen(gzip_cmd, shell = True)
p3.wait()
print("Finished compressing %s at " % base_name, time.ctime())
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Trim the adapters from raw reads.")
parser.add_argument("dataset", help = "The folder you want to look in for the data, inside the Raw_data folder (S1, S2, etc)")
args = parser.parse_args()
create_output_folders(args.dataset)
data_files = sorted([x for x in os.listdir("./Raw_data/%s/" % args.dataset) if ".fastq.gz" in x])
if len(data_files) == 0:
sys.exit("No files found in \'./Raw_data/%s/\'. Please ensure data files are present and in .fastq.gz format" % args.dataset)
for f in data_files:
base_name = f.split(".")[0]
trim_adapters(args.dataset, base_name)