-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastq_batch_searcher.py
68 lines (52 loc) · 2.26 KB
/
fastq_batch_searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
"""
This file takes a folder filled with .fastq files and will search all of them for a given
sequence.
You must be outside the folder, in a directory containing this file and the desired folder
It will search all files in the folder and count the number of times the given sequence is seen in each.
The results will be printed to 'Summary_file.txt' in the directory
Summary file will have a tab delimited array containing the following:
Filename Number of reads Number of times restriction sequence was seen Number of reads with restriction enzyme sequence
file1.txt 400000 2000 1738
input the following:
python3 fastq_batch_searcher.py folder_name -r ATGCAT
where ATGCAT is the sequence you're searching for and folder_name is the folder contianing the fastq files
"""
import os
import argparse
import itertools
parser = argparse.ArgumentParser()
parser.add_argument("input", type = str, help="input folder containing Fastq files ")
parser.add_argument("-r", "--restriction_enzyme", type=str, help="The restriction enzyme sequence you're searching for")
args = parser.parse_args()
#open the directory
dir_name= args.input
here_we_are = os.getcwd()
new_dir = here_we_are + '/' + dir_name
os.chdir(new_dir)
header = 'File\tNumber of reads\tNumber of times restriction sequence was seen\tNumber of reads with restriction enzyme sequence\n'
filex = open('Summary_file.txt','w')
filex.write(header)
filex.close()
files = os.listdir(new_dir)
fastq_files = [i for i in files if i.endswith('.fastq')]
for file in fastq_files:
update = "counting records from file " + file
print(update)
with open(file) as input:
num_lines = sum([1 for line in input])
total_records = int(num_lines / 4)
restriction_count_number = 0
num_of_lines = 0
with open(file) as f:
seq_lines = itertools.islice(f, 1, None, 4)
print('searching for: ' + args.restriction_enzyme +' in ' + file)
for line in seq_lines:
if args.restriction_enzyme in line:
num_of_times = line.count(args.restriction_enzyme)
restriction_count_number += num_of_times
num_of_lines += 1
outstring1="%s\t%d\t%d\t%d\n" % (file, total_records, restriction_count_number, num_of_lines)
with open('Summary_file.txt', "a") as outfile:
outfile.write(outstring1)
os.chdir('..')