-
Notifications
You must be signed in to change notification settings - Fork 0
/
guessPartition.py
executable file
·147 lines (107 loc) · 5.69 KB
/
guessPartition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/python3
'''
usage: guessPartition.py [-h] [--all] [--samples SAMPLES] [--input INPUT] [--debug] partitions
Guesses good values for partitioning lists of kmers based on a subset of example data.
positional arguments:
partitions The number of buckets of data you want to generate. The output will be a series of kmer prefixes one fewer than the number of partitions.
optional arguments:
-h, --help show this help message and exit
--all Parse _all_ values in the source file. Useful if the source file is sorted.
--samples SAMPLES The number of samples to use (overrides the defailt)
--input INPUT The input file to read from (overrides the default: STDIN)
--debug Output diagnostic data to STDERR
NOTE: This program requires UNSORTED input for it to function with sampling. Otherwise you should use --all to ensure that you don't get worst-case behavior.
This program's function is to guess a good place to partition the kmer space
based on:
- How many partitions are needed, and
- The variation in the first hundred thousand records in the largest (any?) genome.
NOTE NOTE NOTE:
If the input is sorted, this partitioning will be quite wrong. If that is
the case for you, you should use --all.
The input is a number of partitions, specified on the commandline, and the
filename of a file that lists kmers as the first element on the line
(starting at the first column) with minimal repetition of kmers.
While we do sort(), we limit the cost by limiting the number of lines. On
the author's system, this takes under a tenth of a second, and lets us maximize
our use of memory for speed in dependent scripts.
Samples values as low as 1000 are likely fine, but don't offer significant time
savings. The more samples, the more evenly distributed the kmers should be,
with significantly diminishing returns.
'''
import sys as sys
import argparse as argparse
import logging as logging
def getGuessPartitionArgs(override=None):
parser = argparse.ArgumentParser(description='Guesses good values for partitioning lists of kmers based on a subset of example data.', epilog="NOTE: This program requires UNSORTED input for it to function with sampling. Otherwise you should use --all to ensure that you don't get worst-case behavior.")
parser.add_argument('partitions', action='store', help='The number of buckets of data you want to generate. The output will be a series of kmer prefixes one fewer than the number of partitions.')
parser.add_argument('--all', action='store_true', help='Parse _all_ values in the source file. Useful if the source file is sorted.')
parser.add_argument('--samples', action='store', help='The number of samples to use (overrides the defailt)', default=100000)
parser.add_argument('--input', action='store', help='The input file to read from (overrides the default: STDIN)', default=sys.stdin)
parser.add_argument('--debug', action='store_true', help='Output diagnostic data to STDERR')
if override is None:
return parser.parse_args()
else:
return parser.parse_args(override)
def moreCleverSortKey(kmer):
# UNUSED
# Will sort SNPs to be adjacent.
output = ''
centerPos = int(len(kmer) / 2) - 1
for count in range( centerPos ):
output = output + kmer[count] + kmer[-count]
output = output + kmer[centerPos]
return output
def getKeygenFunction(N):
'''
Produces a function to quickly generate a key for a string
where the Nth character is the center character. Maybe faster than
the above? Worth testing. Yay lambda and recursion!
'''
if N == 1:
return lambda a: a
else:
return lambda a: a[0] + a[-1] + getKeygenFunction(N-1)(a[1:-1])
if __name__ == '__main__':
options = getGuessPartitionArgs()
if options.debug:
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
else:
logging.basicConfig(format='%(levelname)s:%(message)s')
logging.debug('Commandline options, as parsed: %s', str(options))
if not options.all:
maxLines = int(options.samples)
# Performance drops off significantly below 4k and above ~128k
bufferSize=32*1024
# Ensure input file is open.
inputFile = options.input
if inputFile == sys.stdin:
input = inputFile
else:
input = open(inputFile,'r')
array = []
batch = input.readlines(bufferSize)
while batch != []:
array.extend(batch)
if not options.all and len(array) > maxLines:
break
batch = input.readlines(bufferSize)
input.close()
# This or the I/O is going to take the bulk of the time. If we have
# to use swap to hold the data, this will probably overwhelm.
array.sort()
# Assume that the kmer is the first item on the line, and that
# the line is space-separated.
logging.debug("Length of the sorted array: %s", len(array))
k = len(array[0].split(' ')[0])
# Identify the center position of the kmer (this is the location
# of the SNP)
centerPos = int((k-1)/2)
buckets = int(options.partitions)
partitionStep = int(len(array)/buckets)
for X in range(1,buckets):
# We need to make sure we don't establish a partition in the middle of a locus.
# To ensure this, we truncate the output to avoid including the center nucleotide
# If more granular sorting is required (more granular than the first half of the
# kmer permits), you'll need a more clever sort key (but it is possible)
logging.debug("X: %s, partitionStep: %s, index: %s, array[index]: %s ",X, partitionStep, X*partitionStep, array[X*partitionStep])
print( array[X * partitionStep].strip()[:centerPos] )