-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sequence.h
156 lines (140 loc) · 4.85 KB
/
Sequence.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/*
* Sequence.h
*
* Created on: Oct 13, 2017
* Author: simon
*/
#ifndef SEQUENCE_H_
#define SEQUENCE_H_
#include <algorithm>
#include <sys/stat.h>
#include <unistd.h>
#include <unordered_map>
#include <vector>
#include <numeric>
#include <regex>
#include <cassert>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
// One-letter residue codes accepted by IsSeq(): the 20 standard amino acids.
const std::string AA_ABET = "ARNDCQEGHILKMFPSTWYV";
// Large integer constant (1,000,000). It is not referenced by the code visible
// in this header. NOTE(review): its purpose depends on users in other files.
#define BIG_NUMBER 1000000
// Basic class for sequences.
//
// Stores a named sequence together with two per-position flag vectors
// (Inside / Remove) and the summary proportions derived from those flags by
// CalculateSummary(). Members that are only declared here are defined outside
// this header.
class CSequence {
public:
    CSequence() {}                                  // Blank constructor
    CSequence(std::string name, std::string seq);   // Standard constructor
    // std::string name;
    // std::string seq;
    std::vector<bool> Inside;    // Whether the character is on the inside or outside
    std::vector<bool> Remove;    // Whether to remove in the filter (true = remove)
    double PropInside = 0.0;     // The proportion of the sequence labeled inside
    double PropRemoved = 0.0;    // The proportion of the sequence labeled to be removed

    // True once CalculateSummary() has found every position flagged for removal.
    bool AllRemoved() const { return _allRemoved; }
    void AddSequence(std::string seq);
    void AddName(std::string name);
    // Sets the class-wide filter character shared by all sequences.
    static void SetFilter(char filterOut) { _filterOut = filterOut; }
    // Number of characters in the stored sequence.
    int length() const { return static_cast<int>(_seq.size()); }
    static int MaxLength() { return _maxLength; }
    std::string RealSeq(int pos = -1);   // Outputs the unfiltered seq
    std::string Seq(int pos = -1, bool filter = true, bool showOutside = false);   // Output the sequence (or pos i of sequence)
    std::string Name() const { return _name; }
    bool Filter(int pos);   // Whether pos should be filtered/removed in any way
    // Returns the name and the sequence separated by a single space.
    std::string out() const { return _name + " " + _seq; }

    // Recomputes PropInside and PropRemoved from the Inside / Remove flags and
    // raises the all-removed flag when every position is marked for removal.
    // An empty sequence yields proportions of 0.0 (instead of dividing by
    // zero) and counts as fully removed.
    void CalculateSummary() {
        const int len = length();
        // Both flag vectors must cover the whole sequence before they are indexed.
        assert(Inside.size() >= static_cast<std::size_t>(len));
        assert(Remove.size() >= static_cast<std::size_t>(len));
        int in = 0, rem = 0;
        for (int i = 0; i < len; i++) {
            if (Inside[i]) { in++; }
            if (Remove[i]) { rem++; }
        }
        if (len == 0) {
            // 0/0 would store NaN in both proportions.
            PropInside = 0.0;
            PropRemoved = 0.0;
        } else {
            PropInside = static_cast<double>(in) / static_cast<double>(len);
            PropRemoved = static_cast<double>(rem) / static_cast<double>(len);
        }
        // The flag is only ever raised here, never cleared.
        if (rem == len) { _allRemoved = true; }
    }

private:
    static int _maxLength;       // Maximum length of the sequences examined
    std::string _name;           // The name
    std::string _seq;            // The sequence
    static char _filterOut;      // The string output on filtering
    bool _allRemoved = false;    // Whether the sequence is fully removed

    // Sizes both flag vectors to the sequence length: every position starts
    // as inside and not removed. Must only be called while both are empty.
    void InitialiseFlags() {
        assert(Inside.empty() && Remove.empty());
        Inside.assign(_seq.size(), true);
        Remove.assign(_seq.size(), false);
    }
};
// File readers
// Sequence file formats handled by the readers declared in this header.
enum EFileType { FASTA, MSF, Phylip, Interleaved };

// Returns the printable name of a file type.
// For a value outside the enumeration a diagnostic is written to std::cout
// and the program terminates via exit(-1).
inline std::string FileTypeName(EFileType type) {
    switch (type) {
    case FASTA:
        return "FASTA";
    case MSF:
        return "MSF";
    case Phylip:
        return "Phylip";
    case Interleaved:
        return "Interleaved";
    default:
        std::cout << "\nUnknown FileTypeName...";
        exit(-1);
    }
}
// Sequence-file reader entry points. Only the declarations are visible in this
// header; the definitions live elsewhere.
// NOTE(review): the readers return a raw pointer to a vector of sequences —
// presumably heap-allocated with ownership passing to the caller; confirm
// against the implementation before relying on it.

// Presumably inspects seqFile and reports its format — TODO confirm.
EFileType TestFile(std::string seqFile);
// Presumably the general entry point that selects a format-specific reader — TODO confirm.
std::vector <CSequence> *ReadSequences(std::string seqFile);
// Format-specific readers, one per EFileType value (judging by their names).
std::vector <CSequence> *FASTAReader(std::string seqFile);
std::vector <CSequence> *MSFReader(std::string seqFile);
std::vector <CSequence> *PhylipReader(std::string seqFile);
std::vector <CSequence> *InterleavedReader(std::string seqFile);
// Other minor tools
// Half-open range test: true when LowerBound <= Val < UpperBound.
// Only operator< is required of TRange.
template <class TRange>
bool InRange(TRange Val, TRange LowerBound, TRange UpperBound) {
    if (Val < LowerBound) {
        return false;   // below the inclusive lower bound
    }
    return Val < UpperBound;   // the upper bound itself is excluded
}
// Function-like min/max macros. Each argument appears twice in the expansion,
// so an argument with side effects (e.g. my_min(i++, j)) may be evaluated
// more than once.
#define my_min(a,b) ((a)<(b)?(a):(b))
#define my_max(a,b) ((a)>(b)?(a):(b))
// String helpers. Declarations only — the definitions are not in this header.
// Presumably returns a copy of s with whitespace removed — TODO confirm which
// characters count as whitespace.
std::string RemoveWhiteSpace(std::string s);
// NOTE(review): the delimiter set used by the one-argument overload is not
// visible here; check the implementation.
std::vector <std::string> Tokenise(std::string line); // Tokenise a string
std::vector <std::string> Tokenise(std::string line, std::string Delim); // Tokenise a string according to delimiter Delim
// Draws one frame of a four-glyph text spinner on stdout: returns the cursor
// to the start of the line, prints the next glyph, and, when suffix is
// non-negative, prints " <suffix>" after it. The frame position is kept
// between calls and wraps after the fourth glyph.
inline void ProgressSpinner(int suffix = -1) {
    static int frame = 0;
    static char glyphs [] = "/-\\|";
    const char current = glyphs[frame];
    frame = (frame + 1) % 4;   // advance and wrap to the first glyph
    printf("\r%c", current);
    if (suffix >= 0) {
        printf(" %d", suffix);
    }
    fflush(stdout);   // make the frame visible immediately
}
// Replaces the first occurrence of `from` in `str` with `to`.
// Returns true when a replacement was made; `str` is left untouched and false
// is returned when `from` does not occur.
inline bool replace(std::string& str, const std::string& from, const std::string& to) {
    const std::string::size_type hit = str.find(from);
    const bool found = (hit != std::string::npos);
    if (found) {
        str.replace(hit, from.length(), to);
    }
    return found;
}
// Reports whether stat() succeeds on the given path, i.e. whether something
// the process can stat exists under that name.
inline bool file_exist (const std::string& name) {
    struct stat info;
    const int rc = stat(name.c_str(), &info);
    return rc == 0;
}
// Reads the next line from `in` and returns it without its line terminator.
// If no line can be extracted (stream already at end-of-file or in an error
// state), a diagnostic is written to std::cout and the program terminates via
// exit(-1).
inline std::string read_line(std::istream &in) {
    std::string line;
    std::getline(in, line);
    // Test fail(), not !good(): when the final line has no trailing newline,
    // getline extracts it successfully but also sets eofbit, which would make
    // good() false and wrongly reject a valid last line.
    if(in.fail()) { std::cout << "\nError reading file..."; exit(-1); }
    return line;
}
// True when c is one of the characters treated as a gap: '.', '*', '-', 'X' or '?'.
inline bool IsGap(char c) {
    switch (c) {
    case '.':
    case '*':
    case '-':
    case 'X':
    case '?':
        return true;
    default:
        return false;
    }
}
// True when c is one of the residue codes listed in AA_ABET (case-sensitive).
inline bool IsSeq(char c) {
    return AA_ABET.find(c) != std::string::npos;
}
// Returns the indices 0..values.size()-1 arranged so that the referenced
// elements are in ascending order under operator<. The input is not modified.
// std::sort is used, so the relative order of equal elements is unspecified.
template <typename T>
std::vector<int> ordered(std::vector<T> const& values) {
    std::vector<int> order(values.size());
    std::iota(order.begin(), order.end(), 0);
    const auto byValue = [&values](int lhs, int rhs) {
        return values[lhs] < values[rhs];
    };
    std::sort(order.begin(), order.end(), byValue);
    return order;
}
#endif /* SEQUENCE_H_ */