Skip to content

Commit

Permalink
Add parameter --index-subset to create specialized subsets of precomp…
Browse files Browse the repository at this point in the history
…uted indices
  • Loading branch information
milot-mirdita committed Nov 2, 2022
1 parent 2b4fc41 commit 314c1f0
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 16 deletions.
3 changes: 3 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ Parameters::Parameters():
// indexdb
PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"),
// --index-subset: selects which parts of the precomputed index are written.
// Accepted values correspond to Parameters::INDEX_SUBSET_NORMAL (0),
// INDEX_SUBSET_NO_HEADERS (1) and INDEX_SUBSET_NO_PREFILTER (2).
// The pattern is anchored at both ends (like --check-compatible) so only a single digit 0-2 is accepted.
PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries 0: normal index 1: index without headers 2: index without prefiltering data", typeid(int), (void *) &indexSubset, "^[0-2]{1}$", MMseqsParameter::COMMAND_EXPERT),
// createdb
PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""),
PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
Expand Down Expand Up @@ -725,6 +726,7 @@ Parameters::Parameters():
indexdb.push_back(&PARAM_SEARCH_TYPE);
indexdb.push_back(&PARAM_SPLIT);
indexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
indexdb.push_back(&PARAM_INDEX_SUBSET);
indexdb.push_back(&PARAM_V);
indexdb.push_back(&PARAM_THREADS);
Expand Down Expand Up @@ -2275,6 +2277,7 @@ void Parameters::setDefaults() {
// indexdb
checkCompatible = 0;
searchType = SEARCH_TYPE_AUTO;
indexSubset = INDEX_SUBSET_NORMAL;

// createdb
createdbMode = SEQUENCE_SPLIT_MODE_HARD;
Expand Down
7 changes: 5 additions & 2 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,9 @@ class Parameters {
static const int OUTFMT_TORFEND = 38;
static const int OUTFMT_FIDENT = 39;



// Values for the --index-subset parameter (stored in Parameters::indexSubset).
// Complete index; this is the value assigned in setDefaults().
static const int INDEX_SUBSET_NORMAL = 0;
// Index without headers: indexdb does not open the header database readers in this mode.
static const int INDEX_SUBSET_NO_HEADERS = 1;
// Index without prefiltering data: createIndexFile skips the score-matrix section
// and sets splits to 0 so the per-split loop does not run.
static const int INDEX_SUBSET_NO_PREFILTER = 2;

static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy);
Expand Down Expand Up @@ -529,6 +530,7 @@ class Parameters {
// indexdb
int checkCompatible;
int searchType;
int indexSubset;

// createdb
int identifierOffset;
Expand Down Expand Up @@ -861,6 +863,7 @@ class Parameters {
// indexdb
PARAMETER(PARAM_CHECK_COMPATIBLE)
PARAMETER(PARAM_SEARCH_TYPE)
PARAMETER(PARAM_INDEX_SUBSET)

// createdb
PARAMETER(PARAM_USE_HEADER) // also used by extractorfs
Expand Down
9 changes: 6 additions & 3 deletions src/prefiltering/PrefilteringIndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
DBReader<unsigned int> *alndbr,
BaseMatrix *subMat, int maxSeqLen,
bool hasSpacedKmer, const std::string &spacedKmerPattern,
bool compBiasCorrection, int alphabetSize, int kmerSize,
int maskMode, int maskLowerCase, float maskProb, int kmerThr, int splits) {
bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset) {

const int SPLIT_META = splits > 1 ? 0 : 0;
const int SPLIT_SEQS = splits > 1 ? 1 : 0;
Expand All @@ -82,7 +82,7 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
writer.writeData(metadataptr, sizeof(metadata), META, SPLIT_META);
writer.alignToPageSize(SPLIT_META);

if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false) {
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {
int alphabetSize = subMat->alphabetSize;
subMat->alphabetSize = subMat->alphabetSize-1;
ScoreMatrix s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
Expand Down Expand Up @@ -210,6 +210,9 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
(Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS))
? alphabetSize -1: alphabetSize;

if (indexSubset == Parameters::INDEX_SUBSET_NO_PREFILTER) {
splits = 0;
}
for (int s = 0; s < splits; s++) {
size_t dbFrom = 0;
size_t dbSize = 0;
Expand Down
3 changes: 2 additions & 1 deletion src/prefiltering/PrefilteringIndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ class PrefilteringIndexReader {
DBReader<unsigned int> *hdbr1, DBReader<unsigned int> *hdbr2,
DBReader<unsigned int> *alndbr,
BaseMatrix *seedSubMat, int maxSeqLen, bool spacedKmer, const std::string &spacedKmerPattern,
bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, int maskLowerCase, float maskProb, int kmerThr, int splits);
bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset = 0);

static DBReader<unsigned int> *openNewHeaderReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads, bool touchIndex, bool touchData);

Expand Down
26 changes: 16 additions & 10 deletions src/util/indexdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,14 @@ int indexdb(int argc, const char **argv, const Command &command) {
}

if (recreate) {
DBReader<unsigned int> hdbr1(hdr1.c_str(), hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
hdbr1.open(DBReader<unsigned int>::NOSORT);
DBReader<unsigned int> *hdbr1 = NULL;
if (par.indexSubset != Parameters::INDEX_SUBSET_NO_HEADERS) {
hdbr1 = new DBReader<unsigned int>(hdr1.c_str(), hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
hdbr1->open(DBReader<unsigned int>::NOSORT);
}

DBReader<unsigned int> *hdbr2 = NULL;
if (sameDB == false && ppDB == false) {
if (sameDB == false && ppDB == false && par.indexSubset != Parameters::INDEX_SUBSET_NO_HEADERS) {
hdbr2 = new DBReader<unsigned int>(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
hdbr2->open(DBReader<unsigned int>::NOSORT);
}
Expand All @@ -152,22 +155,25 @@ int indexdb(int argc, const char **argv, const Command &command) {
}

DBReader<unsigned int>::removeDb(indexDB);
PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, &hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen,
PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen,
par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection,
seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode,
par.maskProb, kmerScore, par.split);
par.maskProb, kmerScore, par.split, par.indexSubset);

if (alndbr != NULL) {
alndbr->close();
delete alndbr;
}

if (hdbr2 != NULL) {
hdbr2->close();
delete hdbr2;
}

if (alndbr != NULL) {
alndbr->close();
delete alndbr;
if (hdbr1 != NULL) {
hdbr1->close();
delete hdbr1;
}

hdbr1.close();
}

if (dbr2 != NULL) {
Expand Down

0 comments on commit 314c1f0

Please sign in to comment.