diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 0a346f16b..651f50037 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -108,6 +108,7 @@ extern int search(int argc, const char **argv, const Command& command); extern int linsearch(int argc, const char **argv, const Command& command); extern int sortresult(int argc, const char **argv, const Command& command); extern int splitdb(int argc, const char **argv, const Command& command); +extern int setextendeddbtype(int argc, const char **argv, const Command& command); extern int splitsequence(int argc, const char **argv, const Command& command); extern int subtractdbs(int argc, const char **argv, const Command& command); extern int suffixid(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 98db5acb1..f5cc29e9d 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -793,8 +793,13 @@ std::vector baseCommands = { CITATION_MMSEQS2, {{"resultDBLeft", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"resultDBRight", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}}, - - + {"setextendeddbtype", setextendeddbtype, &par.extendeddbtype, COMMAND_DB, + "Write an extended DB ", + "# Print entries with keys 1, 2 and 3 from a sequence DB to stdout\n" + "mmseqs setextendedbtype db --extended-dbtype 2\n", + "Martin Steinegger ", + "", + CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}}, {"view", view, &par.view, COMMAND_DB, "Print DB entries given in --id-list to stdout", diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h index e2827bd90..17d637f12 100644 --- a/src/commons/IndexReader.h +++ b/src/commons/IndexReader.h @@ -74,8 +74,28 @@ class IndexReader { } if (sequenceReader == NULL) { - if ((databaseType & USER_SELECT) == false && databaseType & (HEADERS | SRC_HEADERS)) { - failSuffix = "_h"; + if((databaseType & USER_SELECT) == false && failSuffix == "" ){ + if (databaseType & HEADERS) { + failSuffix = "_h"; + } else if (databaseType & SRC_HEADERS) { + failSuffix = "_seq_h"; + if(FileUtil::fileExists((dataName + "_seq_h.dbtype").c_str())==false){ + failSuffix = "_h"; + } + } else if (databaseType & SRC_SEQUENCES) { + failSuffix = "_seq"; + if(FileUtil::fileExists((dataName + "_seq.dbtype").c_str())==false){ + failSuffix = ""; + } + } else if (databaseType & SEQUENCES) { + failSuffix = ""; + } else if (databaseType & ALIGNMENTS) { + if(FileUtil::fileExists((dataName + "_clu.dbtype").c_str())){ + failSuffix = "_clu"; + }else{ + failSuffix = "_aln"; + } + } } sequenceReader = new DBReader( (dataName + failSuffix).c_str(), (dataName + failSuffix + ".index").c_str(), diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 88e8a45a7..9036409e5 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -188,7 +188,8 @@ Parameters::Parameters(): // indexdb PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"), - PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-3]{1}", MMseqsParameter::COMMAND_EXPERT), + PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\n4: index without aln (for cluster db)\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-7]{1}", MMseqsParameter::COMMAND_EXPERT), + PARAM_INDEX_DBSUFFIX(PARAM_INDEX_DBSUFFIX_ID, "--index-dbsuffix", "Index dbsuffix", "A suffix of the db (used for cluster dbs)", typeid(std::string), (void *) &indexDbsuffix, "", MMseqsParameter::COMMAND_HIDDEN), // createdb PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""), PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"), @@ -197,6 +198,8 @@ Parameters::Parameters(): PARAM_SHUFFLE(PARAM_SHUFFLE_ID, "--shuffle", "Shuffle input database", "Shuffle input database", typeid(bool), (void *) &shuffleDatabase, ""), PARAM_WRITE_LOOKUP(PARAM_WRITE_LOOKUP_ID, "--write-lookup", "Write lookup file", "write .lookup file containing mapping from internal id, fasta id and file number", typeid(int), (void *) &writeLookup, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT), PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use header DB", "use the sequence header DB instead of the body to map the entry keys", typeid(bool), (void *) &useHeaderFile, ""), + // setextendeddbtype + PARAM_EXTENDED_DBTYPE(PARAM_EXTENDED_DBTYPE_ID, "--extended-dbtype", "Extended dbtype", "Set extended dbtype 1: compressed, 2: need src, 4: context pseudoe cnts", typeid(int), (void *) &extendedDbtype, "^[0-4]{1}"), // splitsequence PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "Overlap between sequences", typeid(int), (void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"), PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"), @@ -752,6 +755,7 @@ Parameters::Parameters(): indexdb.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE); indexdb.push_back(&PARAM_MAX_SEQ_LEN); indexdb.push_back(&PARAM_MAX_SEQS); + indexdb.push_back(&PARAM_INDEX_DBSUFFIX); indexdb.push_back(&PARAM_MASK_RESIDUES); indexdb.push_back(&PARAM_MASK_PROBABILTY); indexdb.push_back(&PARAM_MASK_LOWER_CASE); @@ -918,6 +922,10 @@ Parameters::Parameters(): subtractdbs.push_back(&PARAM_COMPRESSED); subtractdbs.push_back(&PARAM_V); + // setextendeddbtype + extendeddbtype.push_back(&PARAM_EXTENDED_DBTYPE); + extendeddbtype.push_back(&PARAM_V); + // clusthash clusthash.push_back(&PARAM_SUB_MAT); clusthash.push_back(&PARAM_ALPH_SIZE); @@ -2324,6 +2332,7 @@ void Parameters::setDefaults() { checkCompatible = 0; searchType = SEARCH_TYPE_AUTO; indexSubset = INDEX_SUBSET_NORMAL; + indexDbsuffix = ""; // createdb createdbMode = SEQUENCE_SPLIT_MODE_HARD; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 2693c7b7e..4050d4674 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -186,6 +186,8 @@ class Parameters { static const int INDEX_SUBSET_NORMAL = 0; static const int INDEX_SUBSET_NO_HEADERS = 1; static const int INDEX_SUBSET_NO_PREFILTER = 2; + static const int INDEX_SUBSET_NO_ALIGNMENT = 4; + static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy); @@ -546,6 +548,7 @@ class Parameters { int checkCompatible; int searchType; int indexSubset; + std::string indexDbsuffix; // createdb int identifierOffset; @@ -610,6 +613,9 @@ class Parameters { float overlap; int msaType; + // setextendeddbtype + int extendedDbtype; + // extractalignedregion int extractMode; @@ -892,6 +898,7 @@ class Parameters { PARAMETER(PARAM_CHECK_COMPATIBLE) PARAMETER(PARAM_SEARCH_TYPE) PARAMETER(PARAM_INDEX_SUBSET) + PARAMETER(PARAM_INDEX_DBSUFFIX) // createdb PARAMETER(PARAM_USE_HEADER) // also used by extractorfs @@ -904,6 +911,9 @@ class Parameters { // convert2fasta PARAMETER(PARAM_USE_HEADER_FILE) + // setextendedbtype + PARAMETER(PARAM_EXTENDED_DBTYPE) + // split sequence PARAMETER(PARAM_SEQUENCE_OVERLAP) PARAMETER(PARAM_SEQUENCE_SPLIT_MODE) @@ -1116,6 +1126,7 @@ class Parameters { std::vector offsetalignment; std::vector proteinaln2nucl; std::vector subtractdbs; + std::vector extendeddbtype; std::vector diff; std::vector concatdbs; std::vector mergedbs; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index f8498ed0c..3b2638fbf 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -57,8 +57,10 @@ set(util_source_files util/cpmvrmlndb.cpp util/extractframes.cpp util/sequence2profile.cpp + util/setextendeddbtype.cpp util/sortresult.cpp util/splitdb.cpp + util/setextendeddbtype.cpp util/splitsequence.cpp util/subtractdbs.cpp util/summarizealis.cpp diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp index 906695445..61fbabdc8 100644 --- a/src/util/indexdb.cpp +++ b/src/util/indexdb.cpp @@ -51,18 +51,39 @@ int indexdb(int argc, const char **argv, const Command &command) { const bool sameDB = (par.db1 == par.db2); std::string alnDbtypeFile = par.db1 + "_aln.dbtype"; - std::string seqDbtypeFile = par.db1 + "_seq.dbtype"; - const bool ppDB = FileUtil::fileExists(alnDbtypeFile.c_str()) && FileUtil::fileExists(seqDbtypeFile.c_str()); - - std::string db2 = ppDB ? par.db1 + "_seq" : par.db2; - std::string db2Index = ppDB ? par.db1 + "_seq.index" : par.db2Index; + std::string alnFile = par.db1 + "_aln"; + std::string alnIndexFile = par.db1 + "_aln.index"; + if(FileUtil::fileExists(alnDbtypeFile.c_str()) == false){ + alnDbtypeFile = par.db1 + "_clu.dbtype"; + alnFile = par.db1 + "_clu"; + alnIndexFile = par.db1 + "_clu.index"; + } - std::string hdr1 = ppDB ? par.db1 + "_seq_h" : par.hdr1; - std::string hdr1Index = ppDB ? par.db1 + "_seq_h.index" : par.hdr1Index; DBReader dbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); dbr.open(DBReader::NOSORT); + // remove par.indexDbsuffix from db1 + std::string seqDb = par.db1 + "_seq"; + std::string seqDbIndex = par.db1 + "_seq.index"; + std::string seqDbtypeFile = par.db1 + "_seq.dbtype"; + if (par.indexDbsuffix != "") { + std::string::size_type pos = par.db1.find(par.indexDbsuffix); + if (pos != std::string::npos) { + par.db1 = par.db1.substr(0, pos); + } + seqDb = par.db1 + "_seq" + par.indexDbsuffix; + seqDbIndex = par.db1 + "_seq" + par.indexDbsuffix + ".index"; + seqDbtypeFile = par.db1 + "_seq" + par.indexDbsuffix + ".dbtype"; + } + const bool ppDB = FileUtil::fileExists(alnDbtypeFile.c_str()) && FileUtil::fileExists(seqDbtypeFile.c_str()); + + std::string db2 = ppDB ? seqDb : par.db2; + std::string db2Index = ppDB ? seqDbIndex : par.db2Index; + + std::string hdr1 = ppDB ? seqDb + "_h" : par.hdr1; + std::string hdr1Index = ppDB ? seqDb + "_h.index" : par.hdr1Index; + DBReader *dbr2 = NULL; if ((sameDB == false) || ppDB) { dbr2 = new DBReader(db2.c_str(), db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); @@ -102,7 +123,7 @@ int indexdb(int argc, const char **argv, const Command &command) { // query seq type is actually unknown here, but if we pass DBTYPE_HMM_PROFILE then its +20 k-score int kmerScore = Prefiltering::getKmerThreshold(par.sensitivity, isProfileSearch, contextPseudoCnts, par.kmerScore.values, par.kmerSize); - const std::string& baseDB = ppDB ? par.db1 : par.db2; + const std::string& baseDB = ppDB ? par.db1 + par.indexDbsuffix : par.db2; std::string indexDB = PrefilteringIndexReader::indexName(baseDB); int status = EXIT_SUCCESS; @@ -150,8 +171,10 @@ int indexdb(int argc, const char **argv, const Command &command) { } DBReader *alndbr = NULL; - if (ppDB == true) { - alndbr = new DBReader((par.db1 + "_aln").c_str(), (par.db1 + "_aln.index").c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); + const bool noAlignment = (par.indexSubset & Parameters::INDEX_SUBSET_NO_ALIGNMENT) != 0; + if (ppDB == true && noAlignment == false) { + alndbr = new DBReader(alnFile.c_str(), alnIndexFile.c_str(), + par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); alndbr->open(DBReader::NOSORT); } diff --git a/src/util/mergeresultsbyset.cpp b/src/util/mergeresultsbyset.cpp index 0feee8a91..759efcb59 100644 --- a/src/util/mergeresultsbyset.cpp +++ b/src/util/mergeresultsbyset.cpp @@ -2,6 +2,7 @@ #include "DBReader.h" #include "DBWriter.h" #include "Util.h" +#include "IndexReader.h" #ifdef OPENMP #include @@ -14,10 +15,26 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) { DBReader setReader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); setReader.open(DBReader::LINEAR_ACCCESS); - DBReader resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - resultReader.open(DBReader::NOSORT); +// DBReader resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); +// resultReader.open(DBReader::NOSORT); - DBWriter dbw(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, resultReader.getDbtype()); + + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + unsigned int databaseType = IndexReader::USER_SELECT; + int dbtype = FileUtil::parseDbType(par.db2.c_str()); + if(Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_INDEX_DB)|| + Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_AMINO_ACIDS)|| + Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)|| + Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE)){ + databaseType = IndexReader::ALIGNMENTS; + } + IndexReader resultReader(par.db2, par.threads, + databaseType, + (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + + int dbType = resultReader.sequenceReader->getDbtype(); + dbType = DBReader::setExtendedDbtype(dbType, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC); + DBWriter dbw(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, dbType); dbw.open(); #pragma omp parallel { @@ -35,12 +52,12 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) { while (*data != '\0'){ Util::parseKey(data, dbKey); unsigned int key = Util::fast_atoi(dbKey); - size_t id = resultReader.getId(key); + size_t id = resultReader.sequenceReader->getId(key); if (id == UINT_MAX) { Debug(Debug::ERROR) << "Invalid key " << key << " in entry " << i << ".\n"; EXIT(EXIT_FAILURE); } - buffer.append(resultReader.getData(id, thread_idx)); + buffer.append(resultReader.sequenceReader->getData(id, thread_idx)); data = Util::skipLine(data); } dbw.writeData(buffer.c_str(), buffer.length(), setReader.getDbKey(i), thread_idx); @@ -48,7 +65,6 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) { } } dbw.close(); - resultReader.close(); setReader.close(); return EXIT_SUCCESS; diff --git a/src/util/setextendeddbtype.cpp b/src/util/setextendeddbtype.cpp new file mode 100644 index 000000000..9dab9fb9c --- /dev/null +++ b/src/util/setextendeddbtype.cpp @@ -0,0 +1,21 @@ +#include "Parameters.h" +#include "Util.h" +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "HeaderSummarizer.h" + +#ifdef OPENMP +#include +#endif + +int setextendeddbtype(int argc, const char **argv, const Command& command) { + Parameters& par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + int dbtype = FileUtil::parseDbType(par.db1.c_str()); + // check if dbtype uses isCompressed flag + bool isCompressed = (dbtype & (1 << 31)); + dbtype = DBReader::setExtendedDbtype(dbtype, par.extendedDbtype); + DBWriter::writeDbtypeFile(par.db1.c_str(), dbtype, isCompressed); + return EXIT_SUCCESS; +}