-
Notifications
You must be signed in to change notification settings - Fork 350
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #136 from xosh/benchmark_lcp
Benchmark lcp
- Loading branch information
Showing
16 changed files
with
378 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
* | ||
!.gitignore | ||
!lcp.config | ||
!README.md | ||
!bin/ | ||
!src/ | ||
!visualize/ | ||
!compile_options.config | ||
!Makefile | ||
!results/ | ||
!test_case.config |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Shared build settings and helper functions; presumably the source of
# config_ids, config_select, dim, MY_CXX and MY_CXX_FLAGS (not defined here).
include ../../Make.helper

CFLAGS = $(MY_CXX_FLAGS)
SRC_DIR = src
BIN_DIR = bin
LIBS = -lsdsl -ldivsufsort -ldivsufsort64

# IDs (first field) of every entry in the three configuration files.
C_OPTIONS:=$(call config_ids,compile_options.config)
TC_IDS:=$(call config_ids,test_case.config)
LCP_IDS:=$(call config_ids,lcp.config)

# Field 2 of test_case.config for every test case, i.e. the test file paths.
DL = $(foreach TC_ID,$(TC_IDS),$(call config_select,test_case.config,$(TC_ID),2))

# One benchmark binary per LCP algorithm and one result file per test case.
LCP_EXECS = $(addprefix $(BIN_DIR)/build_,$(LCP_IDS))
RES_FILES = $(addprefix results/,$(TC_IDS))

# Concatenation of all per-test-case result files.
RESULT_FILE=results/all.txt
|
||
# Command-style targets: a file named "execs" or "timing" must never make
# them look up to date.
.PHONY: execs timing

# Build the SA/BWT preparation tool and one binary per LCP algorithm.
execs: $(BIN_DIR)/prep_sa_bwt $(LCP_EXECS)

# Run all test cases, merge the per-test-case result files into
# $(RESULT_FILE) and build the report in visualize/.
# `&&` stops if the cd fails; $(MAKE) propagates -j/-n to the sub-make.
timing: execs $(RES_FILES)
	@cat $(RES_FILES) > $(RESULT_FILE)
	@cd visualize && $(MAKE)
|
||
# Compile the tool that stores the text and builds the suffix array and BWT
# for a test case (src/create_sa_bwt.cpp).
# The binary is written to $@ rather than a hard-coded bin/ path, so the rule
# stays correct if BIN_DIR is changed. The space before the line continuation
# keeps the -L option and the source file as separate arguments.
$(BIN_DIR)/prep_sa_bwt: $(SRC_DIR)/create_sa_bwt.cpp
	@echo "Compiling prep_sa_bwt"
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) -L${SDSLLITE}/lib \
		$< -I${SDSLLITE}/include -o $@ $(LIBS)
|
||
# precalc<TC_ID>: start the result file of one test case.
# Looks the test case up in test_case.config (field 2 = path, field 3 = LaTeX
# name), writes the "# TC_ID", "# TC_TEX_NAME" and "# TC_SIZE" header lines to
# results/<TC_ID> and appends the output of prep_sa_bwt for the test file.
# The former LCP_TEX_NAME lookup was dropped: it read LCP_ID, which this rule
# never sets, and its result was not used anywhere.
precalc%: test_case.config $(DL) lcp.config
	$(eval TC_ID:=$(call dim,1,$*))
	$(eval TC_TEX_NAME:=$(call config_select,test_case.config,$(TC_ID),3))
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	$(eval TC_SIZE:=$(shell wc -c <$(TC_PATH)))
	@echo "Running test case: $(TC_ID)"
	@echo "# TC_ID = $(TC_ID)" > results/$(TC_ID)
	@echo "# TC_TEX_NAME = $(TC_TEX_NAME)">> results/$(TC_ID)
	@echo "# TC_SIZE = $(TC_SIZE)">> results/$(TC_ID)
	@$(BIN_DIR)/prep_sa_bwt $(TC_PATH) >> results/$(TC_ID)
|
||
# results/<TC_ID>: run every LCP binary and append its output to the result
# file that precalc<TC_ID> started.
# The binaries run in a shell loop inside the recipe, not through $(shell ...)
# while make expands the recipe text. The temporary lcp/isa files are removed
# after each run; the remaining *.sdsl files are removed at the end.
# `rm -f` keeps the rule from failing when no *.sdsl file is left.
results/%: precalc%
	@for lcp_exec in $(LCP_EXECS); do \
		$$lcp_exec >> $@; \
		rm -f lcp_tmp.sdsl isa_tmp.sdsl; \
	done
	@rm -f *.sdsl
|
||
# Compile one LCP benchmark binary per algorithm.
# LCP_ID is derived from the pattern stem via the `dim` helper; field 2 of
# lcp.config gives the construction routine. Both are passed to
# create_lcp.cpp as the macros LCP_TYPE and LCPID.
$(BIN_DIR)/build_%: $(SRC_DIR)/create_lcp.cpp lcp.config
	$(eval LCP_ID:=$(call dim,1,$*))
	$(eval LCP_TYPE:=$(call config_select,lcp.config,$(LCP_ID),2))
	@echo "Compiling build_$*"
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) \
		-DLCP_TYPE="$(LCP_TYPE)" -DLCPID="$(LCP_ID)" \
		-L${SDSLLITE}/lib $< -I${SDSLLITE}/include \
		-o $@ $(LIBS)
|
||
|
||
include ../Make.download | ||
|
||
# Cleanup targets are commands, not files.
.PHONY: clean-build clean-result cleanall

# Remove the compiled benchmark binaries.
clean-build:
	@echo "Remove executables"
	rm -f $(BIN_DIR)/build*
	rm -f $(BIN_DIR)/prep*

# Remove all per-test-case result files and the merged results.
clean-result:
	@echo "Remove results"
	rm -f results/*

# Remove binaries and results.
cleanall: clean-build clean-result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Benchmarking LCP algorithms | ||
|
||
## Methodology | ||
|
||
Explored dimensions: | ||
|
||
* lcp algorithms | ||
* test cases | ||
|
||
## Directory structure | ||
|
||
* [bin](./bin): Contains the executables of the project. | ||
* [results](./results): Contains the results of the experiments. | ||
* [src](./src): Contains the source code of the benchmark. | ||
* [visualize](./visualize): Contains an `R` script that generates | ||
a report in LaTeX format. | ||
|
||
## Prerequisites | ||
|
||
* For the visualization you need the following software: | ||
- [R][RPJ] with package `xtable`. You can install the | ||
package by calling `install.packages("xtable")` in R. | ||
- [pdflatex][LT] to generate the pdf reports. | ||
|
||
## Usage | ||
|
||
* `make timing` compiles the programs, downloads | ||
the test instances, builds the LCP arrays and generates a report located at | ||
`visualize/lcp.pdf`. The raw timing numbers | ||
can be found in `results/all.txt`. | ||
* All created binaries and test results can be deleted | ||
by calling `make cleanall`. | ||
|
||
## Customization of the benchmark | ||
|
||
The project contains several configuration files: | ||
|
||
* [lcp.config][LCPCONFIG]: Specify different LCP algorithms. | ||
* [test_case.config][TCCONF]: Specify test instances by ID, path, LaTeX-name | ||
for the report, and download URL. | ||
* [compile_options.config][CCONF]: Specify compile options by option string. | ||
|
||
Note that the benchmark will execute every combination of lcp algorithms and test cases. | ||
|
||
[RPJ]: http://www.r-project.org/ "R" | ||
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex" | ||
[LCPCONFIG]: ./lcp.config "lcp.config" | ||
[TCCONF]: ./test_case.config "test_case.config" | ||
[CCONF]: ./compile_options.config "compile_options.config" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Compile options | ||
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# This file specifies the LCP construction algorithms that are used in the benchmark. | ||
# | ||
# Each LCP algorithm is specified by a 4-tuple: LCP_ID;LCP_ALGORITHM;LCP_LATEX_NAME;BWT_NEEDED | ||
# * LCP_ID : An identifier for the algorithm. Only letters and underscores are allowed in ID. | ||
# * LCP_ALGORITHM : Corresponding LCP algorithm. | ||
# * LCP_LATEX_NAME: LaTeX name for output in the benchmark report. | ||
# * BWT_NEEDED : T(rue) if the LCP algorithm needs the BWT as input, otherwise F(alse). | ||
kasai;construct_lcp_kasai<8>;lcp-kasai;F | ||
phi_algorithm;construct_lcp_PHI<8>;lcp-$\Phi$;F | ||
semi_extern_phi;construct_lcp_semi_extern_PHI;lcp-semi-extern-$\Phi$;F | ||
go;construct_lcp_go;lcp-go;T | ||
goPhi;construct_lcp_goPHI;lcp-go-$\Phi$;T | ||
bwtb;construct_lcp_bwt_based;lcp-bwt-based;T | ||
bwtb2;construct_lcp_bwt_based2;lcp-bwt-based2;T |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
* | ||
!.gitignore | ||
!create_lcp.cpp | ||
!create_sa_bwt.cpp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#include <sdsl/sdsl_concepts.hpp> | ||
#include <sdsl/int_vector.hpp> | ||
#include <sdsl/construct.hpp> | ||
#include <sdsl/construct_lcp.hpp> | ||
#include <string> | ||
#include <chrono> | ||
|
||
using namespace sdsl; | ||
using namespace std; | ||
using namespace std::chrono; | ||
|
||
#define S(x) #x | ||
#define SX(x) S(x) | ||
|
||
int main(int argc, char** argv) | ||
{ | ||
memory_monitor::start(); | ||
string dir = "."; | ||
string id = "tmp"; | ||
cache_config config(false, dir, id); | ||
|
||
register_cache_file(conf::KEY_TEXT, config); | ||
register_cache_file(conf::KEY_SA, config); | ||
register_cache_file(conf::KEY_BWT, config); | ||
|
||
auto start = high_resolution_clock::now(); | ||
LCP_TYPE(config); | ||
auto stop = high_resolution_clock::now(); | ||
memory_monitor::stop(); | ||
cout << "# " SX(LCPID) "_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl; | ||
cout << "# " SX(LCPID) "_MMPEAK = "<< memory_monitor::peak() << endl; | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#include <sdsl/sdsl_concepts.hpp> | ||
#include <sdsl/int_vector.hpp> | ||
#include <sdsl/construct.hpp> | ||
#include <sdsl/construct_sa.hpp> | ||
#include <sdsl/construct_bwt.hpp> | ||
#include <string> | ||
#include <chrono> | ||
#include <iostream> | ||
|
||
using namespace sdsl; | ||
using namespace std; | ||
using namespace std::chrono; | ||
|
||
typedef bit_vector::size_type size_type; | ||
|
||
//argv[1] = test file | ||
int main(int argc, char** argv) | ||
{ | ||
memory_monitor::start(); | ||
string file = argv[1]; | ||
uint8_t num_bytes = 1; // Byte Alphabet | ||
string dir = "."; | ||
string id = "tmp"; | ||
cache_config config(false, dir, id); | ||
|
||
//load text | ||
auto start = high_resolution_clock::now(); | ||
{ | ||
int_vector<8> text; | ||
load_vector_from_file(text, file, num_bytes); | ||
if (contains_no_zero_symbol(text, file)) { | ||
append_zero_symbol(text); | ||
store_to_cache(text, conf::KEY_TEXT, config); | ||
} | ||
register_cache_file(conf::KEY_TEXT, config); | ||
} | ||
auto stop = high_resolution_clock::now(); | ||
memory_monitor::stop(); | ||
cout << "# TXT_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl; | ||
cout << "# TXT_MMPEAK = " << memory_monitor::peak() << endl; | ||
|
||
//construct sa | ||
memory_monitor::start(); | ||
start = high_resolution_clock::now(); | ||
{ | ||
construct_sa<8>(config); | ||
register_cache_file(conf::KEY_SA, config); | ||
} | ||
stop = high_resolution_clock::now(); | ||
memory_monitor::stop(); | ||
cout << "# SA_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl; | ||
cout << "# SA_MMPEAK = " << memory_monitor::peak() << endl; | ||
|
||
//construct bwt | ||
memory_monitor::start(); | ||
start = high_resolution_clock::now(); | ||
{ | ||
construct_bwt<8>(config); | ||
register_cache_file(conf::KEY_BWT, config); | ||
} | ||
stop = high_resolution_clock::now(); | ||
memory_monitor::stop(); | ||
cout << "# BWT_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 <<endl; | ||
cout << "# BWT_MMPEAK = "<< memory_monitor::peak() << endl; | ||
|
||
return 0; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Configuration for test files | ||
# (1) Identifier for test file (consisting of letters and hyphens, no `.`) | ||
# (2) Path to the test file | ||
# (3) LaTeX name | ||
# (4) Download link (if the test is available online) | ||
ENGLISH;../data/english.200MB;english.200MB;http://pizzachili.di.unipi.it/texts/nlang/english.200MB.gz | ||
DBLPXML;../data/dblp.xml.200MB;dblp.xml.200MB;http://pizzachili.di.unipi.it/texts/xml/dblp.xml.200MB.gz | ||
DNA;../data/dna.200MB;dna.200MB;http://pizzachili.di.unipi.it/texts/dna/dna.200MB.gz | ||
PROTEINS;../data/proteins.200MB;proteins.200MB;http://pizzachili.di.unipi.it/texts/protein/proteins.200MB.gz | ||
SOURCES;../data/sources.200MB;sources.200MB;http://pizzachili.di.unipi.it/texts/code/sources.200MB.gz | ||
INFLUENZA;../data/influenza;influenza;http://pizzachili.dcc.uchile.cl/repcorpus/real/influenza.gz | ||
EINSTEIN-de;../data/einstein.de.txt;einstein-de;http://pizzachili.dcc.uchile.cl/repcorpus/real/einstein.de.txt.gz | ||
EINSTEIN-en;../data/einstein.en.txt;einstein-en;http://pizzachili.dcc.uchile.cl/repcorpus/real/einstein.en.txt.gz | ||
PARA;../data/para;para;http://pizzachili.dcc.uchile.cl/repcorpus/real/para.gz | ||
WORLDLEADER;../data/world_leaders;world-leaders;http://pizzachili.dcc.uchile.cl/repcorpus/real/world_leaders.gz | ||
E-COLI;../data/Escherichia_Coli;E.coli;http://pizzachili.dcc.uchile.cl/repcorpus/real/Escherichia_Coli.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
* | ||
!.gitignore | ||
!Makefile | ||
!lcp-header.tex | ||
!lcp-footer.tex | ||
!lcp.R |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
include ../../../Make.helper | ||
|
||
CONFIG_FILES= ../test_case.config | ||
|
||
# Command-style targets.
.PHONY: all clean

# Default target: build the PDF report.
all: lcp.pdf

# Typeset the report. The log name now matches the one removed by `clean`
# (it was written as LaTeX.Log, which `clean` misses on case-sensitive
# file systems).
lcp.pdf: lcp.tex
	@echo "Use pdflatex to generate lcp.pdf"
	@pdflatex lcp.tex >> LaTeX.log 2>&1

# Generate the LaTeX source from the merged benchmark results with lcp.R.
lcp.tex: ../results/all.txt ../../basic_functions.R lcp.R $(CONFIG_FILES)
	@echo "Use R to generate lcp.tex"
	@R --vanilla < lcp.R > R.log 2>&1

# Remove the report, the intermediate files and the logs.
clean:
	rm -f lcp.pdf lcp.aux lcp.tex fig* \
		lcp.log R.log LaTeX.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
\end{document} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
\documentclass[9pt,a4paper,DIV10]{scrartcl} | ||
\usepackage{booktabs} | ||
\usepackage{array} | ||
\usepackage{ragged2e} | ||
|
||
\begin{document} | ||
|
||
\pagestyle{empty} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
library(xtable) | ||
source("../../basic_functions.R") | ||
|
||
tex_file = "lcp.tex" | ||
|
||
tc_config <- readConfig("../test_case.config",c("TC_ID","PATH","LATEX_NAME","URL")) | ||
lcp_config <- readConfig("../lcp.config",c("LCP_ID","LCP_TYPE","LATEX_NAME","BWT")) | ||
|
||
|
||
# Build the LaTeX header of a result table.
#
# names: character vector of column-group titles.
#
# Returns one string: "\toprule", then a two-column \multicolumn heading per
# title (each preceded by "&&"), a row break, and one \cmidrule per title
# covering columns 3*i to 3*i+1. Underscores are escaped for LaTeX.
make_latex_header <- function(names){
  x <- paste("&&\\multicolumn{2}{c}{", names,"}")
  x <- paste(x, collapse=" ")
  clines=""
  # seq_along() instead of 1:length(names): for an empty `names` the latter
  # iterates over 1 and 0 and emits bogus \cmidrule entries.
  for(i in seq_along(names)){
    clines <- paste(clines,"\\cmidrule{",3*i,"-",3*i+1,"}",sep="")
  }
  y <- paste("\\toprule",x, "\\\\",clines,"\n")
  # escape "_" as "\_"
  gsub("_","\\\\_",y)
}
|
||
#read header | ||
sink(tex_file) | ||
cat(paste(readLines("lcp-header.tex"),collapse="\n")) | ||
|
||
maindata <- data_frame_from_key_value_pairs( "../results/all.txt" ) | ||
|
||
names<-c("SA","BWT","LCP","OVERALL") | ||
unitrow <- paste(c("", rep(c("&&Time", "&Space"), length(names)), "\\\\","", rep(c("&&(sec)", "&(\\%)"), length(names)), "\\\\[1ex]"), collapse="", sep='') | ||
|
||
# create a table for each test case | ||
# Each row of maindata holds the key/value results of one test case.
# seq_len() is used for both loops: 1:n would iterate over 1 and 0 when
# there are no result rows or no configured algorithms.
for(i in seq_len(nrow(maindata))){

  data<-maindata[i,]
  row<-nrow(lcp_config)
  size<-data[["TC_SIZE"]]
  # one table row per LCP algorithm; the EMPTY* columns are left as empty strings
  table<-data.frame(EMPTY=character(row),SATIME=character(row),SASPACE=character(row),EMPTY2=character(row),BWTTIME=character(row),BWTSPACE=character(row),EMPTY3=character(row),LCPTIME=character(row),LCPSPACE=character(row),EMPTY4=character(row),OVERALLTIME=character(row),OVERALLSPACE=character(row),stringsAsFactors=FALSE)

  # gather data
  # Times are printed with two decimals; space is the peak value as a
  # percentage of the test case size, rounded to an integer.
  for(l in seq_len(row)){
    table[l,]["SATIME"]<-sprintf("%.2f",data[["SA_TIME"]])
    table[l,]["SASPACE"]<-round(data[["SA_MMPEAK"]]*100/size, digits=0)

    if(lcp_config[["BWT"]][l]){
      # algorithm is flagged as using the BWT: BWT cost counts towards the overall figures
      table[l,]["BWTTIME"]<-sprintf("%.2f",data[["BWT_TIME"]])
      table[l,]["BWTSPACE"]<-round(data[["BWT_MMPEAK"]]*100/size, digits=0)
      table[l,]["OVERALLTIME"]<-sprintf("%.2f",data[["SA_TIME"]]+data[["BWT_TIME"]]+data[[paste(lcp_config[["LCP_ID"]][l],"_TIME",sep="")]])
      table[l,]["OVERALLSPACE"]<-round(max(data[["SA_MMPEAK"]],data[["BWT_MMPEAK"]],data[[paste(lcp_config[["LCP_ID"]][l],"_MMPEAK",sep="")]])*100/size, digits=0)
    }
    else{
      # no BWT used: show "-" and leave the BWT cost out of the overall figures
      table[l,]["BWTTIME"]<-"-"
      table[l,]["BWTSPACE"]<-"-"
      table[l,]["OVERALLTIME"]<-sprintf("%.2f",data[["SA_TIME"]]+data[[paste(lcp_config[["LCP_ID"]][l],"_TIME",sep="")]])
      table[l,]["OVERALLSPACE"]<-round(max(data[["SA_MMPEAK"]],data[[paste(lcp_config[["LCP_ID"]][l],"_MMPEAK",sep="")]])*100/size, digits=0)
    }

    table[l,]["LCPTIME"]<-sprintf("%.2f",data[[paste(lcp_config[["LCP_ID"]][l],"_TIME",sep="")]])
    table[l,]["LCPSPACE"]<-round(data[[paste(lcp_config[["LCP_ID"]][l],"_MMPEAK",sep="")]]*100/size, digits=0)
  }

  row.names(table)<-lcp_config[["LATEX_NAME"]]

  # convert and print table
  ali <- c("l", rep(c("@{\\hspace{1ex}}l","c","c"), (ncol(table))/3) )
  dig <- c(0, rep(c(0,3,0),(ncol(table))/3 ))

  print( xtable(table, align=ali, digits=dig,
                caption = paste("Results for ",as.character(data[["TC_TEX_NAME"]])," (size: ",round(size/(1024^2), digits=3),"MB). Runtime in seconds. Space is the peak memory usage (including input and output) as fraction of original file size.")),
         add.to.row=list(pos=list(-1,0,nrow(table)), command=c(make_latex_header(names),unitrow,"\\bottomrule")),
         hline.after=c(),
         sanitize.rownames.function = identity,
         include.colnames = FALSE
       )
}
|
||
cat(paste(readLines("lcp-footer.tex"),collapse="\n")) | ||
sink(NULL) |