From a77993e47ba7beb1adcbceb78b7dfeb6c47d7bbf Mon Sep 17 00:00:00 2001 From: Jon Degenhardt Date: Sun, 13 Oct 2019 11:09:28 -0700 Subject: [PATCH 1/3] tsv-uniq: avoid unnecessary memory allocation. --- tsv-uniq/src/tsv_utils/tsv-uniq.d | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tsv-uniq/src/tsv_utils/tsv-uniq.d b/tsv-uniq/src/tsv_utils/tsv-uniq.d index f5779a18..5c2f1913 100644 --- a/tsv-uniq/src/tsv_utils/tsv-uniq.d +++ b/tsv-uniq/src/tsv_utils/tsv-uniq.d @@ -220,7 +220,7 @@ struct TsvUniqOptions if (max != 0 || (!equivMode && !numberMode)) max = atLeast; } - if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 1-based indexing. + if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 0-based indexing. } catch (Exception exc) @@ -266,9 +266,9 @@ int main(string[] cmdArgs) */ void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) { - import tsv_utils.common.utils : InputFieldReordering, bufferedByLine, BufferedOutputRange; + import tsv_utils.common.utils : InputFieldReordering, bufferedByLine, BufferedOutputRange, joinAppend; import std.algorithm : splitter; - import std.array : join; + import std.array : appender; import std.conv : to; import std.range; import std.uni : toLower; @@ -285,7 +285,10 @@ void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) struct EquivEntry { size_t equivID; size_t count; } EquivEntry[string] equivHash; - size_t numFields = cmdopt.fields.length; + /* Key buffer when using multi-field keys. */ + auto multiFieldKeyBuffer = appender!(char[]); + + const size_t numKeyFields = cmdopt.fields.length; long nextEquivID = cmdopt.equivStartID; bool headerWritten = false; foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) @@ -343,7 +346,16 @@ void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) (filename == "-") ? 
"Standard Input" : filename, lineNum)); } - key = keyFieldsReordering.outputFields.join(cmdopt.delim); + if (numKeyFields == 1) + { + key = keyFieldsReordering.outputFields[0]; + } + else + { + multiFieldKeyBuffer.clear(); + keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim); + key = multiFieldKeyBuffer.data; + } } if (cmdopt.ignoreCase) key = key.toLower; From 6605d8e38ce7cc9a59c64539cfd723828ccc7443 Mon Sep 17 00:00:00 2001 From: Jon Degenhardt Date: Sun, 13 Oct 2019 11:50:37 -0700 Subject: [PATCH 2/3] tsv-uniq: avoid unnecessary memory allocation for case insensitive compares. --- tsv-uniq/src/tsv_utils/tsv-uniq.d | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tsv-uniq/src/tsv_utils/tsv-uniq.d b/tsv-uniq/src/tsv_utils/tsv-uniq.d index 5c2f1913..601849d0 100644 --- a/tsv-uniq/src/tsv_utils/tsv-uniq.d +++ b/tsv-uniq/src/tsv_utils/tsv-uniq.d @@ -271,7 +271,8 @@ void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) import std.array : appender; import std.conv : to; import std.range; - import std.uni : toLower; + import std.uni : asLowerCase; + import std.utf : byChar; /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields); @@ -285,8 +286,9 @@ void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) struct EquivEntry { size_t equivID; size_t count; } EquivEntry[string] equivHash; - /* Key buffer when using multi-field keys. */ + /* Reusable buffers for multi-field keys and case-insensitive keys. 
*/ auto multiFieldKeyBuffer = appender!(char[]); + auto lowerKeyBuffer = appender!(char[]); const size_t numKeyFields = cmdopt.fields.length; long nextEquivID = cmdopt.equivStartID; @@ -358,7 +360,13 @@ void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) } } - if (cmdopt.ignoreCase) key = key.toLower; + if (cmdopt.ignoreCase) + { + /* Equivalent to key = key.toLower, but without memory allocation. */ + lowerKeyBuffer.clear(); + lowerKeyBuffer.put(key.asLowerCase.byChar); + key = lowerKeyBuffer.data; + } bool isOutput = false; EquivEntry currEntry; From 99bc809bba75c51157699ea82be11811683bc551 Mon Sep 17 00:00:00 2001 From: Jon Degenhardt Date: Sun, 13 Oct 2019 16:14:44 -0700 Subject: [PATCH 3/3] tsv-utils: More unit tests for different key sizes and case sensitivity. --- common/src/tsv_utils/common/utils.d | 7 +- tsv-uniq/tests/gold/basic_tests_1.txt | 128 ++++++++++++++++++++++++++ tsv-uniq/tests/input3.tsv | 21 +++++ tsv-uniq/tests/tests.sh | 13 +++ 4 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 tsv-uniq/tests/input3.tsv diff --git a/common/src/tsv_utils/common/utils.d b/common/src/tsv_utils/common/utils.d index 862c937c..46100c2c 100644 --- a/common/src/tsv_utils/common/utils.d +++ b/common/src/tsv_utils/common/utils.d @@ -1030,9 +1030,6 @@ unittest joinAppend performs a join operation on an input range, appending the results to an output range. -Note: The main uses of joinAppend have been replaced by BufferedOutputRange, which has -its own joinAppend method. - joinAppend was written as a performance enhancement over using std.algorithm.joiner or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower than std.array.join with writeln. The joiner performance may be due to interaction @@ -1046,6 +1043,10 @@ illustrates. It is a modification of the InputFieldReordering example. The role Appender plus joinAppend are playing is to buffer the output. 
BufferedOutputRange uses a similar technique to buffer multiple lines. +Note: The original uses of joinAppend have been replaced by BufferedOutputRange, which has +its own joinAppend method. However, joinAppend remains useful when constructing internal +buffers where BufferedOutputRange is not appropriate. + --- int main(string[] args) { diff --git a/tsv-uniq/tests/gold/basic_tests_1.txt b/tsv-uniq/tests/gold/basic_tests_1.txt index d42b6dec..9481173c 100644 --- a/tsv-uniq/tests/gold/basic_tests_1.txt +++ b/tsv-uniq/tests/gold/basic_tests_1.txt @@ -410,6 +410,134 @@ f1 f2 f3 f4 f5 id 9 ÀBC 1367 1331 18 17 0 Z 5734 602 23 +====Mixed tests=== + +====[tsv-uniq input3.tsv]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +1 GREEN GRÜN 緑 VERDE +2 White Weiß 白い Blanca +3 TEAL BLAUGRÜN ティール AZULADO +4 soccer fútbol サッカー fútbol +5 BASEBALL BASEBALL 野球 BÉISBOL +1 green grün 緑 verde +2 white weiß 白い blanca +3 Teal Blaugrün ティール azulado +4 SOCCER FÚTBOL サッカー FÚTBOL +5 baseball baseball 野球 béisbol +1 green Grün 緑 verde +2 white WEISS 白い Blanca +4 SOCCER FÚTBOL サッカー fútbol +5 baseball BASEBALL 野球 béisbol + +====[tsv-uniq input3.tsv -i]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +2 white WEISS 白い Blanca + +====[tsv-uniq input3.tsv -H -i]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +2 white WEISS 白い Blanca + +====[tsv-uniq input3.tsv -H -f 1]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol + +====[tsv-uniq input3.tsv -H -f 1 -i]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー 
Fútbol +5 Baseball Baseball 野球 Béisbol + +====[tsv-uniq input3.tsv -H -f 2,3]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +1 GREEN GRÜN 緑 VERDE +2 White Weiß 白い Blanca +3 TEAL BLAUGRÜN ティール AZULADO +4 soccer fútbol サッカー fútbol +5 BASEBALL BASEBALL 野球 BÉISBOL +1 green grün 緑 verde +2 white weiß 白い blanca +3 Teal Blaugrün ティール azulado +4 SOCCER FÚTBOL サッカー FÚTBOL +5 baseball baseball 野球 béisbol +1 green Grün 緑 verde +2 white WEISS 白い Blanca +5 baseball BASEBALL 野球 béisbol + +====[tsv-uniq input3.tsv -H -f 2,3 -i]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +2 white WEISS 白い Blanca + +====[tsv-uniq input3.tsv -H -f 2,3 -i]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +2 white WEISS 白い Blanca + +====[tsv-uniq input3.tsv -H -f 2,3,5]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +1 GREEN GRÜN 緑 VERDE +2 White Weiß 白い Blanca +3 TEAL BLAUGRÜN ティール AZULADO +4 soccer fútbol サッカー fútbol +5 BASEBALL BASEBALL 野球 BÉISBOL +1 green grün 緑 verde +2 white weiß 白い blanca +3 Teal Blaugrün ティール azulado +4 SOCCER FÚTBOL サッカー FÚTBOL +5 baseball baseball 野球 béisbol +1 green Grün 緑 verde +2 white WEISS 白い Blanca +4 SOCCER FÚTBOL サッカー fútbol +5 baseball BASEBALL 野球 béisbol + +====[tsv-uniq input3.tsv -H -f 2,3,5 -i]==== +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +2 white WEISS 白い Blanca + ====Max count tests=== ====[tsv-uniq -H --max 0 input1.tsv]==== diff --git a/tsv-uniq/tests/input3.tsv b/tsv-uniq/tests/input3.tsv new file mode 100644 
index 00000000..e53b7233 --- /dev/null +++ b/tsv-uniq/tests/input3.tsv @@ -0,0 +1,21 @@ +f1 f2 f3 f4 f5 +1 Green Grün 緑 Verde +2 WHITE WEIẞ 白い BLANCA +3 teal blaugrün ティール azulado +4 Soccer Fútbol サッカー Fútbol +5 Baseball Baseball 野球 Béisbol +1 GREEN GRÜN 緑 VERDE +2 White Weiß 白い Blanca +3 TEAL BLAUGRÜN ティール AZULADO +4 soccer fútbol サッカー fútbol +5 BASEBALL BASEBALL 野球 BÉISBOL +1 green grün 緑 verde +2 white weiß 白い blanca +3 Teal Blaugrün ティール azulado +4 SOCCER FÚTBOL サッカー FÚTBOL +5 baseball baseball 野球 béisbol +1 green Grün 緑 verde +2 white WEISS 白い Blanca +3 Teal Blaugrün ティール azulado +4 SOCCER FÚTBOL サッカー fútbol +5 baseball BASEBALL 野球 béisbol diff --git a/tsv-uniq/tests/tests.sh b/tsv-uniq/tests/tests.sh index 6d22632b..0ebb7797 100755 --- a/tsv-uniq/tests/tests.sh +++ b/tsv-uniq/tests/tests.sh @@ -50,6 +50,19 @@ runtest ${prog} "input1.tsv -H -f 3,4 --equiv --ignore-case" ${basic_tests_1} runtest ${prog} "input1.tsv --header -f 3,4 --equiv --equiv-start 10 --ignore-case" ${basic_tests_1} runtest ${prog} "input1.tsv --header -f 3,4 --equiv --equiv-start 10 --equiv-header id --ignore-case" ${basic_tests_1} +# Additional tests on keys and case sensitivity +echo "" >> ${basic_tests_1}; echo "====Mixed tests===" >> ${basic_tests_1} +runtest ${prog} "input3.tsv" ${basic_tests_1} +runtest ${prog} "input3.tsv -i" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -i" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 1" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 1 -i" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 2,3" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 2,3 -i" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 2,3 -i" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 2,3,5" ${basic_tests_1} +runtest ${prog} "input3.tsv -H -f 2,3,5 -i" ${basic_tests_1} + # Max unique values echo "" >> ${basic_tests_1}; echo "====Max count tests===" >> ${basic_tests_1} runtest ${prog} "-H --max 0 input1.tsv" ${basic_tests_1}