Skip to content

Commit

Permalink
Line buffering updates. General fixes, tsv-select support. (#334)
Browse files Browse the repository at this point in the history
  • Loading branch information
jondegenhardt authored Mar 4, 2021
1 parent 27935f0 commit 9d4b110
Show file tree
Hide file tree
Showing 12 changed files with 123 additions and 58 deletions.
4 changes: 2 additions & 2 deletions bash_completion/tsv-utils
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ _tsv_filter()
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
prev="${COMP_WORDS[COMP_CWORD-1]}"
opts="--help --help-verbose --help-fields --help-options --version --header --or --invert --count --delimiter --empty --not-empty --blank --not-blank --is-numeric --is-finite --is-nan --is-infinity --le --lt --ge --gt --eq --ne --str-le --str-lt --str-ge --str-gt --str-eq --istr-eq --str-ne --istr-ne --str-in-fld --istr-in-fld --str-not-in-fld --istr-not-in-fld --regex --iregex --not-regex --not-iregex --char-len-le --char-len-lt --char-len-ge --char-len-gt --char-len-eq --char-len-ne --byte-len-le --byte-len-lt --byte-len-ge --byte-len-gt --byte-len-eq --byte-len-ne --ff-le --ff-lt --ff-ge --ff-gt --ff-eq --ff-ne --ff-str-eq --ff-istr-eq --ff-str-ne --ff-istr-ne --ff-absdiff-le --ff-absdiff-gt ff-reldiff-le --ff-reldiff-gt"
opts="--help --help-verbose --help-fields --help-options --version --header --or --invert --count --delimiter --line-buffered --empty --not-empty --blank --not-blank --is-numeric --is-finite --is-nan --is-infinity --le --lt --ge --gt --eq --ne --str-le --str-lt --str-ge --str-gt --str-eq --istr-eq --str-ne --istr-ne --str-in-fld --istr-in-fld --str-not-in-fld --istr-not-in-fld --regex --iregex --not-regex --not-iregex --char-len-le --char-len-lt --char-len-ge --char-len-gt --char-len-eq --char-len-ne --byte-len-le --byte-len-lt --byte-len-ge --byte-len-gt --byte-len-eq --byte-len-ne --ff-le --ff-lt --ff-ge --ff-gt --ff-eq --ff-ne --ff-str-eq --ff-istr-eq --ff-str-ne --ff-istr-ne --ff-absdiff-le --ff-absdiff-gt ff-reldiff-le --ff-reldiff-gt"

# Options requiring an argument or precluding other options
case $prev in
Expand Down Expand Up @@ -206,7 +206,7 @@ _tsv_select()
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
prev="${COMP_WORDS[COMP_CWORD-1]}"
opts="--help --help-verbose --help-fields --version --header --fields --exclude --rest --delimiter"
opts="--help --help-verbose --help-fields --version --header --fields --exclude --rest --delimiter --line-buffered"

# Options requiring an argument or precluding other options
# Options with a restricted set of arguments (i.e. -r|--rest) have their own case clause.
Expand Down
99 changes: 76 additions & 23 deletions common/src/tsv_utils/common/utils.d
Original file line number Diff line number Diff line change
Expand Up @@ -399,8 +399,9 @@ constructor when mixing specific setting with defaults.
*/
enum BufferedOutputRangeDefaults
{
reserveSize = 11264,
flushSize = 10240,
lineBufferedFlushSize = 1,
reserveSize = 11264,
maxSize = 4194304
}

Expand All @@ -416,8 +417,12 @@ lines, as it blocks many writes together in a single write.
The internal buffer is written to the output stream after flushSize has been reached.
This is checked at newline boundaries, when appendln is called or when put is called
with a single newline character. Other writes check maxSize, which is used to avoid
runaway buffers. An implication is that line buffering can be achieved on by specifying
flushsize as 1.
runaway buffers.
This scheme only flushes the internal buffer, it does not flush the output stream.
Use flush() to flush both the internal buffer and the output stream. Specify flushSize
as BufferedOutputRangeDefaults.lineBufferedFlushSize in the constructor to get line
buffering with immediate flushes to the output stream.
BufferedOutputRange has a put method allowing it to be used as a range. It has a number
of other methods providing additional control.
Expand All @@ -437,13 +442,12 @@ $(LIST
* `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`.
For reasons that are not clear, joiner is quite slow.
* `flushIfFull()` - Flush the internal buffer to the output stream if flushSize has been
reached.
* `flush()` - Write the internal buffer to the output stream.
* `flush()` - Write the internal buffer to the output stream and flush the output stream.
* `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single
newline character, '\n' or "\n".
* `flushBuffer()` - This flushes both the internal buffers and the output stream.
)
The internal buffer is automatically flushed when the BufferedOutputRange goes out of
Expand Down Expand Up @@ -489,25 +493,40 @@ if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, ch
flush();
}

void flush()
private void flushBuffer()
{
static if (isFileHandle!OutputTarget) _outputTarget.rawWrite(_outputBuffer.data);
static if (isFileHandle!OutputTarget)
{
_outputTarget.rawWrite(_outputBuffer.data);

if (_flushSize == BufferedOutputRangeDefaults.lineBufferedFlushSize)
{
_outputTarget.flush();
}
}
else _outputTarget.put(_outputBuffer.data);

_outputBuffer.clear;
}

bool flushIfFull()
void flush()
{
flushBuffer();
static if (isFileHandle!OutputTarget) _outputTarget.flush();
}

/* flushIfFull flushes the internal buffer if flushSize has been reached. */
private bool flushIfFull()
{
bool isFull = _outputBuffer.data.length >= _flushSize;
if (isFull) flush();
if (isFull) flushBuffer();
return isFull;
}

/* flushIfMaxSize is a safety check to avoid runaway buffer growth. */
void flushIfMaxSize()
private void flushIfMaxSize()
{
if (_outputBuffer.data.length >= _maxSize) flush();
if (_outputBuffer.data.length >= _maxSize) flushBuffer();
}

/* maybeFlush is intended for the case where put is called with a trailing newline.
Expand All @@ -525,7 +544,6 @@ if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, ch
return doFlush;
}


private void appendRaw(T)(T stuff) pure
{
import std.range : rangePut = put;
Expand Down Expand Up @@ -869,17 +887,17 @@ if (is(Char == char) || is(Char == ubyte))
* - _lineEnd - End of current line.
*/
private File _file;
private immutable LineBuffered _lineBuffered;
private ubyte[] _buffer;
private size_t _lineStart = 0;
private size_t _lineEnd = 0;
private size_t _dataEnd = 0;
private LineBuffered _lineBuffered;

this (File f, LineBuffered lineBuffered)
{
_file = f;
_buffer = new ubyte[readSize + growSize];
_lineBuffered = lineBuffered;
_buffer = new ubyte[readSize + growSize];
}

bool empty() const pure
Expand Down Expand Up @@ -2036,10 +2054,10 @@ byLineSourceRange is a helper function for creating new byLineSourceRange object
*/
auto byLineSourceRange(
KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
(string[] filepaths)
(string[] filepaths, LineBuffered lineBuffered = No.lineBuffered)
if (is(Char == char) || is(Char == ubyte))
{
return new ByLineSourceRange!(keepTerminator, Char, terminator)(filepaths);
return new ByLineSourceRange!(keepTerminator, Char, terminator)(filepaths, lineBuffered);
}

/**
Expand Down Expand Up @@ -2077,16 +2095,18 @@ if (is(Char == char) || is(Char == ubyte))
alias ByLineSourceType = ByLineSource!(keepTerminator, char, terminator);

private string[] _filepaths;
private immutable LineBuffered _lineBuffered;
private ByLineSourceType _front;

this(string[] filepaths)
this(string[] filepaths, LineBuffered lineBuffered = No.lineBuffered)
{
_filepaths = filepaths.dup;
_lineBuffered = lineBuffered;
_front = null;

if (!_filepaths.empty)
{
_front = new ByLineSourceType(_filepaths.front);
_front = new ByLineSourceType(_filepaths.front, _lineBuffered);
_front.open;
_filepaths.popFront;
}
Expand Down Expand Up @@ -2116,7 +2136,7 @@ if (is(Char == char) || is(Char == ubyte))

if (!_filepaths.empty)
{
_front = new ByLineSourceType(_filepaths.front);
_front = new ByLineSourceType(_filepaths.front, _lineBuffered);
_front.open;
_filepaths.popFront;
}
Expand Down Expand Up @@ -2162,15 +2182,17 @@ if (is(Char == char) || is(Char == ubyte))
alias ByLineType = ReturnType!newByLineFn;

private immutable string _filepath;
private immutable LineBuffered _lineBuffered;
private immutable bool _isStdin;
private bool _isOpen;
private bool _hasBeenOpened;
private File _file;
private ByLineType _byLineRange;

private this(string filepath) pure nothrow @safe
private this(string filepath, LineBuffered lineBuffered = No.lineBuffered) pure nothrow @safe
{
_filepath = filepath;
_lineBuffered = lineBuffered;
_isStdin = filepath == "-";
_isOpen = false;
_hasBeenOpened = false;
Expand Down Expand Up @@ -2229,7 +2251,7 @@ if (is(Char == char) || is(Char == ubyte))
assert(!_hasBeenOpened);

_file = isStdin ? stdin : _filepath.File("rb");
_byLineRange = newByLineFn(_file);
_byLineRange = newByLineFn(_file, _lineBuffered);
_isOpen = true;
_hasBeenOpened = true;
}
Expand Down Expand Up @@ -2363,6 +2385,37 @@ unittest

/* The ByLineSourceRange is a reference range, consumed by the foreach. */
assert(inputSourcesYesTerminator.empty);

/* Using Yes.keepTerminator, Yes.lineBuffered. */
readSourcesYesTerminator.clear;
auto inputSourcesYesTerminatorYesLineBuffered =
byLineSourceRange!(Yes.keepTerminator)(inputFiles[0 .. numFiles], Yes.lineBuffered);
assert(inputSourcesYesTerminatorYesLineBuffered.length == numFiles);

foreach(fileNum, source; inputSourcesYesTerminatorYesLineBuffered.enumerate)
{
readSourcesYesTerminator.put(source);
assert(source.isOpen);
assert(source._file.isOpen);
assert(readSourcesYesTerminator.data[0 .. fileNum].all!(s => !s.isOpen));
assert(readSourcesYesTerminator.data[fileNum].isOpen);

assert(source.byLine.empty || source.byLine.front == fileHeaders[fileNum]);

assert(source.name == inputFiles[fileNum]);
assert(!source.isStdin);

auto readFileData = appender!(char[]);
foreach(line; source.byLine)
{
readFileData.put(line);
}

assert(readFileData.data == fileData[fileNum]);
}

/* The ByLineSourceRange is a reference range, consumed by the foreach. */
assert(inputSourcesYesTerminatorYesLineBuffered.empty);
}

/* Empty filelist. */
Expand Down
2 changes: 1 addition & 1 deletion csv2tsv/src/tsv_utils/csv2tsv.d
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ else
catch (Exception exc)
{
writeln();
stdin.flush();
stdout.flush();
stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
return 1;
}
Expand Down
6 changes: 3 additions & 3 deletions csv2tsv/tests/gold/error_tests_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Error test set 1
----------------

====[csv2tsv nosuchfile.txt]====
Error [csv2tsv]: Cannot open file `nosuchfile.txt' in mode `rb' (No such file or directory)

Error [csv2tsv]: Cannot open file `nosuchfile.txt' in mode `rb' (No such file or directory)

====[csv2tsv --nosuchparam input1.txt]====
[csv2tsv] Error processing command line arguments: Unrecognized option --nosuchparam
Expand Down Expand Up @@ -60,14 +60,14 @@ Error [csv2tsv]: Cannot open file `nosuchfile.txt' in mode `rb' (No such file or
[csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement).

====[csv2tsv invalid1.csv]====
Error [csv2tsv]: Invalid CSV. Improperly terminated quoted field. File: invalid1.csv, Line: 3
field1 field2 field3
100 ab c de f
200 gh i,
Error [csv2tsv]: Invalid CSV. Improperly terminated quoted field. File: invalid1.csv, Line: 3

====[csv2tsv invalid2.csv]====
Error [csv2tsv]: Invalid CSV. Improperly terminated quoted field. File: invalid2.csv, Line: 4
field1 field2 field3
100 ab c de f
200 gh i jk l
300 mn o pq r
Error [csv2tsv]: Invalid CSV. Improperly terminated quoted field. File: invalid2.csv, Line: 4
6 changes: 4 additions & 2 deletions tsv-filter/src/tsv_utils/tsv-filter.d
Original file line number Diff line number Diff line change
Expand Up @@ -1013,8 +1013,9 @@ void tsvFilter(ref TsvFilterOptions cmdopt)
/* BufferedOutputRange improves performance on narrow files with high percentages of
* writes.
*/
immutable size_t flushSize =
cmdopt.lineBuffered ? 1 : BufferedOutputRangeDefaults.reserveSize;
immutable size_t flushSize = cmdopt.lineBuffered ?
BufferedOutputRangeDefaults.lineBufferedFlushSize :
BufferedOutputRangeDefaults.flushSize;
auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout, flushSize);
size_t matchedLines = 0;

Expand Down Expand Up @@ -1086,6 +1087,7 @@ void tsvFilter(ref TsvFilterOptions cmdopt)
}
catch (Exception e)
{
bufferedOutput.flush;
throw new Exception(
format("Could not process line or field: %s\n File: %s Line: %s%s",
e.msg, inputStream.name, lineNum,
Expand Down
8 changes: 4 additions & 4 deletions tsv-filter/tests/gold/error_tests_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ Error test set 1
Expected: '--eq <field>:<val>' or '--eq <field-list>:<val> where <val> is a number.

====[tsv-filter --header --le 1000:10 input1.tsv]====
Error [tsv-filter]: Not enough fields in line. File: input1.tsv, Line: 2
F1 F2 F3 F4
Error [tsv-filter]: Not enough fields in line. File: input1.tsv, Line: 2

====[tsv-filter --header --le 1: input1.tsv]====
[tsv-filter] Error processing command line arguments: Invalid option: [--le 1:]. No value after field list.
Expand Down Expand Up @@ -296,12 +296,12 @@ Error [tsv-filter]: Windows/DOS line ending found. Convert file to Unix newlines
File: input1_dos.tsv, Line: 1

====[tsv-filter --header --eq 2:1 input1.tsv input1_dos.tsv]====
Error [tsv-filter]: Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').
File: input1_dos.tsv, Line: 1
F1 F2 F3 F4
1 1.0 a A
Error [tsv-filter]: Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').
File: input1_dos.tsv, Line: 1

====[tsv-filter --str-eq 4:ABC input1.tsv input1_dos.tsv]====
10 10.1 abc ABC
Error [tsv-filter]: Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').
File: input1_dos.tsv, Line: 1
10 10.1 abc ABC
2 changes: 1 addition & 1 deletion tsv-join/tests/gold/error_tests_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ Error [tsv-join]: Not enough fields in line. File: input1.tsv, Line: 1
Error [tsv-join]: Not enough fields in line. File: input1.tsv, Line: 1

====[tsv-join --header -f input1.tsv -k 4 -d 6 input2.tsv]====
Error [tsv-join]: Not enough fields in line. File: input2.tsv, Line: 2
f1 f2 f3 f4 f5
Error [tsv-join]: Not enough fields in line. File: input2.tsv, Line: 2

====[tsv-join -f input1_noheader.tsv -k 6 input2_noheader.tsv]====
Error [tsv-join]: Not enough fields in line. File: input1_noheader.tsv, Line: 1
Expand Down
4 changes: 2 additions & 2 deletions tsv-sample/tests/gold/error_tests_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ Error test set 1
[tsv-sample] Error processing command line arguments: Invalid UTF-8 sequence (at index 1)

====[tsv-sample -H -w 11 input3x25.tsv]====
line title weight
Error [tsv-sample]: Could not process line: Not enough fields on line. Number required: 11; Number found: 3
File: input3x25.tsv Line: 2
line title weight

====[tsv-sample -H -w 0 input3x25.tsv]====
[tsv-sample] Error processing command line arguments: [--w|weight-field] Field numbers must be greater than zero: '0'.
Expand Down Expand Up @@ -70,8 +70,8 @@ line title weight
Error [tsv-sample]: Not enough fields in line. File: input4x50.tsv, Line: 1

====[tsv-sample -H -p 0.5 -k 5 input4x50.tsv input4x15.tsv]====
Error [tsv-sample]: Not enough fields in line. File: input4x50.tsv, Line: 2
c-1 c-2 c-3 c-4
Error [tsv-sample]: Not enough fields in line. File: input4x50.tsv, Line: 2

====[tsv-sample -H -p 0.5 -k no_such_field input4x50.tsv input4x15.tsv]====
[tsv-sample] Error processing command line arguments: [--k|key-fields] Field not found in file header: 'no_such_field'.
Expand Down
2 changes: 1 addition & 1 deletion tsv-sample/tests/gold/error_tests_2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ Error test set 2
----------------

====[tsv-sample -H -w 2 input3x25.tsv]====
line title weight
Error [tsv-sample]: Could not process line: no digits seen for input "Белые ночи".
File: input3x25.tsv Line: 2
line title weight

====[tsv-sample -w 3 input3x25.tsv]====
Error [tsv-sample]: Could not process line: no digits seen for input "weight".
Expand Down
Loading

0 comments on commit 9d4b110

Please sign in to comment.