Skip to content

Commit

Permalink
tsv-split named field support. Fix bug in mixed numeric-named field r…
Browse files Browse the repository at this point in the history
…ange detection. (#291)
  • Loading branch information
jondegenhardt authored Jun 14, 2020
1 parent 3751dc4 commit 36aabb3
Show file tree
Hide file tree
Showing 5 changed files with 360 additions and 41 deletions.
67 changes: 66 additions & 1 deletion common/src/tsv_utils/common/fieldlist.d
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,10 @@ private bool isNumericFieldGroupWithHyphenFirstOrLast(const char[] fieldGroup) @
assert(isNumericFieldGroupWithHyphenFirstOrLast(`1-`));
assert(isNumericFieldGroupWithHyphenFirstOrLast(`12-`));
assert(!isNumericFieldGroupWithHyphenFirstOrLast(`-1333-`));
assert(!isNumericFieldGroupWithHyphenFirstOrLast(`\-1`));
assert(!isNumericFieldGroupWithHyphenFirstOrLast(`\-12`));
assert(!isNumericFieldGroupWithHyphenFirstOrLast(`1\-`));
assert(!isNumericFieldGroupWithHyphenFirstOrLast(`12\-`));
}

/**
Expand All @@ -1354,7 +1358,22 @@ private bool isNumericFieldGroupWithHyphenFirstOrLast(const char[] fieldGroup) @
*/
private bool isMixedNumericNamedFieldGroup(const char[] fieldGroup) @safe
{
return cast(bool) fieldGroup.matchFirst(ctRegex!`^((.*[^0-9].*\-[0-9]+)|([0-9]+\-.*[^0-9].*))$`);
/* Patterns cases:
* - Field group starts with a series of digits followed by a hyphen, followed
* sequence containing a non-digit character.
* ^([0-9]+\-.*[^0-9].*)$
* - Field ends with an unescaped hyphen and a series of digits. Two start cases:
* - Non-digit, non-backslash immediately preceding the hyphen
* ^(.*[^0-9\\]\-[0-9]+)$
* - Digit immediately preceding the hyphen, non-hyphen earlier
* ^(.*[^0-9].*[0-9]\-[0-9]+)$
* These two combined:
* ^( ( (.*[^0-9\\]) | (.*[^0-9].*[0-9]) ) \-[0-9]+ )$
*
* All cases combined:
* ^( ([0-9]+\-.*[^0-9].*) | ( (.*[^0-9\\]) | (.*[^0-9].*[0-9]) ) \-[0-9]+)$
*/
return cast(bool) fieldGroup.matchFirst(ctRegex!`^(([0-9]+\-.*[^0-9].*)|((.*[^0-9\\])|(.*[^0-9].*[0-9]))\-[0-9]+)$`);
}

@safe unittest
Expand All @@ -1363,21 +1382,67 @@ private bool isMixedNumericNamedFieldGroup(const char[] fieldGroup) @safe
assert(isMixedNumericNamedFieldGroup(`y-2`));
assert(isMixedNumericNamedFieldGroup(`23-zy`));
assert(isMixedNumericNamedFieldGroup(`pB-37`));

assert(isMixedNumericNamedFieldGroup(`5x-0`));
assert(isMixedNumericNamedFieldGroup(`x5-9`));
assert(isMixedNumericNamedFieldGroup(`0-2m`));
assert(isMixedNumericNamedFieldGroup(`9-m2`));
assert(isMixedNumericNamedFieldGroup(`5x-37`));
assert(isMixedNumericNamedFieldGroup(`x5-37`));
assert(isMixedNumericNamedFieldGroup(`37-2m`));
assert(isMixedNumericNamedFieldGroup(`37-m2`));

assert(isMixedNumericNamedFieldGroup(`18-23t`));
assert(isMixedNumericNamedFieldGroup(`x12-632`));
assert(isMixedNumericNamedFieldGroup(`15-15.5`));

assert(isMixedNumericNamedFieldGroup(`1-g\-h`));
assert(isMixedNumericNamedFieldGroup(`z\-y-2`));
assert(isMixedNumericNamedFieldGroup(`23-zy\-st`));
assert(isMixedNumericNamedFieldGroup(`ts\-pB-37`));

assert(!isMixedNumericNamedFieldGroup(`a-c`));
assert(!isMixedNumericNamedFieldGroup(`1-3`));
assert(!isMixedNumericNamedFieldGroup(`\1-g`));
assert(!isMixedNumericNamedFieldGroup(`-g`));
assert(!isMixedNumericNamedFieldGroup(`h-`));
assert(!isMixedNumericNamedFieldGroup(`-`));
assert(!isMixedNumericNamedFieldGroup(``));
assert(!isMixedNumericNamedFieldGroup(`\2-\3`));
assert(!isMixedNumericNamedFieldGroup(`\10-\20`));
assert(!isMixedNumericNamedFieldGroup(`x`));
assert(!isMixedNumericNamedFieldGroup(`xyz`));
assert(!isMixedNumericNamedFieldGroup(`0`));
assert(!isMixedNumericNamedFieldGroup(`9`));

assert(!isMixedNumericNamedFieldGroup(`1\-g`));
assert(!isMixedNumericNamedFieldGroup(`y\-2`));
assert(!isMixedNumericNamedFieldGroup(`23\-zy`));
assert(!isMixedNumericNamedFieldGroup(`pB\-37`));
assert(!isMixedNumericNamedFieldGroup(`18\-23t`));
assert(!isMixedNumericNamedFieldGroup(`x12\-632`));

assert(!isMixedNumericNamedFieldGroup(`5x\-0`));
assert(!isMixedNumericNamedFieldGroup(`x5\-9`));
assert(!isMixedNumericNamedFieldGroup(`0\-2m`));
assert(!isMixedNumericNamedFieldGroup(`9\-m2`));
assert(!isMixedNumericNamedFieldGroup(`5x\-37`));
assert(!isMixedNumericNamedFieldGroup(`x5\-37`));
assert(!isMixedNumericNamedFieldGroup(`37\-2m`));
assert(!isMixedNumericNamedFieldGroup(`37\-m2`));

assert(!isMixedNumericNamedFieldGroup(`1\-g\-h`));
assert(!isMixedNumericNamedFieldGroup(`z\-y\-2`));
assert(!isMixedNumericNamedFieldGroup(`23\-zy\-st`));
assert(!isMixedNumericNamedFieldGroup(`ts\-pB\-37`));

assert(!isMixedNumericNamedFieldGroup(`\-g`));
assert(!isMixedNumericNamedFieldGroup(`h\-`));
assert(!isMixedNumericNamedFieldGroup(`i\-j`));
assert(!isMixedNumericNamedFieldGroup(`\-2`));
assert(!isMixedNumericNamedFieldGroup(`2\-`));
assert(!isMixedNumericNamedFieldGroup(`2\-3`));
assert(!isMixedNumericNamedFieldGroup(`\2\-\3`));
}

/**
Expand Down
108 changes: 69 additions & 39 deletions tsv-split/src/tsv_utils/tsv-split.d
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,11 @@ struct TsvSplitOptions

string programName; /// Program name
InputSourceRange inputSources; /// Input files
bool helpVerbose = false; /// --help-verbose
bool headerInOut = false; /// --H|header
bool headerIn = false; /// --I|header-in-only
size_t linesPerFile = 0; /// --l|lines-per-file
uint numFiles = 0; /// --n|num-files
size_t[] keyFields; /// --k|key-fields
size_t[] keyFields; /// Derived: --k|key-fields
string dir; /// --dir
string prefix = "part_"; /// --prefix
string suffix = invalidFileSuffix; /// --suffix
Expand All @@ -238,7 +237,6 @@ struct TsvSplitOptions
uint seedValueOptionArg = 0; /// --v|seed-value
char delim = '\t'; /// --d|delimiter
uint maxOpenFilesArg = 0; /// --max-open-files
bool versionWanted = false; /// --V|version
bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set.
bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied.
bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value
Expand All @@ -265,12 +263,19 @@ struct TsvSplitOptions
auto processArgs(ref string[] cmdArgs)
{
import std.algorithm : all, canFind, each, min;
import std.conv : to;
import std.file : exists, isDir;
import std.getopt;
import std.math : isNaN;
import std.path : baseName, expandTilde, extension, stripExtension;
import std.typecons : Yes, No;
import tsv_utils.common.fieldlist : makeFieldListOptionHandler;
import tsv_utils.common.fieldlist;

bool helpVerbose = false; // --help-verbose
bool versionWanted = false; // --V|version
string keyFieldsArg; // --k|key-fields

string keyFieldsOptionString = "k|key-fields";

programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

Expand All @@ -288,8 +293,13 @@ struct TsvSplitOptions

"l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile,
"n|num-files", "NUM Number of output files to generate.", &numFiles,
"k|key-fields", "<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

//"k|key-fields", "<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
//keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

keyFieldsOptionString,
"<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
&keyFieldsArg,

"dir", "STR Directory to write to. Default: Current working directory.", &dir,
"prefix", "STR Filename prefix. Default: 'part_'", &prefix,
Expand Down Expand Up @@ -328,41 +338,34 @@ struct TsvSplitOptions
return tuple(false, 0);
}

/*
* Validation and derivations.
/* Remaining command line args are files.
*/
string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
cmdArgs.length = 1;

/* Validation and derivations - Do as much validation prior to header line
* processing as possible (avoids waiting on stdin).
*
* Note: keyFields depends on header line processing, but keyFieldsArg
* can be used to detect whether the command line argument was specified.
*/

enforce(!(headerInOut && headerIn),
"Use only one of '--H|header' and '--I|header-in-only'.");

hasHeader = headerInOut || headerIn;

enforce(linesPerFile != 0 || numFiles != 0,
"Either '--l|lines-per-file' or '--n|num-files' is required.");

enforce(linesPerFile == 0 || numFiles == 0,
"'--l|lines-per-file' and '--n|num-files' cannot be used together.");

enforce(linesPerFile == 0 || keyFields.length == 0,
enforce(linesPerFile == 0 || keyFieldsArg.length == 0,
"'--l|lines-per-file' and '--k|key-fields' cannot be used together.");

enforce(numFiles != 1, "'--n|num-files must be two or more.");

if (keyFields.length > 0)
{
if (keyFields.length == 1 && keyFields[0] == 0)
{
keyIsFullLine = true;
}
else
{
enforce(keyFields.all!(x => x != 0),
"Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

keyFields.each!((ref x) => --x); // Convert to zero-based indexing.
}
}

enforce(!(headerInOut && headerIn),
"Use only one of '--H|header' and '--I|header-in-only'.");

hasHeader = headerInOut || headerIn;

if (!dir.empty)
{
dir = dir.expandTilde;
Expand Down Expand Up @@ -395,7 +398,6 @@ struct TsvSplitOptions
immutable uint numReservedOpenFiles = 4;
immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit();


enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles,
format("'--max-open-files' must be at least %d.",
numReservedOpenFiles + 1));
Expand Down Expand Up @@ -423,13 +425,6 @@ struct TsvSplitOptions

maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles;

/* Remaining command line args are files.
*/
string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
cmdArgs.length = 1;
ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
inputSources = inputSourceRange(filepaths, readHeader);

/* Suffix - If not provided, use the extension of the first input file.
* No suffix if reading from standard input.
*/
Expand Down Expand Up @@ -472,6 +467,41 @@ struct TsvSplitOptions
}
}
assert(digitWidth != 0);

/*
* Create the inputSourceRange and perform header line processing.
*/
ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
inputSources = inputSourceRange(filepaths, readHeader);

string[] headerFields;

if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]);

if (!keyFieldsArg.empty)
{
keyFields =
keyFieldsArg
.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
(hasHeader, headerFields, keyFieldsOptionString)
.array;
}

if (keyFields.length > 0)
{
if (keyFields.length == 1 && keyFields[0] == 0)
{
keyIsFullLine = true;
}
else
{
enforce(keyFields.all!(x => x != 0),
"Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

keyFields.each!((ref x) => --x); // Convert to zero-based indexing.
}
}

}
catch (Exception exc)
{
Expand All @@ -495,7 +525,7 @@ unittest
import std.path : buildPath;

/* A dummy file is used so we don't have to worry about the cases where command
* line processing might open a file. Don't want to use stanard input for this,
* line processing might open a file. Don't want to use standard input for this,
* at least in cases where it might try to read to get the header line.
*/
auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
Expand Down Expand Up @@ -577,7 +607,7 @@ unittest
}

/* In these tests, don't use headers and when files are listed, use 'somefile_txt' first.
* This make sure there is no attempt to read standard input and that there won't be an
* This makes sure there is no attempt to read standard input and that there won't be an
* open failure trying to find a file.
*/
testSuffix(["unittest", "-n", "2"], "");
Expand Down
11 changes: 10 additions & 1 deletion tsv-split/tests/gold/error_tests_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ input1x3.txt: line 3
[tsv-split] Error processing command line arguments: '--n|num-files must be two or more.

====[tsv-split -n 2 -k 1.5 input4x58.tsv]====
[tsv-split] Error processing command line arguments: [--k|key-fields] Unexpected '.' when converting from type string to type ulong
[tsv-split] Error processing command line arguments: [--k|key-fields] Non-numeric field group: '1.5'. Use '--H|header' when using named field groups.

====[tsv-split -n 2 -k 0,1 input4x58.tsv]====
[tsv-split] Error processing command line arguments: Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.
Expand Down Expand Up @@ -96,6 +96,15 @@ Error [tsv-split]: Not enough fields in line. File: input4x58.tsv, Line: 1
====[tsv-split -n 2 --suffix ab/cd input4x58.tsv]====
[tsv-split] Error processing command line arguments: '--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory.

====[tsv-split -n 2 --key-fields c\-1 input4x58.tsv]====
[tsv-split] Error processing command line arguments: [--k|key-fields] Non-numeric field group: 'c\-1'. Use '--H|header' when using named field groups.

====[tsv-split -n 2 -H --key-fields c-1 input4x58.tsv]====
[tsv-split] Error processing command line arguments: [--k|key-fields] Ranges with both numeric and named components are not supported: 'c-1'.

====[tsv-split -n 2 -I --key-fields c-1 input4x58.tsv]====
[tsv-split] Error processing command line arguments: [--k|key-fields] Ranges with both numeric and named components are not supported: 'c-1'.

====[ulimit -Sn 5 && tsv-split -s -n 101 -k 3 --max-open-files 6 ../input4x58.tsv]====
[tsv-split] Error processing command line arguments: '--max-open-files' value (6) greater current system limit (5).
Run 'ulimit -n' to see the soft limit.
Expand Down
Loading

0 comments on commit 36aabb3

Please sign in to comment.