tsv-split named field support. Fix bug in mixed numeric-named field r…

…ange detection. (#291)
eBay · Jun 14, 2020 · 36aabb3 · 36aabb3
1 parent 3751dc4
commit 36aabb3
Show file tree

Hide file tree

Showing 5 changed files with 360 additions and 41 deletions.
diff --git a/common/src/tsv_utils/common/fieldlist.d b/common/src/tsv_utils/common/fieldlist.d
@@ -1340,6 +1340,10 @@ private bool isNumericFieldGroupWithHyphenFirstOrLast(const char[] fieldGroup) @
     assert(isNumericFieldGroupWithHyphenFirstOrLast(`1-`));
     assert(isNumericFieldGroupWithHyphenFirstOrLast(`12-`));
     assert(!isNumericFieldGroupWithHyphenFirstOrLast(`-1333-`));
+    assert(!isNumericFieldGroupWithHyphenFirstOrLast(`\-1`));
+    assert(!isNumericFieldGroupWithHyphenFirstOrLast(`\-12`));
+    assert(!isNumericFieldGroupWithHyphenFirstOrLast(`1\-`));
+    assert(!isNumericFieldGroupWithHyphenFirstOrLast(`12\-`));
 }
 
 /**
@@ -1354,7 +1358,22 @@ private bool isNumericFieldGroupWithHyphenFirstOrLast(const char[] fieldGroup) @
  */
 private bool isMixedNumericNamedFieldGroup(const char[] fieldGroup) @safe
 {
-    return cast(bool) fieldGroup.matchFirst(ctRegex!`^((.*[^0-9].*\-[0-9]+)|([0-9]+\-.*[^0-9].*))$`);
+    /* Patterns cases:
+     * - Field group starts with a series of digits followed by a hyphen, followed
+     *   sequence containing a non-digit character.
+     *      ^([0-9]+\-.*[^0-9].*)$
+     * - Field ends with an unescaped hyphen and a series of digits. Two start cases:
+     *   - Non-digit, non-backslash immediately preceding the hyphen
+     *     ^(.*[^0-9\\]\-[0-9]+)$
+     *   - Digit immediately preceding the hyphen, non-hyphen earlier
+     *     ^(.*[^0-9].*[0-9]\-[0-9]+)$
+     *   These two combined:
+     *     ^( ( (.*[^0-9\\]) | (.*[^0-9].*[0-9]) ) \-[0-9]+ )$
+     *
+     * All cases combined:
+     *   ^( ([0-9]+\-.*[^0-9].*) | ( (.*[^0-9\\]) | (.*[^0-9].*[0-9]) ) \-[0-9]+)$
+     */
+    return cast(bool) fieldGroup.matchFirst(ctRegex!`^(([0-9]+\-.*[^0-9].*)|((.*[^0-9\\])|(.*[^0-9].*[0-9]))\-[0-9]+)$`);
 }
 
 @safe unittest
@@ -1363,21 +1382,67 @@ private bool isMixedNumericNamedFieldGroup(const char[] fieldGroup) @safe
     assert(isMixedNumericNamedFieldGroup(`y-2`));
     assert(isMixedNumericNamedFieldGroup(`23-zy`));
     assert(isMixedNumericNamedFieldGroup(`pB-37`));
+
+    assert(isMixedNumericNamedFieldGroup(`5x-0`));
+    assert(isMixedNumericNamedFieldGroup(`x5-9`));
+    assert(isMixedNumericNamedFieldGroup(`0-2m`));
+    assert(isMixedNumericNamedFieldGroup(`9-m2`));
+    assert(isMixedNumericNamedFieldGroup(`5x-37`));
+    assert(isMixedNumericNamedFieldGroup(`x5-37`));
+    assert(isMixedNumericNamedFieldGroup(`37-2m`));
+    assert(isMixedNumericNamedFieldGroup(`37-m2`));
+
     assert(isMixedNumericNamedFieldGroup(`18-23t`));
     assert(isMixedNumericNamedFieldGroup(`x12-632`));
     assert(isMixedNumericNamedFieldGroup(`15-15.5`));
+
+    assert(isMixedNumericNamedFieldGroup(`1-g\-h`));
+    assert(isMixedNumericNamedFieldGroup(`z\-y-2`));
+    assert(isMixedNumericNamedFieldGroup(`23-zy\-st`));
+    assert(isMixedNumericNamedFieldGroup(`ts\-pB-37`));
+
     assert(!isMixedNumericNamedFieldGroup(`a-c`));
     assert(!isMixedNumericNamedFieldGroup(`1-3`));
     assert(!isMixedNumericNamedFieldGroup(`\1-g`));
     assert(!isMixedNumericNamedFieldGroup(`-g`));
     assert(!isMixedNumericNamedFieldGroup(`h-`));
     assert(!isMixedNumericNamedFieldGroup(`-`));
     assert(!isMixedNumericNamedFieldGroup(``));
+    assert(!isMixedNumericNamedFieldGroup(`\2-\3`));
     assert(!isMixedNumericNamedFieldGroup(`\10-\20`));
     assert(!isMixedNumericNamedFieldGroup(`x`));
     assert(!isMixedNumericNamedFieldGroup(`xyz`));
     assert(!isMixedNumericNamedFieldGroup(`0`));
     assert(!isMixedNumericNamedFieldGroup(`9`));
+
+    assert(!isMixedNumericNamedFieldGroup(`1\-g`));
+    assert(!isMixedNumericNamedFieldGroup(`y\-2`));
+    assert(!isMixedNumericNamedFieldGroup(`23\-zy`));
+    assert(!isMixedNumericNamedFieldGroup(`pB\-37`));
+    assert(!isMixedNumericNamedFieldGroup(`18\-23t`));
+    assert(!isMixedNumericNamedFieldGroup(`x12\-632`));
+
+    assert(!isMixedNumericNamedFieldGroup(`5x\-0`));
+    assert(!isMixedNumericNamedFieldGroup(`x5\-9`));
+    assert(!isMixedNumericNamedFieldGroup(`0\-2m`));
+    assert(!isMixedNumericNamedFieldGroup(`9\-m2`));
+    assert(!isMixedNumericNamedFieldGroup(`5x\-37`));
+    assert(!isMixedNumericNamedFieldGroup(`x5\-37`));
+    assert(!isMixedNumericNamedFieldGroup(`37\-2m`));
+    assert(!isMixedNumericNamedFieldGroup(`37\-m2`));
+
+    assert(!isMixedNumericNamedFieldGroup(`1\-g\-h`));
+    assert(!isMixedNumericNamedFieldGroup(`z\-y\-2`));
+    assert(!isMixedNumericNamedFieldGroup(`23\-zy\-st`));
+    assert(!isMixedNumericNamedFieldGroup(`ts\-pB\-37`));
+
+    assert(!isMixedNumericNamedFieldGroup(`\-g`));
+    assert(!isMixedNumericNamedFieldGroup(`h\-`));
+    assert(!isMixedNumericNamedFieldGroup(`i\-j`));
+    assert(!isMixedNumericNamedFieldGroup(`\-2`));
+    assert(!isMixedNumericNamedFieldGroup(`2\-`));
+    assert(!isMixedNumericNamedFieldGroup(`2\-3`));
+    assert(!isMixedNumericNamedFieldGroup(`\2\-\3`));
 }
 
 /**

diff --git a/tsv-split/src/tsv_utils/tsv-split.d b/tsv-split/src/tsv_utils/tsv-split.d
@@ -223,12 +223,11 @@ struct TsvSplitOptions
 
     string programName;                        /// Program name
     InputSourceRange inputSources;             /// Input files
-    bool helpVerbose = false;                  /// --help-verbose
     bool headerInOut = false;                  /// --H|header
     bool headerIn = false;                     /// --I|header-in-only
     size_t linesPerFile = 0;                   /// --l|lines-per-file
     uint numFiles = 0;                         /// --n|num-files
-    size_t[] keyFields;                        /// --k|key-fields
+    size_t[] keyFields;                        /// Derived: --k|key-fields
     string dir;                                /// --dir
     string prefix = "part_";                   /// --prefix
     string suffix = invalidFileSuffix;         /// --suffix
@@ -238,7 +237,6 @@ struct TsvSplitOptions
     uint seedValueOptionArg = 0;               /// --v|seed-value
     char delim = '\t';                         /// --d|delimiter
     uint maxOpenFilesArg = 0;                  /// --max-open-files
-    bool versionWanted = false;                /// --V|version
     bool hasHeader = false;                    /// Derived. True if either '--H|header' or '--I|header-in-only' is set.
     bool keyIsFullLine = false;                /// Derived. True if '--f|fields 0' is specfied.
     bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
@@ -265,12 +263,19 @@ struct TsvSplitOptions
     auto processArgs(ref string[] cmdArgs)
     {
         import std.algorithm : all, canFind, each, min;
+        import std.conv : to;
         import std.file : exists, isDir;
         import std.getopt;
         import std.math : isNaN;
         import std.path : baseName, expandTilde, extension, stripExtension;
         import std.typecons : Yes, No;
-        import tsv_utils.common.fieldlist : makeFieldListOptionHandler;
+        import tsv_utils.common.fieldlist;
+
+        bool helpVerbose = false;                  // --help-verbose
+        bool versionWanted = false;                // --V|version
+        string keyFieldsArg;                       // --k|key-fields
+
+        string keyFieldsOptionString = "k|key-fields";
 
         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
 
@@ -288,8 +293,13 @@ struct TsvSplitOptions
 
                 "l|lines-per-file", "NUM  Number of lines to write to each output file (excluding the header line).", &linesPerFile,
                 "n|num-files",      "NUM  Number of output files to generate.", &numFiles,
-                "k|key-fields",     "<field-list>  Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
-                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
+
+                //"k|key-fields",     "<field-list>  Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
+                //keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
+
+                keyFieldsOptionString,
+                "<field-list>  Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
+                &keyFieldsArg,
 
                 "dir",              "STR  Directory to write to. Default: Current working directory.", &dir,
                 "prefix",           "STR  Filename prefix. Default: 'part_'", &prefix,
@@ -328,41 +338,34 @@ struct TsvSplitOptions
                 return tuple(false, 0);
             }
 
-            /*
-             * Validation and derivations.
+            /* Remaining command line args are files.
+             */
+            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
+            cmdArgs.length = 1;
+
+            /* Validation and derivations - Do as much validation prior to header line
+             * processing as possible (avoids waiting on stdin).
+             *
+             * Note: keyFields depends on header line processing, but keyFieldsArg
+             * can be used to detect whether the command line argument was specified.
              */
 
+            enforce(!(headerInOut && headerIn),
+                    "Use only one of '--H|header' and '--I|header-in-only'.");
+
+            hasHeader = headerInOut || headerIn;
+
             enforce(linesPerFile != 0 || numFiles != 0,
                     "Either '--l|lines-per-file' or '--n|num-files' is required.");
 
             enforce(linesPerFile == 0 || numFiles == 0,
                     "'--l|lines-per-file' and '--n|num-files' cannot be used together.");
 
-            enforce(linesPerFile == 0 || keyFields.length == 0,
+            enforce(linesPerFile == 0 || keyFieldsArg.length == 0,
                     "'--l|lines-per-file' and '--k|key-fields' cannot be used together.");
 
             enforce(numFiles != 1, "'--n|num-files must be two or more.");
 
-            if (keyFields.length > 0)
-            {
-                if (keyFields.length == 1 && keyFields[0] == 0)
-                {
-                    keyIsFullLine = true;
-                }
-                else
-                {
-                    enforce(keyFields.all!(x => x != 0),
-                            "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
-
-                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
-                }
-            }
-
-            enforce(!(headerInOut && headerIn),
-                    "Use only one of '--H|header' and '--I|header-in-only'.");
-
-            hasHeader = headerInOut || headerIn;
-
             if (!dir.empty)
             {
                 dir = dir.expandTilde;
@@ -395,7 +398,6 @@ struct TsvSplitOptions
             immutable uint numReservedOpenFiles = 4;
             immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit();
 
-
             enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles,
                     format("'--max-open-files' must be at least %d.",
                            numReservedOpenFiles + 1));
@@ -423,13 +425,6 @@ struct TsvSplitOptions
 
             maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles;
 
-            /* Remaining command line args are files.
-             */
-            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
-            cmdArgs.length = 1;
-            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
-            inputSources = inputSourceRange(filepaths, readHeader);
-
             /* Suffix - If not provided, use the extension of the first input file.
              * No suffix if reading from standard input.
              */
@@ -472,6 +467,41 @@ struct TsvSplitOptions
                 }
             }
             assert(digitWidth != 0);
+
+            /*
+             * Create the inputSourceRange and perform header line processing.
+             */
+            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
+            inputSources = inputSourceRange(filepaths, readHeader);
+
+            string[] headerFields;
+
+            if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]);
+
+            if (!keyFieldsArg.empty)
+            {
+                keyFields =
+                    keyFieldsArg
+                    .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
+                    (hasHeader, headerFields, keyFieldsOptionString)
+                    .array;
+            }
+
+            if (keyFields.length > 0)
+            {
+                if (keyFields.length == 1 && keyFields[0] == 0)
+                {
+                    keyIsFullLine = true;
+                }
+                else
+                {
+                    enforce(keyFields.all!(x => x != 0),
+                            "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
+
+                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
+                }
+            }
+
         }
         catch (Exception exc)
         {
@@ -495,7 +525,7 @@ unittest
     import std.path : buildPath;
 
     /* A dummy file is used so we don't have to worry about the cases where command
-     * line processing might open a file. Don't want to use stanard input for this,
+     * line processing might open a file. Don't want to use standard input for this,
      * at least in cases where it might try to read to get the header line.
      */
     auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
@@ -577,7 +607,7 @@ unittest
     }
 
     /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first.
-     * This make sure there is no attempt to read standard input and that there won't be an
+     * This makes sure there is no attempt to read standard input and that there won't be an
      * open failure trying to find a file.
      */
     testSuffix(["unittest", "-n", "2"], "");

diff --git a/tsv-split/tests/gold/error_tests_1.txt b/tsv-split/tests/gold/error_tests_1.txt
@@ -55,7 +55,7 @@ input1x3.txt: line 3
 [tsv-split] Error processing command line arguments: '--n|num-files must be two or more.
 
 ====[tsv-split -n 2 -k 1.5 input4x58.tsv]====
-[tsv-split] Error processing command line arguments: [--k|key-fields] Unexpected '.' when converting from type string to type ulong
+[tsv-split] Error processing command line arguments: [--k|key-fields] Non-numeric field group: '1.5'. Use '--H|header' when using named field groups.
 
 ====[tsv-split -n 2 -k 0,1 input4x58.tsv]====
 [tsv-split] Error processing command line arguments: Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.
@@ -96,6 +96,15 @@ Error [tsv-split]: Not enough fields in line. File: input4x58.tsv, Line: 1
 ====[tsv-split -n 2 --suffix ab/cd input4x58.tsv]====
 [tsv-split] Error processing command line arguments: '--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory.
 
+====[tsv-split -n 2 --key-fields c\-1 input4x58.tsv]====
+[tsv-split] Error processing command line arguments: [--k|key-fields] Non-numeric field group: 'c\-1'. Use '--H|header' when using named field groups.
+
+====[tsv-split -n 2 -H --key-fields c-1 input4x58.tsv]====
+[tsv-split] Error processing command line arguments: [--k|key-fields] Ranges with both numeric and named components are not supported: 'c-1'.
+
+====[tsv-split -n 2 -I --key-fields c-1 input4x58.tsv]====
+[tsv-split] Error processing command line arguments: [--k|key-fields] Ranges with both numeric and named components are not supported: 'c-1'.
+
 ====[ulimit -Sn 5 && tsv-split -s -n 101 -k 3 --max-open-files 6 ../input4x58.tsv]====
 [tsv-split] Error processing command line arguments: '--max-open-files' value (6) greater current system limit (5).
 Run 'ulimit -n' to see the soft limit.