diff --git a/cmd/config.go b/cmd/config.go index a056c24..81996ac 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -24,6 +24,7 @@ type Context struct { deltaFile afero.File recordCount int separator rune + lazyQuotes bool } // NewContext can take all CLI flags and create a cmd.Context @@ -39,13 +40,14 @@ func NewContext( baseFilename string, deltaFilename string, separator rune, + lazyQuotes bool, ) (*Context, error) { - baseRecordCount, err := getColumnsCount(fs, baseFilename, separator) + baseRecordCount, err := getColumnsCount(fs, baseFilename, separator, lazyQuotes) if err != nil { return nil, fmt.Errorf("error in base-file: %v", err) } - deltaRecordCount, err := getColumnsCount(fs, deltaFilename, separator) + deltaRecordCount, err := getColumnsCount(fs, deltaFilename, separator, lazyQuotes) if err != nil { return nil, fmt.Errorf("error in delta-file: %v", err) } @@ -81,6 +83,7 @@ func NewContext( deltaFile: deltaFile, recordCount: baseRecordCount, separator: separator, + lazyQuotes: lazyQuotes, } if err := ctx.validate(); err != nil { @@ -178,7 +181,7 @@ func assertAll(elements []int, assertFn func(element int) bool) bool { return true } -func getColumnsCount(fs afero.Fs, filename string, separator rune) (int, error) { +func getColumnsCount(fs afero.Fs, filename string, separator rune, lazyQuotes bool) (int, error) { base, err := fs.Open(filename) if err != nil { return 0, err @@ -186,6 +189,7 @@ func getColumnsCount(fs afero.Fs, filename string, separator rune) (int, error) defer base.Close() csvReader := csv.NewReader(base) csvReader.Comma = separator + csvReader.LazyQuotes = lazyQuotes record, err := csvReader.Read() if err != nil { if err == io.EOF { @@ -201,11 +205,12 @@ func getColumnsCount(fs afero.Fs, filename string, separator rune) (int, error) // that is needed to start the diff process func (c *Context) BaseDigestConfig() (digest.Config, error) { return digest.Config{ - Reader: c.baseFile, - Value: c.valueColumnPositions, - Key: c.primaryKeyPositions, - Include: c.includeColumnPositions, - Separator: c.separator, + Reader: c.baseFile, + Value: c.valueColumnPositions, + Key: c.primaryKeyPositions, + Include: c.includeColumnPositions, + Separator: c.separator, + LazyQuotes: c.lazyQuotes, }, nil } @@ -213,11 +218,12 @@ func (c *Context) BaseDigestConfig() (digest.Config, error) { // that is needed to start the diff process func (c *Context) DeltaDigestConfig() (digest.Config, error) { return digest.Config{ - Reader: c.deltaFile, - Value: c.valueColumnPositions, - Key: c.primaryKeyPositions, - Include: c.includeColumnPositions, - Separator: c.separator, + Reader: c.deltaFile, + Value: c.valueColumnPositions, + Key: c.primaryKeyPositions, + Include: c.includeColumnPositions, + Separator: c.separator, + LazyQuotes: c.lazyQuotes, }, nil } diff --git a/cmd/config_test.go b/cmd/config_test.go index 24c4ad1..1cadfee 100644 --- a/cmd/config_test.go +++ b/cmd/config_test.go @@ -47,6 +47,7 @@ func TestPrimaryKeyPositions(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) assert.Equal(t, tt.out, ctx.GetPrimaryKeys()) @@ -91,6 +92,7 @@ func TestValueColumnPositions(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) assert.Equal(t, tt.out, ctx.GetValueColumns()) @@ -117,6 +119,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "validation failed: specified format is not valid") @@ -133,6 +136,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) @@ -149,6 +153,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) @@ -168,6 +173,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "error in base-file: open "+string(os.PathSeparator)+"base.csv: file does not exist") }) @@ -189,6 +195,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "error in base-file: unable to process headers from csv file. EOF reached. invalid CSV file") }) @@ -210,6 +217,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "error in delta-file: unable to process headers from csv file. EOF reached. invalid CSV file") }) @@ -228,6 +236,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) }) @@ -256,6 +265,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "validation failed: --primary-key positions are out of bounds") @@ -272,6 +282,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "validation failed: --include positions are out of bounds") @@ -288,6 +299,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "validation failed: --columns positions are out of bounds") @@ -310,6 +322,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "base-file and delta-file columns count do not match") }) @@ -329,6 +342,7 @@ func TestNewContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.EqualError(t, err, "only one of --columns or --ignore-columns") @@ -353,6 +367,7 @@ func TestConfig_DigestConfig(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) @@ -388,6 +403,7 @@ func TestConfig_DigestConfig(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) diff --git a/cmd/root.go b/cmd/root.go index 88e0518..9c12549 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -76,6 +76,7 @@ Most suitable for csv files created from database tables`, baseFilename, deltaFilename, runeSeparator, + lazyQuotes, ) if err != nil { @@ -125,6 +126,7 @@ var ( includeColumnPositions []int format string separator string + lazyQuotes bool ) func init() { @@ -138,6 +140,7 @@ func init() { rootCmd.Flags().StringVarP(&separator, "separator", "s", ",", "use specific separator (\\t, or any one character string)") rootCmd.Flags().BoolVarP(&timed, "time", "", false, "Measure time") + rootCmd.Flags().BoolVarP(&lazyQuotes, "lazyquotes", "", false, "allow unescaped quotes") } func timeTrack(start time.Time, name string) { diff --git a/cmd/root_test.go b/cmd/root_test.go index 681db57..85a416a 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -43,6 +43,7 @@ func TestRunContext(t *testing.T) { "/base.csv", "/delta.csv", ',', + false, ) assert.NoError(t, err) diff --git a/examples/lazy_quotes.csv b/examples/lazy_quotes.csv new file mode 100644 index 0000000..4c47d59 --- /dev/null +++ b/examples/lazy_quotes.csv @@ -0,0 +1,6 @@ +15 12 wordpress".com com 207790 792348 wordpress".com com 15 12 207589 791634 +43 1 europa.eu eu 116613 353412 europa.eu eu 41 1 119129 359818 +69 48 "aol.com com 97543 225532 "aol.com com 70 49 97328 224491 +1615 905 proboards.com com 19833 33110 proboards.com com 1613 902 19835 33135 +1616 906 ccleaner.com com 19831 32507 ccleaner.com com 1614 903 19834 32463 +1617 907 doodle.com com 19827 32902 doodle.com com 1621 909 19787 32822 diff --git a/examples/lazy_quotes_delta.csv b/examples/lazy_quotes_delta.csv new file mode 100644 index 0000000..f475075 --- /dev/null +++ b/examples/lazy_quotes_delta.csv @@ -0,0 +1,4 @@ +15 12 wordpress".com com 207790 792348 wordpress".com com 15 12 207589 791634 +43 1 europa.eu eu 116613 353412 europa.eu eu 41 1 119129 359818 +69 1048 "aol.com com 97543 225532 "aol.com com 70 49 97328 224491 +24564 907 completely-newsite.com com 19827 32902 completely-newsite.com com 1621 909 19787 32822 diff --git a/go.mod b/go.mod index a74f26a..91756f0 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/spf13/afero v1.1.2 github.com/spf13/cobra v0.0.5 - github.com/stretchr/testify v1.3.0 + github.com/stretchr/testify v1.4.0 golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa // indirect ) diff --git a/go.sum b/go.sum index 0b0dbf7..6ac221e 100644 --- a/go.sum +++ b/go.sum @@ -48,6 +48,8 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= @@ -59,4 +61,5 @@ golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/pkg/digest/config.go b/pkg/digest/config.go index 439e894..c8561f9 100644 --- a/pkg/digest/config.go +++ b/pkg/digest/config.go @@ -9,11 +9,12 @@ import "io" // Value: The Value positions that needs to be compared for diff // Include: Include these positions in output. It is Value positions by default. type Config struct { - Key Positions - Value Positions - Include Positions - Reader io.Reader - Separator rune + Key Positions + Value Positions + Include Positions + Reader io.Reader + Separator rune + LazyQuotes bool } // NewConfig creates an instance of Config struct. @@ -23,16 +24,18 @@ func NewConfig( valueColumns Positions, includeColumns Positions, separator rune, + lazyQuotes bool, ) *Config { if len(includeColumns) == 0 { includeColumns = valueColumns } return &Config{ - Reader: r, - Key: primaryKey, - Value: valueColumns, - Include: includeColumns, - Separator: separator, + Reader: r, + Key: primaryKey, + Value: valueColumns, + Include: includeColumns, + Separator: separator, + LazyQuotes: lazyQuotes, } } diff --git a/pkg/digest/diff_test.go b/pkg/digest/diff_test.go index f376bb4..65ed88f 100644 --- a/pkg/digest/diff_test.go +++ b/pkg/digest/diff_test.go @@ -24,15 +24,17 @@ func TestDiff(t *testing.T) { t.Run("default config", func(t *testing.T) { baseConfig := &digest.Config{ - Reader: strings.NewReader(base), - Key: []int{0}, - Separator: ',', + Reader: strings.NewReader(base), + Key: []int{0}, + Separator: ',', + LazyQuotes: false, } deltaConfig := &digest.Config{ - Reader: strings.NewReader(delta), - Key: []int{0}, - Separator: ',', + Reader: strings.NewReader(delta), + Key: []int{0}, + Separator: ',', + LazyQuotes: false, } expected := digest.Differences{ @@ -59,4 +61,51 @@ func TestDiff(t *testing.T) { assert.NoError(t, err) assert.Equal(t, expected, actual) }) + + deltaLazyQuotes := `1,col-1,col-2,col-3,one-value +2,col-1,col-2,col-3,two-value-modified +4,col-1,col-2,col-3,four"-added +100,col-1-modified,col-2,col-3,hundred-value-modified +5,col-1,col-2,col-3,five"-added +` + + t.Run("lazy quotes in delta config", func(t *testing.T) { + baseConfig := &digest.Config{ + Reader: strings.NewReader(base), + Key: []int{0}, + Separator: ',', + LazyQuotes: false, + } + + deltaConfig := &digest.Config{ + Reader: strings.NewReader(deltaLazyQuotes), + Key: []int{0}, + Separator: ',', + LazyQuotes: true, + } + + expected := digest.Differences{ + Additions: []digest.Addition{ + strings.Split("4,col-1,col-2,col-3,four\"-added", ","), + strings.Split("5,col-1,col-2,col-3,five\"-added", ","), + }, + Modifications: []digest.Modification{ + { + Current: strings.Split("2,col-1,col-2,col-3,two-value-modified", ","), + Original: strings.Split("2,col-1,col-2,col-3,two-value", ","), + }, + { + Current: strings.Split("100,col-1-modified,col-2,col-3,hundred-value-modified", ","), + Original: strings.Split("100,col-1,col-2,col-3,hundred-value", ","), + }, + }, + Deletions: []digest.Deletion{ + strings.Split("3,col-1,col-2,col-3,three-value", ","), + }, + } + + actual, err := digest.Diff(*baseConfig, *deltaConfig) + assert.NoError(t, err) + assert.Equal(t, expected, actual) + }) } diff --git a/pkg/digest/digest.go b/pkg/digest/digest.go index 6772653..e140fb8 100644 --- a/pkg/digest/digest.go +++ b/pkg/digest/digest.go @@ -34,6 +34,7 @@ func Create(config *Config) (map[uint64]uint64, map[uint64][]string, error) { maxProcs := runtime.NumCPU() reader := csv.NewReader(config.Reader) reader.Comma = config.Separator + reader.LazyQuotes = config.LazyQuotes output := make(map[uint64]uint64) sourceMap := make(map[uint64][]string) diff --git a/pkg/digest/digest_test.go b/pkg/digest/digest_test.go index 1463c54..e0d58e6 100644 --- a/pkg/digest/digest_test.go +++ b/pkg/digest/digest_test.go @@ -94,26 +94,28 @@ func TestNewConfig(t *testing.T) { include := digest.Positions{0, 1} t.Run("should create config from given params", func(t *testing.T) { - conf := digest.NewConfig(r, primaryColumns, values, include, ',') + conf := digest.NewConfig(r, primaryColumns, values, include, ',', false) expectedConf := digest.Config{ - Reader: r, - Key: primaryColumns, - Value: values, - Include: include, - Separator: ',', + Reader: r, + Key: primaryColumns, + Value: values, + Include: include, + Separator: ',', + LazyQuotes: false, } assert.Equal(t, expectedConf, *conf) }) t.Run("should use valueColumns as includeColumns for includes not specified", func(t *testing.T) { - conf := digest.NewConfig(r, primaryColumns, values, nil, ',') + conf := digest.NewConfig(r, primaryColumns, values, nil, ',', false) expectedConf := digest.Config{ - Reader: r, - Key: primaryColumns, - Value: values, - Include: values, - Separator: ',', + Reader: r, + Key: primaryColumns, + Value: values, + Include: values, + Separator: ',', + LazyQuotes: false, } assert.Equal(t, expectedConf, *conf) diff --git a/pkg/digest/engine.go b/pkg/digest/engine.go index ed1e94c..fd67f79 100644 --- a/pkg/digest/engine.go +++ b/pkg/digest/engine.go @@ -60,6 +60,7 @@ func (e Engine) StreamDigests() (chan []Digest, chan error) { wg := &sync.WaitGroup{} reader := csv.NewReader(e.config.Reader) reader.Comma = e.config.Separator + reader.LazyQuotes = e.config.LazyQuotes for { lines, eofReached, err := getNextNLines(reader)