From 7adecca8fcb67db7770524685c0aa3993d081a9d Mon Sep 17 00:00:00 2001 From: aswinkarthik93 Date: Mon, 16 Apr 2018 23:29:48 +0530 Subject: [PATCH] Add feature to specify a subset of columns to consider for the value hash --- cmd/config.go | 34 +++++++++++++++++++----------- cmd/config_test.go | 25 +++++++++++++++++----- cmd/run.go | 19 +++++------------ pkg/digest/digest.go | 32 +++++++++++++++++----------- pkg/digest/digest_test.go | 12 ++++++----- pkg/digest/positions.go | 24 +++++++++++++++++++++ pkg/digest/positions_test.go | 41 ++++++++++++++++++++++++++++++++++++ 7 files changed, 139 insertions(+), 48 deletions(-) create mode 100644 pkg/digest/positions.go create mode 100644 pkg/digest/positions_test.go diff --git a/cmd/config.go b/cmd/config.go index ee4791b..a30dc64 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -4,6 +4,8 @@ import ( "io" "log" "os" + + "github.com/aswinkarthik93/csvdiff/pkg/digest" ) var config Config @@ -13,33 +15,41 @@ func init() { } type Config struct { - KeyPositions []int - Base string - Delta string - Additions string - Modifications string + PrimaryKeyPositions []int + ValueColumnPositions []int + Base string + Delta string + Additions string + Modifications string } -func (c Config) GetKeyPositions() []int { - if len(c.KeyPositions) > 0 { - return c.KeyPositions +func (c *Config) GetPrimaryKeys() digest.Positions { + if len(c.PrimaryKeyPositions) > 0 { + return c.PrimaryKeyPositions } return []int{0} } -func (c Config) GetBaseReader() io.Reader { +func (c *Config) GetValueColumns() digest.Positions { + if len(c.ValueColumnPositions) > 0 { + return c.ValueColumnPositions + } + return []int{} +} + +func (c *Config) GetBaseReader() io.Reader { return getReader(c.Base) } -func (c Config) GetDeltaReader() io.Reader { +func (c *Config) GetDeltaReader() io.Reader { return getReader(c.Delta) } -func (c Config) AdditionsWriter() io.WriteCloser { +func (c *Config) AdditionsWriter() io.WriteCloser { return getWriter(c.Additions) } -func (c Config) 
ModificationsWriter() io.WriteCloser { +func (c *Config) ModificationsWriter() io.WriteCloser { return getWriter(c.Modifications) } diff --git a/cmd/config_test.go b/cmd/config_test.go index d77ead2..30338d6 100644 --- a/cmd/config_test.go +++ b/cmd/config_test.go @@ -3,13 +3,28 @@ package cmd import ( "testing" + "github.com/aswinkarthik93/csvdiff/pkg/digest" "github.com/stretchr/testify/assert" ) -func TestGetKeyPositions(t *testing.T) { - config := Config{KeyPositions: []int{0, 1}} - assert.Equal(t, []int{0, 1}, config.GetKeyPositions()) +func TestPrimaryKeyPositions(t *testing.T) { + config := Config{PrimaryKeyPositions: []int{0, 1}} + assert.Equal(t, digest.Positions([]int{0, 1}), config.GetPrimaryKeys()) - config = Config{KeyPositions: []int{}} - assert.Equal(t, []int{0}, config.GetKeyPositions()) + config = Config{PrimaryKeyPositions: []int{}} + assert.Equal(t, digest.Positions([]int{0}), config.GetPrimaryKeys()) + + config = Config{} + assert.Equal(t, digest.Positions([]int{0}), config.GetPrimaryKeys()) +} + +func TestValueColumnPositions(t *testing.T) { + config := Config{ValueColumnPositions: []int{0, 1}} + assert.Equal(t, digest.Positions([]int{0, 1}), config.GetValueColumns()) + + config = Config{ValueColumnPositions: []int{}} + assert.Equal(t, digest.Positions([]int{}), config.GetValueColumns()) + + config = Config{} + assert.Equal(t, digest.Positions([]int{}), config.GetValueColumns()) } diff --git a/cmd/run.go b/cmd/run.go index 50f9b9b..68bff69 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -19,7 +19,6 @@ import ( "fmt" "io" "log" - "os" "sync" "github.com/aswinkarthik93/csvdiff/pkg/digest" @@ -54,7 +53,8 @@ func init() { digestCmd.Flags().StringVarP(&config.Base, "base", "b", "", "The base csv file") digestCmd.Flags().StringVarP(&config.Delta, "delta", "d", "", "The delta csv file") - digestCmd.Flags().IntSliceVarP(&config.KeyPositions, "key-positions", "k", []int{0}, "Primary key positions of the Input CSV as comma separated values Eg: 1,2") + 
digestCmd.Flags().IntSliceVarP(&config.PrimaryKeyPositions, "primary-key", "p", []int{0}, "Primary key positions of the Input CSV as comma separated values Eg: 1,2") + digestCmd.Flags().IntSliceVarP(&config.ValueColumnPositions, "value-columns", "", []int{}, "Value key positions of the Input CSV as comma separated values Eg: 1,2. Default is entire row") digestCmd.Flags().BoolVarP(&debug, "debug", "", false, "Debug mode") digestCmd.Flags().StringVarP(&config.Additions, "additions", "a", "STDOUT", "Output stream for the additions in delta file") digestCmd.Flags().StringVarP(&config.Modifications, "modifications", "m", "STDOUT", "Output stream for the modifications in delta file") @@ -70,18 +70,9 @@ func run() { log.Fatal(err) } - baseConfig := digest.DigestConfig{ - KeyPositions: config.GetKeyPositions(), - Reader: config.GetBaseReader(), - Writer: os.Stdout, - } + baseConfig := digest.NewConfig(config.GetBaseReader(), false, config.GetPrimaryKeys(), config.GetValueColumns()) - deltaConfig := digest.DigestConfig{ - KeyPositions: config.GetKeyPositions(), - Reader: config.GetDeltaReader(), - Writer: os.Stdout, - SourceMap: true, - } + deltaConfig := digest.NewConfig(config.GetDeltaReader(), true, config.GetPrimaryKeys(), config.GetValueColumns()) var wg sync.WaitGroup baseChannel := make(chan message) @@ -104,7 +95,7 @@ type message struct { sourceMap map[uint64]string } -func generateInBackground(name string, config digest.DigestConfig, wg *sync.WaitGroup, channel chan<- message) { +func generateInBackground(name string, config *digest.Config, wg *sync.WaitGroup, channel chan<- message) { digest, sourceMap, err := digest.Create(config) if err != nil { panic(err) diff --git a/pkg/digest/digest.go b/pkg/digest/digest.go index cebce29..e293826 100644 --- a/pkg/digest/digest.go +++ b/pkg/digest/digest.go @@ -8,6 +8,8 @@ import ( "github.com/cespare/xxhash" ) +const Separator = "," + // Digest represents the binding of the key of each csv line // and the digest that gets 
created for the entire line type Digest struct { @@ -18,28 +20,34 @@ type Digest struct { // CreateDigest creates a Digest for each line of csv. // There will be one Digest per line -func CreateDigest(csv []string, keyPositions []int) Digest { - keyCsv := make([]string, len(keyPositions)) - for i, pos := range keyPositions { - keyCsv[i] = csv[pos] - } - - row := strings.Join(csv, ",") - key := xxhash.Sum64String(strings.Join(keyCsv, ",")) - digest := xxhash.Sum64String(row) +func CreateDigest(csv []string, pKey Positions, pRow Positions) Digest { + row := strings.Join(csv, Separator) + key := xxhash.Sum64String(pKey.MapToValue(csv)) + digest := xxhash.Sum64String(pRow.MapToValue(csv)) return Digest{Key: key, Value: digest, Row: row} } -type DigestConfig struct { +type Config struct { KeyPositions []int + Key Positions + Value Positions Reader io.Reader Writer io.Writer SourceMap bool } -func Create(config DigestConfig) (map[uint64]uint64, map[uint64]string, error) { +func NewConfig(r io.Reader, createSourceMap bool, primaryKey Positions, valueColumns Positions) *Config { + return &Config{ + Reader: r, + SourceMap: createSourceMap, + Key: primaryKey, + Value: valueColumns, + } +} + +func Create(config *Config) (map[uint64]uint64, map[uint64]string, error) { reader := csv.NewReader(config.Reader) output := make(map[uint64]uint64) @@ -52,7 +60,7 @@ func Create(config DigestConfig) (map[uint64]uint64, map[uint64]string, error) { } return nil, nil, err } - digest := CreateDigest(line, config.KeyPositions) + digest := CreateDigest(line, config.Key, config.Value) output[digest.Key] = digest.Value if config.SourceMap { sourceMap[digest.Key] = digest.Row diff --git a/pkg/digest/digest_test.go b/pkg/digest/digest_test.go index aeca93e..cc4604c 100644 --- a/pkg/digest/digest_test.go +++ b/pkg/digest/digest_test.go @@ -16,26 +16,27 @@ func TestCreateDigest(t *testing.T) { expectedDigest := Digest{Key: firstKey, Value: firstLineDigest, Row: firstLine} - actualDigest := 
CreateDigest(strings.Split(firstLine, ","), []int{0}) + actualDigest := CreateDigest(strings.Split(firstLine, Separator), []int{0}, []int{}) assert.Equal(t, expectedDigest, actualDigest) } func TestDigestForFile(t *testing.T) { - firstLine := "1,first-line" + firstLine := "1,first-line,some-columne,friday" firstKey := xxhash.Sum64String("1") firstDigest := xxhash.Sum64String(firstLine) - secondLine := "2,second-line" + secondLine := "2,second-line,nobody-needs-this,saturday" secondKey := xxhash.Sum64String("2") secondDigest := xxhash.Sum64String(secondLine) var outputBuffer bytes.Buffer - testConfig := DigestConfig{ + testConfig := &Config{ Reader: strings.NewReader(firstLine + "\n" + secondLine), Writer: &outputBuffer, KeyPositions: []int{0}, + Key: []int{0}, SourceMap: true, } @@ -49,10 +50,11 @@ func TestDigestForFile(t *testing.T) { assert.Equal(t, expectedSourceMap, sourceMap) // No source map - testConfigWithoutSourceMap := DigestConfig{ + testConfigWithoutSourceMap := &Config{ Reader: strings.NewReader(firstLine + "\n" + secondLine), Writer: &outputBuffer, KeyPositions: []int{0}, + Key: []int{0}, SourceMap: false, } diff --git a/pkg/digest/positions.go b/pkg/digest/positions.go new file mode 100644 index 0000000..cbbb5ff --- /dev/null +++ b/pkg/digest/positions.go @@ -0,0 +1,24 @@ +package digest + +import "strings" + +type Positions []int + +func (p Positions) MapToValue(csv []string) string { + if p.Length() == 0 { + return strings.Join(csv, Separator) + } + output := make([]string, p.Length()) + for i, pos := range p.Items() { + output[i] = csv[pos] + } + return strings.Join(output, Separator) +} + +func (p Positions) Length() int { + return len([]int(p)) +} + +func (p Positions) Items() []int { + return []int(p) +} diff --git a/pkg/digest/positions_test.go b/pkg/digest/positions_test.go new file mode 100644 index 0000000..90b0c2f --- /dev/null +++ b/pkg/digest/positions_test.go @@ -0,0 +1,41 @@ +package digest + +import ( + "strings" + "testing" + + 
"github.com/stretchr/testify/assert" +) + +func TestPositionsMapValues(t *testing.T) { + positions := Positions([]int{0, 3}) + csv := []string{"zero", "one", "two", "three"} + + actual := positions.MapToValue(csv) + expected := "zero,three" + + assert.Equal(t, expected, actual) +} + +func TestPositionsMapValuesReturnsCompleteStringCsvIfEmpty(t *testing.T) { + positions := Positions([]int{}) + csv := []string{"zero", "one", "two", "three"} + + actual := positions.MapToValue(csv) + expected := strings.Join(csv, Separator) + + assert.Equal(t, expected, actual) +} + +func TestPositionsLength(t *testing.T) { + positions := Positions([]int{0, 3}) + + assert.Equal(t, 2, positions.Length()) +} + +func TestPositionsItems(t *testing.T) { + items := []int{0, 3} + positions := Positions(items) + + assert.Equal(t, items, positions.Items()) +}