Skip to content
This repository has been archived by the owner on Nov 9, 2024. It is now read-only.

Commit

Permalink
working on filter system for processing.
Browse files Browse the repository at this point in the history
  • Loading branch information
AnalogJ committed Feb 16, 2020
1 parent 12fc6b4 commit f9cc9ee
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 12 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ require (
github.com/mattn/go-colorable v0.0.9 // indirect
github.com/mattn/go-isatty v0.0.10 // indirect
github.com/rogpeppe/go-internal v1.5.2 // indirect
github.com/sabhiram/go-gitignore v0.0.0-20180611051255-d3107576ba94
github.com/sirupsen/logrus v1.4.2
github.com/spf13/pflag v1.0.5 // indirect
github.com/streadway/amqp v0.0.0-20190827072141-edfb9018d271
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ github.com/rogpeppe/go-internal v1.4.0/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTE
github.com/rogpeppe/go-internal v1.5.2 h1:qLvObTrvO/XRCqmkKxUlOBc48bI3efyDuAZe25QiF0w=
github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/sabhiram/go-gitignore v0.0.0-20180611051255-d3107576ba94 h1:G04eS0JkAIVZfaJLjla9dNxkJCPiKIGZlw9AfOhzOD0=
github.com/sabhiram/go-gitignore v0.0.0-20180611051255-d3107576ba94/go.mod h1:b18R55ulyQ/h3RaWyloPyER7fWQVZvimKKhnI5OfrJQ=
github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
Expand Down
46 changes: 46 additions & 0 deletions pkg/model/filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package model

import (
"github.com/sabhiram/go-gitignore"
)

// Filter describes which paths should be processed, expressed as two lists of
// patterns that are compiled with the go-gitignore library.
type Filter struct {
	// Include holds the patterns decoded from the "include" JSON key.
	Include []string `json:"include"`
	// Exclude holds the patterns decoded from the "exclude" JSON key.
	Exclude []string `json:"exclude"`

	// Compiled matchers for Include and Exclude respectively. They are
	// unexported, so JSON decoding leaves them nil; ValidPath fills them in
	// the first time it is called.
	incMatcher, excMatcher *ignore.GitIgnore
}

// ValidPath reports whether pathToTest should be processed.
//
// The include and exclude pattern lists are compiled on first use and cached
// on the receiver. If either list fails to compile, the path is treated as
// valid. Otherwise a path is valid when it matches an include pattern, or when
// it does not match any exclude pattern; paths matching neither list are valid.
func (flt *Filter) ValidPath(pathToTest string) bool {
	// Lazily build the include matcher; a failed compile lets everything through.
	if flt.incMatcher == nil {
		compiled, compileErr := ignore.CompileIgnoreLines(flt.Include...)
		if compileErr != nil {
			return true
		}
		flt.incMatcher = compiled
	}

	// Lazily build the exclude matcher under the same rules.
	if flt.excMatcher == nil {
		compiled, compileErr := ignore.CompileIgnoreLines(flt.Exclude...)
		if compileErr != nil {
			return true
		}
		flt.excMatcher = compiled
	}

	// Includes always override excludes: the exclude matcher is only consulted
	// when the include matcher did not match, and anything that is not
	// excluded is accepted by default.
	return flt.incMatcher.MatchesPath(pathToTest) || !flt.excMatcher.MatchesPath(pathToTest)
}
103 changes: 103 additions & 0 deletions pkg/model/filter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package model

import (
"encoding/json"
"github.com/stretchr/testify/assert"
"testing"
)

// TestFilter_ValidPath checks a Filter built directly from Go slices: included
// names are accepted, dot-files are rejected, and an include match takes
// precedence over an exclude match on the same path.
func TestFilter_ValidPath(t *testing.T) {
	flt := Filter{
		Include: []string{"hello", "helloworld"},
		Exclude: []string{".DS_Store", ".*"},
	}

	// Cases are evaluated in the listed order against the same Filter value.
	cases := []struct {
		path  string
		valid bool
	}{
		{"subdirectory/hello", true},

		{".DS_Store", false},
		{"subdirectory/.DS_Store", false},
		{"subdirectory/.anything", false},
		{".anything", false},

		{"hello/.DS_Store", true},
	}

	for _, tc := range cases {
		if tc.valid {
			assert.True(t, flt.ValidPath(tc.path))
		} else {
			assert.False(t, flt.ValidPath(tc.path))
		}
	}
}

// TestFilter_FromJson decodes a Filter from a JSON document and then checks a
// set of paths against the decoded include/exclude lists.
func TestFilter_FromJson(t *testing.T) {
	const payload = `
{
"include": [
"*.doc",
"*.docx",
"*.xls",
"*.xlsx",
"*.ppt",
"*.pptx",
"*.pages",
"*.numbers",
"*.key",
"*.pdf",
"*.rtf",
"*.md",
"*.jpg",
"*.jpeg",
"*.png",
"*.gif",
"*.webp",
"*.tiff",
"*.tif",
"*.html"
],
"exclude": [
".*",
".DS_Store",
".AppleDouble",
".LSOverride",
"._*",
".DocumentRevisions-V100",
".fseventsd",
".Spotlight-V100",
".TemporaryItems",
".Trashes",
".VolumeIcon.icns",
".com.apple.timemachine.donotpresent",
".AppleDB",
".AppleDesktop",
"Network Trash Folder",
"Temporary Items",
".apdisk",
"Thumbs.db",
"Thumbs.db:encryptable",
"ehthumbs.db",
"ehthumbs_vista.db",
"*.stackdump",
"Desktop.ini",
"desktop.ini",
"*.cab",
"*.msi",
"*.msix",
"*.msm",
"*.msp",
"*.lnk"
]
}
`

	var flt Filter
	decodeErr := json.Unmarshal([]byte(payload), &flt)
	assert.Nil(t, decodeErr)

	// Cases are evaluated in the listed order against the decoded Filter.
	cases := []struct {
		path  string
		valid bool
	}{
		{"subdirectory/hello.png", true},
		{"subdirectory/hello.jpeg", true},
		{"subdirectory/hello.gif", true},
		{"subdirectory/embedded.msi/inpath", true},

		{".DS_Store", false},
		{"subdirectory/.DS_Store", false},
		{"subdirectory/Thumbs.db", false},
		{"helloworld/testing.cab", false},
		{"subdirectory/.Trashes", false},
	}

	for _, tc := range cases {
		if tc.valid {
			assert.True(t, flt.ValidPath(tc.path))
		} else {
			assert.False(t, flt.ValidPath(tc.path))
		}
	}
}
31 changes: 31 additions & 0 deletions pkg/processor/api/filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package api

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"

	"github.com/analogj/lodestone-processor/pkg/model"
)

// GetIncludeExcludeData fetches the include/exclude filter definition from the
// API server and decodes it into a model.Filter.
//
// The request is sent to apiEndpoint with its path replaced by
// /api/v1/data/filetypes.json. The replacement is made on a copy, so the
// caller's URL is left untouched and can safely be reused afterwards.
//
// A zero model.Filter and a non-nil error are returned when apiEndpoint is
// nil, the request fails, the server responds with a non-2xx status, the body
// cannot be read, or the body cannot be decoded as JSON into model.Filter.
func GetIncludeExcludeData(apiEndpoint *url.URL) (model.Filter, error) {
	if apiEndpoint == nil {
		return model.Filter{}, fmt.Errorf("retrieving filter data: nil api endpoint")
	}

	// Copy the URL before changing its path: callers keep using the same
	// *url.URL for later requests and must not see it rewritten.
	filterURL := *apiEndpoint
	filterURL.Path = "/api/v1/data/filetypes.json"

	resp, err := http.Get(filterURL.String())
	if err != nil {
		return model.Filter{}, fmt.Errorf("retrieving filter data: %w", err)
	}
	defer resp.Body.Close()

	// http.Get only reports transport-level failures; an error response from
	// the server must be rejected explicitly, otherwise a JSON error body
	// would silently decode into an empty filter.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return model.Filter{}, fmt.Errorf("retrieving filter data from %s: unexpected status %s", filterURL.String(), resp.Status)
	}

	bodyJson, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return model.Filter{}, fmt.Errorf("reading filter data: %w", err)
	}

	var filter model.Filter
	if err := json.Unmarshal(bodyJson, &filter); err != nil {
		// Do not hand back a partially decoded filter together with an error.
		return model.Filter{}, fmt.Errorf("decoding filter data: %w", err)
	}

	return filter, nil
}
26 changes: 20 additions & 6 deletions pkg/processor/document/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type DocumentProcessor struct {
elasticsearchMappingOverride string
elasticsearchClient *elasticsearch.Client
mappings *packr.Box
filter *model.Filter
}

func CreateDocumentProcessor(apiEndpoint string, storageThumbnailBucket string, tikaEndpoint string, elasticsearchEndpoint string, elasticsearchIndex string, elasticsearchMapping string) (DocumentProcessor, error) {
Expand All @@ -46,6 +47,12 @@ func CreateDocumentProcessor(apiEndpoint string, storageThumbnailBucket string,
return DocumentProcessor{}, err
}

//retrieve the filters (include/excludes) from the API
filterData, err := api.GetIncludeExcludeData(apiEndpointUrl)
if err != nil {
return DocumentProcessor{}, err
}

tikaEndpointUrl, err := url.Parse(tikaEndpoint)
if err != nil {
return DocumentProcessor{}, err
Expand All @@ -66,6 +73,7 @@ func CreateDocumentProcessor(apiEndpoint string, storageThumbnailBucket string,
elasticsearchIndex: elasticsearchIndex,
elasticsearchMappingOverride: elasticsearchMapping,
mappings: &box,
filter: &filterData,
}

//ensure the elastic search index exists (do this once on startup)
Expand Down Expand Up @@ -93,23 +101,29 @@ func CreateDocumentProcessor(apiEndpoint string, storageThumbnailBucket string,

func (dp *DocumentProcessor) Process(body []byte) error {

//make a temporary directory for subsequent processing (original file download, and thumb generation)
dir, err := ioutil.TempDir("", "doc")
var event model.S3Event
err := json.Unmarshal(body, &event)
if err != nil {
return err
}
defer os.RemoveAll(dir) // clean up

var event model.S3Event
err = json.Unmarshal(body, &event)
docBucketName, docBucketPath, err := api.GenerateStoragePath(event)
if err != nil {
return err
}

docBucketName, docBucketPath, err := api.GenerateStoragePath(event)
//determine if we should even be processing this document
includeDocument := dp.filter.ValidPath(docBucketPath)
if !includeDocument {
log.Infof("Ignoring document, matches exclude pattern (%s, %s)", docBucketName, docBucketPath)
}

//make a temporary directory for subsequent processing (original file download, and thumb generation)
dir, err := ioutil.TempDir("", "doc")
if err != nil {
return err
}
defer os.RemoveAll(dir) // clean up

if event.Records[0].EventName == "s3:ObjectRemoved:Delete" {
log.Debugln("Attempting to delete file")
Expand Down
28 changes: 22 additions & 6 deletions pkg/processor/thumbnail/thumbnail.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"github.com/analogj/lodestone-processor/pkg/model"
"github.com/analogj/lodestone-processor/pkg/processor/api"
log "github.com/sirupsen/logrus"
"gopkg.in/gographics/imagick.v2/imagick"
"io/ioutil"
"math"
Expand All @@ -15,6 +16,7 @@ import (

type ThumbnailProcessor struct {
apiEndpoint *url.URL
filter *model.Filter
}

func CreateThumbnailProcessor(apiEndpoint string) (ThumbnailProcessor, error) {
Expand All @@ -24,31 +26,45 @@ func CreateThumbnailProcessor(apiEndpoint string) (ThumbnailProcessor, error) {
return ThumbnailProcessor{}, err
}

//retrieve the filters (include/excludes) from the API
filterData, err := api.GetIncludeExcludeData(apiEndpointUrl)
if err != nil {
return ThumbnailProcessor{}, err
}

tp := ThumbnailProcessor{
apiEndpoint: apiEndpointUrl,
filter: &filterData,
}

return tp, nil
}

func (tp *ThumbnailProcessor) Process(body []byte) error {
//make a temporary directory for subsequent processing (original file download, and thumb generation)
dir, err := ioutil.TempDir("", "thumb")

var event model.S3Event
err := json.Unmarshal(body, &event)
if err != nil {
return err
}
defer os.RemoveAll(dir) // clean up

var event model.S3Event
err = json.Unmarshal(body, &event)
docBucketName, docBucketPath, err := api.GenerateStoragePath(event)
if err != nil {
return err
}

docBucketName, docBucketPath, err := api.GenerateStoragePath(event)
//determine if we should even be processing this document
includeDocument := tp.filter.ValidPath(docBucketPath)
if !includeDocument {
log.Infof("Ignoring document, matches exclude pattern (%s, %s)", docBucketName, docBucketPath)
}

//make a temporary directory for subsequent processing (original file download, and thumb generation)
dir, err := ioutil.TempDir("", "thumb")
if err != nil {
return err
}
defer os.RemoveAll(dir) // clean up

filePath, err := api.GetFile(tp.apiEndpoint, docBucketName, docBucketPath, dir)
if err != nil {
Expand Down

0 comments on commit f9cc9ee

Please sign in to comment.