Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 64bf4da
Author: Ross Spencer <all.along.the.watchtower2001@gmail.com>
Date:   Mon Mar 20 15:12:56 2023 +0000

    miscellaneous edit to prompt a merge check

commit 706209d
Merge: dcb15c2 eb6f061
Author: Richard Lehane <richard.lehane@gmail.com>
Date:   Mon Mar 20 12:42:20 2023 +0100

    Merge branch 'develop' into dev/add-pronom-type

commit dcb15c2
Author: Richard Lehane <richard@itforarchivists.com>
Date:   Mon Mar 20 08:12:50 2023 +0100

    fix indexes used by droid writer

commit c95e02d
Author: Richard Lehane <richard@itforarchivists.com>
Date:   Sun Mar 19 22:58:45 2023 +0100

    add "noclass" flag to allow omitting format class

commit b958528
Author: Richard Lehane <richard@itforarchivists.com>
Date:   Sun Mar 19 13:22:00 2023 +0100

    use Limit

commit 957c2e7
Author: Ross Spencer <all.along.the.watchtower2001@gmail.com>
Date:   Sun Feb 5 21:07:14 2023 +0100

    Add test for DROID CSV header output

    Ensures that the DROID header doesn't change in code unless it
    is explicitly made to do so.

commit 9f94a77
Author: Ross Spencer <all.along.the.watchtower2001@gmail.com>
Date:   Wed Jan 4 16:50:29 2023 +0100

    Create in-memory filesystem for PRONOM skeletons

    We can avoid writing to disk and make the tests here more portable by
    reading from an in-memory filesystem. The skeletons themselves are
    small and so can be easily stored in-line as strings and then turned
    into byte objects.

    Given the refactor to in-memory objects, we also take the opportunity
    to add a file that won't identify with the minimal PRONOM signature
    file and PRONOM reports. Type should be a nil-string as with many
    of the other fields.

commit e27bb70
Author: Ross Spencer <all.along.the.watchtower2001@gmail.com>
Date:   Wed Dec 28 12:47:55 2022 +0100

    Linting fixes

    PRONOM identifier related linting fixes for the different source
    files touched by the PRONOM types additions.

commit 2bdc899
Author: Ross Spencer <all.along.the.watchtower2001@gmail.com>
Date:   Tue Dec 27 18:33:22 2022 +0100

    Add tests for PRONOM types work

    Tests are added for the PRONOM types work along with new helper
    functions for making Siegfried tests more discrete and maintainable.

commit 0b02110
Author: Ross Spencer <all.along.the.watchtower2001@gmail.com>
Date:   Tue Dec 27 17:45:28 2022 +0100

    Add format type to Siegfried PRONOM output
  • Loading branch information
richardlehane committed Mar 20, 2023
1 parent 457c7b3 commit 98516b1
Show file tree
Hide file tree
Showing 15 changed files with 520 additions and 68 deletions.
4 changes: 4 additions & 0 deletions cmd/roy/roy.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ var (
noxml = build.Bool("noxml", false, "skip XML matcher")
noriff = build.Bool("noriff", false, "skip RIFF matcher")
noreports = build.Bool("noreports", false, "build directly from DROID file rather than PRONOM reports")
noclass = build.Bool("noclass", false, "omit format classes from the signature file")
doubleup = build.Bool("doubleup", false, "include byte signatures for formats that also have container signatures")
rng = build.Int("range", config.Range(), "define a maximum range for segmentation")
distance = build.Int("distance", config.Distance(), "define a maximum distance for segmentation")
Expand Down Expand Up @@ -416,6 +417,9 @@ the DROID signature file you should also include a regular signature extension
if *noreports {
opts = append(opts, config.SetNoReports())
}
if *noclass {
opts = append(opts, config.SetNoClass())
}
if *doubleup {
opts = append(opts, config.SetDoubleUp())
}
Expand Down
6 changes: 3 additions & 3 deletions cmd/sf/longpath.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
}
if err != nil {
if coerr {
printFile(ctxts, gf(path, "", time.Time{}, 0), WalkError{path, err})
printFile(ctxts, gf(path, "", time.Time{}, 0), walkError{path, err})
return nil
}
return WalkError{path, err}
return walkError{path, err}
}
if info.IsDir() {
if norecurse && path != root {
Expand All @@ -50,7 +50,7 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
}
// zero user read permissions mask, octal 400 (decimal 256)
if !info.Mode().IsRegular() || info.Mode()&256 == 0 {
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), ModeError(info.Mode()))
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), modeError(info.Mode()))
return nil
}
identifyFile(gf(path, "", info.ModTime(), info.Size()), ctxts, gf)
Expand Down
6 changes: 3 additions & 3 deletions cmd/sf/longpath_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
info, err = retryStat(path, err) // retry stat in case is a windows long path error
if err != nil {
if coerr {
printFile(ctxts, gf(path, "", time.Time{}, 0), WalkError{path, err})
printFile(ctxts, gf(path, "", time.Time{}, 0), walkError{path, err})
return nil
}
return WalkError{path, err}
return walkError{path, err}
}
lp, sp = longpath(path), path
retry = true
Expand All @@ -107,7 +107,7 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
return nil
}
if !info.Mode().IsRegular() {
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), ModeError(info.Mode()))
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), modeError(info.Mode()))
return nil
}
identifyFile(gf(shortpath(path, orig), "", info.ModTime(), info.Size()), ctxts, gf)
Expand Down
186 changes: 186 additions & 0 deletions cmd/sf/pronom_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package main

import (
"encoding/hex"
"path/filepath"
"reflect"
"sort"
"testing"
"testing/fstest"

"github.com/richardlehane/siegfried"
"github.com/richardlehane/siegfried/pkg/config"
"github.com/richardlehane/siegfried/pkg/pronom"
)

// DataPath is the relative path from this package to the roy data
// directory (cmd/roy/data). TestPronom passes it to config.SetHome.
var DataPath = filepath.Join("..", "..", "cmd", "roy", "data")

// pronomIdentificationTests is one row of the table-driven PRONOM
// identification tests. The fields are declared in the same order in which
// TestPronom reads them from an identification's Values() slice, and the
// rows of the expected-results table are written positionally, so the
// field order must not be changed without updating those literals.
type pronomIdentificationTests struct {
	identifier string // identifier name, e.g. "pronom"
	puid       string // PUID, e.g. "fmt/11", or "UNKNOWN"
	label      string // format label
	version    string // format version
	mime       string // MIME type
	types      string // format type(s)
	details    string // match details
	error      string // error text, e.g. "no match"
}

// skeletons maps a skeleton file name to its in-memory content. It is
// filled by makeSkeletons and wrapped in an fstest.MapFS by TestPronom.
var skeletons = make(map[string]*fstest.MapFile)

// minimalPronom lists the PUIDs the PRONOM identifier is limited to in
// TestPronom.
var minimalPronom = []string{"fmt/1", "fmt/3", "fmt/5", "fmt/11", "fmt/14"}

// makeSkeletons populates the global skeletons map from hex-encoded byte
// sequences, to save having to store skeleton files on disk and read from
// them.
//
// It panics if a fixture is not valid hex: a malformed fixture is a
// mistake in this file and should not be silently turned into truncated
// test data.
func makeSkeletons() {
	fixtures := map[string]string{
		"fmt-11-signature-id-58.png":  "89504e470d0a1a0a0000000d494844520000000049454e44ae426082",
		"fmt-14-signature-id-123.pdf": "255044462d312e302525454f46",
		"fmt-1-signature-id-1032.wav": "" +
			"524946460000000057415645000000000000000000000000000000000000" +
			"000062657874000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"00000000000000000000000000000000000000000000000000000000",
		"fmt-5-signature-id-51.avi": "" +
			"524946460000000041564920000000000000000000000000000000000000" +
			"00004c495354000000006864726c61766968000000000000000000000000" +
			"00000000000000004c495354000000006d6f7669",
		"fmt-3-signature-id-18.gif": "4749463837613b",
		// This fixture was previously written as "badf00d", which has an
		// odd number of hex digits: hex.DecodeString reported ErrLength
		// and only the first three bytes were kept. "badf00" yields the
		// same three bytes without an error.
		"badf00d.unknown": "badf00",
	}
	for name, encoded := range fixtures {
		data, err := hex.DecodeString(encoded)
		if err != nil {
			panic("makeSkeletons: invalid hex for " + name + ": " + err.Error())
		}
		skeletons[name] = &fstest.MapFile{Data: data}
	}
}

// pronomIDs holds the identification results TestPronom expects for the
// skeleton files built by makeSkeletons. Rows are written positionally in
// the order: identifier, PUID, label, version, MIME, types, details, error.
var pronomIDs = []pronomIdentificationTests{
	{"pronom", "UNKNOWN", "", "", "", "", "", "no match"},
	{
		"pronom", "fmt/1", "Broadcast WAVE", "0 Generic", "audio/x-wav", "Audio",
		"extension match wav; byte match at [[0 12] [32 356]]",
		"",
	},
	{
		"pronom", "fmt/11", "Portable Network Graphics", "1.0", "image/png", "Image (Raster)",
		"extension match png; byte match at [[0 16] [16 12]]",
		"",
	},
	{
		"pronom", "fmt/14", "Acrobat PDF 1.0 - Portable Document Format", "1.0", "application/pdf", "Page Description",
		"extension match pdf; byte match at [[0 8] [8 5]]",
		"",
	},
	{
		"pronom", "fmt/3", "Graphics Interchange Format", "87a", "image/gif", "Image (Raster)",
		"extension match gif; byte match at [[0 6] [6 1]]",
		"",
	},
	{
		"pronom", "fmt/5", "Audio/Video Interleaved Format", "", "video/x-msvideo", "Audio, Video",
		"extension match avi; byte match at [[0 12] [32 16] [68 12]]",
		"",
	},
}

// TestPronom checks that PRONOM identification results for a minimized
// PRONOM dataset are correct and contain the information we anticipate.
//
// It builds a PRONOM identifier limited to minimalPronom, identifies every
// in-memory skeleton file and compares the values returned for each file
// with the rows of pronomIDs. Both sides are sorted by PUID so that the
// comparison does not depend on directory listing order.
func TestPronom(t *testing.T) {
	sf := siegfried.New()
	config.SetHome(DataPath)
	// Reset the package-level configuration however the test exits, so an
	// early t.Fatalf cannot leak the limit into other tests.
	defer config.Clear()()
	identifier, err := pronom.New(config.SetLimit(minimalPronom))
	if err != nil {
		// Nothing below can work without an identifier.
		t.Fatalf("Error creating new PRONOM identifier: %s", err)
	}
	sf.Add(identifier)
	makeSkeletons()
	skeletonFS := fstest.MapFS(skeletons)
	testDirListing, err := skeletonFS.ReadDir(".")
	if err != nil {
		t.Fatalf("Error reading test files directory: %s", err)
	}
	const resultLen = 8
	results := make([]pronomIdentificationTests, 0, len(testDirListing))
	for _, entry := range testDirListing {
		testFilePath := filepath.Join(".", entry.Name())
		reader, err := skeletonFS.Open(entry.Name())
		if err != nil {
			t.Fatalf("Error opening test file %s: %s", entry.Name(), err)
		}
		res, err := sf.Identify(reader, testFilePath, "")
		reader.Close()
		if len(res) == 0 {
			t.Fatalf("No identification returned for %s (identify error: %v)", entry.Name(), err)
		}
		if err != nil {
			// Recorded for diagnosis only; the values are still compared.
			t.Logf("Identify reported an error for %s: %v", entry.Name(), err)
		}
		result := res[0].Values()
		if len(result) != resultLen {
			// Fatal: indexing below would panic on a shorter slice.
			t.Fatalf("Result len: %d not %d", len(result), resultLen)
		}
		results = append(results, pronomIdentificationTests{
			result[0], // identifier
			result[1], // PUID
			result[2], // label
			result[3], // version
			result[4], // mime
			result[5], // types
			result[6], // details
			result[7], // error
		})
	}
	// Sort expected results and received results to make them comparable.
	// The expected rows are copied first so the shared table is not
	// reordered as a side effect of running the test.
	expected := make([]pronomIdentificationTests, len(pronomIDs))
	copy(expected, pronomIDs)
	sort.Slice(expected, func(i, j int) bool {
		return expected[i].puid < expected[j].puid
	})
	sort.Slice(results, func(i, j int) bool {
		return results[i].puid < results[j].puid
	})
	if len(results) != len(expected) {
		t.Fatalf("Result count: %d not %d", len(results), len(expected))
	}
	// Compare results on a result by result basis.
	for idx, res := range results {
		if !reflect.DeepEqual(res, expected[idx]) {
			t.Errorf("Results not equal for %s; expected %v; got %v", res.puid, expected[idx], res)
		}
	}
}
2 changes: 1 addition & 1 deletion cmd/sf/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ func handleIdentify(w http.ResponseWriter, r *http.Request, s *siegfried.Siegfri
err = identify(ctxts, path, "", coerr, nrec, d, gf)
wg.Wait()
wr.Tail()
if _, ok := err.(WalkError); ok { // only dump out walk errors, other errors reported in result
if _, ok := err.(walkError); ok { // only dump out walk errors, other errors reported in result
io.WriteString(w, err.Error())
}
}
Expand Down
10 changes: 5 additions & 5 deletions cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ var (
ctxPool *sync.Pool
)

type ModeError os.FileMode
type modeError os.FileMode

func (me ModeError) Error() string {
func (me modeError) Error() string {
typ := "unknown"
switch {
case os.FileMode(me)&os.ModeDir == os.ModeDir:
Expand All @@ -95,12 +95,12 @@ func (me ModeError) Error() string {
return fmt.Sprintf("file is of type %s; only regular files can be scanned", typ)
}

type WalkError struct {
type walkError struct {
path string
err error
}

func (we WalkError) Error() string {
func (we walkError) Error() string {
return fmt.Sprintf("[FATAL] file access error for %s: %v", we.path, we.err)
}

Expand Down Expand Up @@ -432,7 +432,7 @@ func main() {
case *jsono:
w = writer.JSON(os.Stdout)
case *droido:
if !*replay && (len(s.Fields()) != 1 || len(s.Fields()[0]) != 7) {
if !*replay && (len(s.Fields()) != 1 || len(s.Fields()[0]) < 7) {
close(ctxts)
log.Fatalln("[FATAL] DROID output is limited to signature files with a single PRONOM identifier")
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,8 @@ func Clear() func() private {
return func() private {
identifier.name = ""
identifier.extend = nil
identifier.limit = nil
identifier.exclude = nil
loc.fdd = ""
mimeinfo.mi = ""
return private{}
Expand Down
22 changes: 19 additions & 3 deletions pkg/config/pronom.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var pronom = struct {
droid string // name of droid file e.g. DROID_SignatureFile_V78.xml
container string // e.g. container-signature-19770502.xml
reports string // directory where PRONOM reports are stored
noclass bool // omit class from the format info
doubleup bool // include byte signatures for formats that also have container signatures
extendc []string //container extensions
changesURL string
Expand Down Expand Up @@ -63,7 +64,7 @@ var pronom = struct {

// GETTERS

// DROID returns the location of the DROID signature file.
// Droid returns the location of the DROID signature file.
// If not set, infers the latest file.
func Droid() string {
if pronom.droid == "" {
Expand All @@ -79,7 +80,7 @@ func Droid() string {
return pronom.droid
}

// DROID base returns the base filename of the DROID signature file.
// DroidBase returns the base filename of the DROID signature file.
// If not set, infers the latest file.
func DroidBase() string {
if pronom.droid == "" {
Expand Down Expand Up @@ -163,6 +164,11 @@ func Reports() string {
return filepath.Join(siegfried.home, pronom.reports)
}

// NoClass reports whether the noclass flag has been set. When it is set,
// class is omitted from format infos.
func NoClass() bool {
return pronom.noclass
}

// DoubleUp reports whether the doubleup flag has been set. This will cause byte signatures to be built for formats where container signatures are also provided.
func DoubleUp() bool {
return pronom.doubleup
Expand All @@ -173,11 +179,12 @@ func ExcludeDoubles(puids, cont []string) []string {
return exclude(puids, cont)
}

// Extend reports whether a set of container signature extensions has been provided.
// ExtendC reports whether a set of container signature extensions has been provided.
func ExtendC() []string {
return extensionPaths(pronom.extendc)
}

// ChangesURL returns the URL for the PRONOM release notes, as held in the
// package-level pronom configuration.
func ChangesURL() string {
return pronom.changesURL
}
Expand Down Expand Up @@ -225,6 +232,14 @@ func SetNoReports() func() private {
}
}

// SetNoClass returns an option function that, when invoked, sets the
// noclass flag so that class is omitted from the format info.
func SetNoClass() func() private {
return func() private {
pronom.noclass = true
return private{}
}
}

// SetDoubleUp causes byte signatures to be built for formats where container signatures are also provided.
func SetDoubleUp() func() private {
return func() private {
Expand All @@ -248,6 +263,7 @@ func SetHarvestTimeout(d time.Duration) {
pronom.harvestTimeout = d
}

// SetHarvestThrottle sets a throttle value for downloading DROID reports.
// The value is applied immediately: d is stored directly in the
// package-level pronom configuration rather than returned as an option
// function.
func SetHarvestThrottle(d time.Duration) {
pronom.harvestThrottle = d
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/mimeinfo/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ func (r *Recorder) Satisfied(mt core.MatcherType) (bool, core.Hint) {
sort.Sort(r.ids)
if len(r.ids) > 0 && (r.ids[0].xmlMatch || (r.ids[0].magicScore > 0 && r.ids[0].ID != config.TextMIME())) {
if mt == core.ByteMatcher {
return true, core.Hint{r.Start(mt), nil}
return true, core.Hint{Exclude: r.Start(mt), Pivot: nil}
}
return true, core.Hint{}
}
Expand Down
Loading

0 comments on commit 98516b1

Please sign in to comment.