Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PRONOM types to PRONOM identifier #209

Closed
wants to merge 10 commits into from
4 changes: 4 additions & 0 deletions cmd/roy/roy.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ var (
noxml = build.Bool("noxml", false, "skip XML matcher")
noriff = build.Bool("noriff", false, "skip RIFF matcher")
noreports = build.Bool("noreports", false, "build directly from DROID file rather than PRONOM reports")
noclass = build.Bool("noclass", false, "omit format classes from the signature file")
doubleup = build.Bool("doubleup", false, "include byte signatures for formats that also have container signatures")
rng = build.Int("range", config.Range(), "define a maximum range for segmentation")
distance = build.Int("distance", config.Distance(), "define a maximum distance for segmentation")
Expand Down Expand Up @@ -416,6 +417,9 @@ the DROID signature file you should also include a regular signature extension
if *noreports {
opts = append(opts, config.SetNoReports())
}
if *noclass {
opts = append(opts, config.SetNoClass())
}
if *doubleup {
opts = append(opts, config.SetDoubleUp())
}
Expand Down
6 changes: 3 additions & 3 deletions cmd/sf/longpath.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
}
if err != nil {
if coerr {
printFile(ctxts, gf(path, "", time.Time{}, 0), WalkError{path, err})
printFile(ctxts, gf(path, "", time.Time{}, 0), walkError{path, err})
return nil
}
return WalkError{path, err}
return walkError{path, err}
}
if info.IsDir() {
if norecurse && path != root {
Expand All @@ -50,7 +50,7 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
}
// zero user read permissions mask, octal 400 (decimal 256)
if !info.Mode().IsRegular() || info.Mode()&256 == 0 {
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), ModeError(info.Mode()))
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), modeError(info.Mode()))
return nil
}
identifyFile(gf(path, "", info.ModTime(), info.Size()), ctxts, gf)
Expand Down
6 changes: 3 additions & 3 deletions cmd/sf/longpath_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
info, err = retryStat(path, err) // retry stat in case is a windows long path error
if err != nil {
if coerr {
printFile(ctxts, gf(path, "", time.Time{}, 0), WalkError{path, err})
printFile(ctxts, gf(path, "", time.Time{}, 0), walkError{path, err})
return nil
}
return WalkError{path, err}
return walkError{path, err}
}
lp, sp = longpath(path), path
retry = true
Expand All @@ -107,7 +107,7 @@ func identify(ctxts chan *context, root, orig string, coerr, norecurse, droid bo
return nil
}
if !info.Mode().IsRegular() {
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), ModeError(info.Mode()))
printFile(ctxts, gf(path, "", info.ModTime(), info.Size()), modeError(info.Mode()))
return nil
}
identifyFile(gf(shortpath(path, orig), "", info.ModTime(), info.Size()), ctxts, gf)
Expand Down
186 changes: 186 additions & 0 deletions cmd/sf/pronom_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package main

import (
"encoding/hex"
"path/filepath"
"reflect"
"sort"
"testing"
"testing/fstest"

"github.com/richardlehane/siegfried"
"github.com/richardlehane/siegfried/pkg/config"
"github.com/richardlehane/siegfried/pkg/pronom"
)

// DataPath is the directory handed to config.SetHome by the tests in this
// file: roy's data directory, expressed relative to this package.
var DataPath = filepath.Join("..", "..", "cmd", "roy", "data")

// pronomIdentificationTests is one row of the table-driven PRONOM tests. Its
// fields follow, in order, the eight values TestPronom reads from an
// identification result, so rows can be built positionally from that slice.
type pronomIdentificationTests struct {
	identifier string // name of the identifier that produced the result, e.g. "pronom"
	puid       string // PRONOM unique identifier, or "UNKNOWN" when nothing matched
	label      string // format name
	version    string // format version
	mime       string // MIME type
	types      string // PRONOM format type(s), e.g. "Image (Raster)"
	details    string // basis for the identification
	error      string // error text such as "no match"; empty for a positive match
}

// skeletons holds the in-memory test files, keyed by file name.
var skeletons = make(map[string]*fstest.MapFile)

// minimalPronom lists the PUIDs that the test identifier is limited to.
var minimalPronom = []string{"fmt/1", "fmt/3", "fmt/5", "fmt/11", "fmt/14"}

// makeSkeletons populates the global skeletons map from hex-encoded byte
// sequences, to save having to store skeleton files on disk and read them
// back.
//
// It panics if a fixture is not valid hexadecimal: a mistyped fixture would
// otherwise be silently truncated and the test would run against the wrong
// bytes.
func makeSkeletons() {
	files := map[string]string{
		"fmt-11-signature-id-58.png":  "89504e470d0a1a0a0000000d494844520000000049454e44ae426082",
		"fmt-14-signature-id-123.pdf": "255044462d312e302525454f46",
		"fmt-1-signature-id-1032.wav": "" +
			"524946460000000057415645000000000000000000000000000000000000" +
			"000062657874000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"000000000000000000000000000000000000000000000000000000000000" +
			"00000000000000000000000000000000000000000000000000000000",
		"fmt-5-signature-id-51.avi": "" +
			"524946460000000041564920000000000000000000000000000000000000" +
			"00004c495354000000006864726c61766968000000000000000000000000" +
			"00000000000000004c495354000000006d6f7669",
		"fmt-3-signature-id-18.gif": "4749463837613b",
		// Three bytes (ba df 00) that are expected to match nothing. This was
		// previously written as the odd-length string "badf00d", which
		// decoded to these same three bytes only because the length error
		// was discarded.
		"badf00d.unknown": "badf00",
	}
	for name, encoded := range files {
		data, err := hex.DecodeString(encoded)
		if err != nil {
			panic("makeSkeletons: invalid hex for " + name + ": " + err.Error())
		}
		skeletons[name] = &fstest.MapFile{Data: data}
	}
}

// pronomIDs lists the result expected for each skeleton file. Values are
// positional, in this order:
// identifier, PUID, label, version, MIME, types, details, error.
var pronomIDs = []pronomIdentificationTests{
	{"pronom", "UNKNOWN", "", "", "", "", "", "no match"},
	{"pronom", "fmt/1", "Broadcast WAVE", "0 Generic", "audio/x-wav", "Audio", "extension match wav; byte match at [[0 12] [32 356]]", ""},
	{"pronom", "fmt/11", "Portable Network Graphics", "1.0", "image/png", "Image (Raster)", "extension match png; byte match at [[0 16] [16 12]]", ""},
	{"pronom", "fmt/14", "Acrobat PDF 1.0 - Portable Document Format", "1.0", "application/pdf", "Page Description", "extension match pdf; byte match at [[0 8] [8 5]]", ""},
	{"pronom", "fmt/3", "Graphics Interchange Format", "87a", "image/gif", "Image (Raster)", "extension match gif; byte match at [[0 6] [6 1]]", ""},
	{"pronom", "fmt/5", "Audio/Video Interleaved Format", "", "video/x-msvideo", "Audio, Video", "extension match avi; byte match at [[0 12] [32 16] [68 12]]", ""},
}

// TestPronom checks that PRONOM identification results for a minimized
// PRONOM dataset are correct and contain the information we anticipate.
//
// It builds an identifier limited to minimalPronom, identifies every
// in-memory skeleton file, and compares the sorted results against the
// sorted pronomIDs table.
func TestPronom(t *testing.T) {
	sf := siegfried.New()
	config.SetHome(DataPath)
	// Reset the shared identifier configuration however the test exits, so
	// the limit set below cannot leak out after an early failure.
	defer config.Clear()()
	identifier, err := pronom.New(config.SetLimit(minimalPronom))
	if err != nil {
		// Nothing below can work without an identifier.
		t.Fatalf("Error creating new PRONOM identifier: %s", err)
	}
	sf.Add(identifier)
	makeSkeletons()
	skeletonFS := fstest.MapFS(skeletons)
	testDirListing, err := skeletonFS.ReadDir(".")
	if err != nil {
		t.Fatalf("Error reading test files directory: %s", err)
	}
	const resultLen = 8
	results := make([]pronomIdentificationTests, 0, len(testDirListing))
	for _, entry := range testDirListing {
		name := entry.Name()
		reader, err := skeletonFS.Open(name)
		if err != nil {
			t.Fatalf("Error opening test file %s: %s", name, err)
		}
		res, err := sf.Identify(reader, filepath.Join(".", name), "")
		reader.Close()
		if len(res) == 0 {
			t.Fatalf("No identification returned for %s: %v", name, err)
		}
		if err != nil {
			// The per-file outcome is asserted through the error column of
			// the result, so an accompanying error is only logged here.
			t.Logf("Identify reported an error for %s: %s", name, err)
		}
		result := res[0].Values()
		if len(result) != resultLen {
			// Fatal rather than Error: the indexing below would panic.
			t.Fatalf("Result len: %d not %d", len(result), resultLen)
		}
		results = append(results, pronomIdentificationTests{
			result[0], // identifier
			result[1], // PUID
			result[2], // label
			result[3], // version
			result[4], // mime
			result[5], // types
			result[6], // details
			result[7], // error
		})
	}
	// Sort expected results and received results to make them
	// comparable.
	sort.Slice(pronomIDs, func(i, j int) bool {
		return pronomIDs[i].puid < pronomIDs[j].puid
	})
	sort.Slice(results, func(i, j int) bool {
		return results[i].puid < results[j].puid
	})
	// A count mismatch would otherwise either hide a missing result or
	// index past the end of the expectations.
	if len(results) != len(pronomIDs) {
		t.Fatalf("Expected %d results; got %d", len(pronomIDs), len(results))
	}
	// Compare results on a result by result basis.
	for idx, res := range results {
		if !reflect.DeepEqual(res, pronomIDs[idx]) {
			t.Errorf("Results not equal for %s; expected %v; got %v", res.puid, pronomIDs[idx], res)
		}
	}
}
2 changes: 1 addition & 1 deletion cmd/sf/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ func handleIdentify(w http.ResponseWriter, r *http.Request, s *siegfried.Siegfri
err = identify(ctxts, path, "", coerr, nrec, d, gf)
wg.Wait()
wr.Tail()
if _, ok := err.(WalkError); ok { // only dump out walk errors, other errors reported in result
if _, ok := err.(walkError); ok { // only dump out walk errors, other errors reported in result
io.WriteString(w, err.Error())
}
}
Expand Down
10 changes: 5 additions & 5 deletions cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ var (
ctxPool *sync.Pool
)

type ModeError os.FileMode
type modeError os.FileMode

func (me ModeError) Error() string {
func (me modeError) Error() string {
typ := "unknown"
switch {
case os.FileMode(me)&os.ModeDir == os.ModeDir:
Expand All @@ -95,12 +95,12 @@ func (me ModeError) Error() string {
return fmt.Sprintf("file is of type %s; only regular files can be scanned", typ)
}

type WalkError struct {
type walkError struct {
path string
err error
}

func (we WalkError) Error() string {
func (we walkError) Error() string {
return fmt.Sprintf("[FATAL] file access error for %s: %v", we.path, we.err)
}

Expand Down Expand Up @@ -432,7 +432,7 @@ func main() {
case *jsono:
w = writer.JSON(os.Stdout)
case *droido:
if !*replay && (len(s.Fields()) != 1 || len(s.Fields()[0]) != 7) {
if !*replay && (len(s.Fields()) != 1 || len(s.Fields()[0]) < 7) {
close(ctxts)
log.Fatalln("[FATAL] DROID output is limited to signature files with a single PRONOM identifier")
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,8 @@ func Clear() func() private {
return func() private {
identifier.name = ""
identifier.extend = nil
identifier.limit = nil
identifier.exclude = nil
loc.fdd = ""
mimeinfo.mi = ""
return private{}
Expand Down
22 changes: 19 additions & 3 deletions pkg/config/pronom.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var pronom = struct {
droid string // name of droid file e.g. DROID_SignatureFile_V78.xml
container string // e.g. container-signature-19770502.xml
reports string // directory where PRONOM reports are stored
noclass bool // omit class from the format info
doubleup bool // include byte signatures for formats that also have container signatures
extendc []string //container extensions
changesURL string
Expand Down Expand Up @@ -63,7 +64,7 @@ var pronom = struct {

// GETTERS

// DROID returns the location of the DROID signature file.
// Droid returns the location of the DROID signature file.
// If not set, infers the latest file.
func Droid() string {
if pronom.droid == "" {
Expand All @@ -79,7 +80,7 @@ func Droid() string {
return pronom.droid
}

// DROID base returns the base filename of the DROID signature file.
// DroidBase returns the base filename of the DROID signature file.
// If not set, infers the latest file.
func DroidBase() string {
if pronom.droid == "" {
Expand Down Expand Up @@ -163,6 +164,11 @@ func Reports() string {
return filepath.Join(siegfried.home, pronom.reports)
}

// NoClass reports whether the noclass flag has been set. When set, format
// classes are omitted from format infos.
func NoClass() bool {
return pronom.noclass
}

// DoubleUp reports whether the doubleup flag has been set. This will cause byte signatures to be built for formats where container signatures are also provided.
func DoubleUp() bool {
return pronom.doubleup
Expand All @@ -173,11 +179,12 @@ func ExcludeDoubles(puids, cont []string) []string {
return exclude(puids, cont)
}

// Extend reports whether a set of container signature extensions has been provided.
// ExtendC reports whether a set of container signature extensions has been provided.
func ExtendC() []string {
return extensionPaths(pronom.extendc)
}

// ChangesURL returns the URL for the PRONOM release notes.
func ChangesURL() string {
return pronom.changesURL
}
Expand Down Expand Up @@ -225,6 +232,14 @@ func SetNoReports() func() private {
}
}

// SetNoClass returns an option that sets the noclass flag, causing class to
// be omitted from the format info.
func SetNoClass() func() private {
return func() private {
pronom.noclass = true
return private{}
}
}

// SetDoubleUp causes byte signatures to be built for formats where container signatures are also provided.
func SetDoubleUp() func() private {
return func() private {
Expand All @@ -248,6 +263,7 @@ func SetHarvestTimeout(d time.Duration) {
pronom.harvestTimeout = d
}

// SetHarvestThrottle sets a throttle value for downloading DROID reports.
func SetHarvestThrottle(d time.Duration) {
pronom.harvestThrottle = d
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/mimeinfo/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ func (r *Recorder) Satisfied(mt core.MatcherType) (bool, core.Hint) {
sort.Sort(r.ids)
if len(r.ids) > 0 && (r.ids[0].xmlMatch || (r.ids[0].magicScore > 0 && r.ids[0].ID != config.TextMIME())) {
if mt == core.ByteMatcher {
return true, core.Hint{r.Start(mt), nil}
return true, core.Hint{Exclude: r.Start(mt), Pivot: nil}
}
return true, core.Hint{}
}
Expand Down
Loading