From a1e4192c375190a3941f972d2bd8ad32d0d3947a Mon Sep 17 00:00:00 2001 From: Richard Lehane Date: Sat, 6 Dec 2014 12:35:44 +1100 Subject: [PATCH] works a bit --- README.md | 2 +- msoleps.go | 146 +++++++++-- msoleps_test.go | 13 +- property.go | 59 ++++- sets/sets.go | 8 - sets/summaryInformation.go | 27 -- types/currency.go | 14 + types/date.go | 14 + types/decimal.go | 14 + types/filetime.go | 14 + types/guid.go | 14 + types/numeric.go | 14 + types/strings.go | 520 +++++++++++++++++++------------------ types/types.go | 14 + 14 files changed, 551 insertions(+), 322 deletions(-) delete mode 100644 sets/sets.go delete mode 100644 sets/summaryInformation.go diff --git a/README.md b/README.md index b496900..abec33e 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,6 @@ Example usage: Install with `go get github.com/richardlehane/msoleps` -*I'm being developed and am not yet ready...* +*Status: currently works for simple property sets like SummaryInformation. Not all types are implemented yet (e.g. Vector, Array). Property set bags are not implemented yet.* [![Build Status](https://travis-ci.org/richardlehane/msoleps.png?branch=master)](https://travis-ci.org/richardlehane/msoleps) \ No newline at end of file diff --git a/msoleps.go b/msoleps.go index 29b9e25..23e833c 100644 --- a/msoleps.go +++ b/msoleps.go @@ -1,3 +1,40 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +// Package msoleps implements a reader for Microsoft OLE Property Set Data structures +// (http://msdn.microsoft.com/en-au/library/dd942421.aspx), a generic persistence format +// for simple typed metadata. + +// Example: +// file, _ := os.Open("test/test.doc") +// defer file.Close() +// doc, err := mscfb.NewReader(file) +// if err != nil { +// log.Fatal(err) +// } +// props := msoleps.New() +// for entry, err := doc.Next(); err == nil; entry, err = doc.Next() { +// if msoleps.IsMSOLEPS(entry.Initial) { +// if oerr := props.Reset(doc); oerr != nil { +// log.Fatal(oerr) +// } +// for _, prop := range props.Property { +// fmt.Printf("Name: %s; Type: %s; Value: %v\n", prop.Name, prop.Type(), prop) +// } +// } +// } + package msoleps import ( @@ -6,7 +43,6 @@ import ( "errors" "io" - "github.com/richardlehane/msoleps/sets" "github.com/richardlehane/msoleps/types" ) @@ -16,7 +52,7 @@ var ( ErrSeek = errors.New("msoleps: can't seek backwards") ) -// check the first uint16 of an MSCFB name to see if this is a MSOLEPS stream +// IsMSOLEPS checks the first uint16 character of an mscfb name to test if it is a MSOLEPS stream func IsMSOLEPS(i uint16) bool { if i == 0x0005 { return true @@ -24,15 +60,14 @@ func IsMSOLEPS(i uint16) bool { return false } +// Reader is a reader for MS OLE Property Set Data structures type Reader struct { Property []*Property - CLSID types.Guid - SystemID uint32 - b *bytes.Buffer - buf []byte - pSetStream *propertySetStream - pSets [2]*propertySet + b *bytes.Buffer + buf []byte + *propertySetStream + pSets [2]*propertySet } func New() *Reader { @@ -58,24 +93,26 @@ func (r *Reader) start(rdr io.Reader) error { } r.buf = r.b.Bytes() // read the header (property stream details) - r.pSetStream = &propertySetStream{} - if err := binary.Read(r.b, binary.LittleEndian, r.pSetStream); err != nil { - return ErrRead + pss, err := makePropertySetStream(r.buf) + if err != nil { + return err } // sanity checks to find obvious errors switch { - case
r.pSetStream.ByteOrder != 0xFFFE, r.pSetStream.Version > 0x0001, r.pSetStream.NumPropertySets > 0x00000002: + case pss.byteOrder != 0xFFFE, pss.version > 0x0001, pss.numPropertySets > 0x00000002: return ErrFormat } + r.propertySetStream = pss // identify the property identifiers and offsets - ps, err := r.getPropertySet(r.pSetStream.OffsetA) + ps, err := r.getPropertySet(pss.offsetA) if err != nil { return err } plen := len(ps.idsOffs) r.pSets[0] = ps - if r.pSetStream.NumPropertySets == 2 { - psb, err := r.getPropertySet(r.pSetStream.OffsetB) + var psb *propertySet + if pss.numPropertySets == 2 { + psb, err = r.getPropertySet(pss.offsetB) if err != nil { return err } @@ -83,10 +120,8 @@ func (r *Reader) start(rdr io.Reader) error { plen += len(psb.idsOffs) } r.Property = make([]*Property, plen) - var dict map[uint32]string - if r.pSetStream.FmtidA == types.MustGuidFromString("{F29F85E0-4FF9-1068-AB91-08002B27B3D9}") { - dict = sets.SummaryInformation.Dict - } else { + dict, ok := propertySets[pss.fmtidA] + if !ok { dict = ps.dict if dict == nil { dict = make(map[uint32]string) @@ -95,7 +130,7 @@ func (r *Reader) start(rdr io.Reader) error { for i, v := range ps.idsOffs { r.Property[i] = &Property{} r.Property[i].Name = dict[v.id] - t, _ := types.Evaluate(r.buf[int(v.offset+r.pSetStream.OffsetA):]) + t, _ := types.Evaluate(r.buf[int(v.offset+pss.offsetA):]) if t.Type() == "CodeString" { cs := t.(*types.CodeString) cs.SetId(ps.code) @@ -103,6 +138,28 @@ func (r *Reader) start(rdr io.Reader) error { } r.Property[i].T = t } + if pss.numPropertySets != 2 { + return nil + } + dict, ok = propertySets[pss.fmtidB] + if !ok { + dict = psb.dict + if dict == nil { + dict = make(map[uint32]string) + } + } + for i, v := range psb.idsOffs { + i += len(ps.idsOffs) + r.Property[i] = &Property{} + r.Property[i].Name = dict[v.id] + t, _ := types.Evaluate(r.buf[int(v.offset+pss.offsetB):]) + if t.Type() == "CodeString" { + cs := t.(*types.CodeString) + cs.SetId(psb.code) + t = 
types.Type(cs) + } + r.Property[i].T = t + } return nil } @@ -125,7 +182,54 @@ func (r *Reader) getPropertySet(o uint32) (*propertySet, error) { } } if dictOff > 0 { - dictOff++ // just letting it compile - unfinished bit + var err error + pSet.dict, err = r.getDictionary(dictOff+o, pSet.code) + if err != nil { + return nil, err + } } return pSet, nil } + +func (r *Reader) getDictionary(o uint32, code types.CodePageID) (map[uint32]string, error) { + b := r.buf[int(o):] + e := 4 + if len(b) < e { + return nil, ErrFormat + } + num := int(binary.LittleEndian.Uint32(b[:e])) + if num == 0 { + return nil, nil + } + dict := make(map[uint32]string) + for i := 0; i < num; i++ { + if len(b[e:]) < 8 { + return nil, ErrFormat + } + id, l := binary.LittleEndian.Uint32(b[e:e+4]), binary.LittleEndian.Uint32(b[e+4:e+8]) + var s types.Type + var err error + if code == 0x04B0 { + var pad int + if l%2 != 0 { + pad = 2 + } + s, err = types.MakeUnicode(b[e+4:]) + if err != nil { + return nil, ErrFormat + } + e = e + 8 + pad + int(l)*2 + } else { + s, err = types.MakeCodeString(b[e+4:]) + if err != nil { + return nil, ErrFormat + } + cs := s.(*types.CodeString) + cs.SetId((code)) + s = cs + e = e + 8 + int(l) + } + dict[id] = s.String() + } + return dict, nil +} diff --git a/msoleps_test.go b/msoleps_test.go index 1d452b0..808e809 100644 --- a/msoleps_test.go +++ b/msoleps_test.go @@ -1,7 +1,6 @@ package msoleps import ( - "fmt" "os" "testing" ) @@ -19,9 +18,6 @@ func testFile(t *testing.T, path string) *Reader { if err != nil { t.Errorf("Error opening file; Returns error: ", err) } - for _, prop := range doc.Property { - fmt.Printf("%s: %s\n", prop.Name, prop) - } return doc } @@ -30,6 +26,9 @@ func TestDocSum(t *testing.T) { if len(doc.Property) != 12 { t.Error("Expecting 12 properties, got %d", len(doc.Property)) } + if doc.Property[1].String() != "Australian Broadcasting Corporation" { + t.Errorf("Expecting 'ABC' as second property, got %s", doc.Property[1]) + } } func TestSum(t 
*testing.T) { @@ -37,6 +36,9 @@ func TestSum(t *testing.T) { if len(doc.Property) != 17 { t.Error("Expecting 17 properties, got %d", len(doc.Property)) } + if doc.Property[5].String() != "Normal" { + t.Errorf("Expecting 'Normal' as sixth property, got %s", doc.Property[5]) + } } func TestSum1(t *testing.T) { @@ -44,4 +46,7 @@ func TestSum1(t *testing.T) { if len(doc.Property) != 3 { t.Error("Expecting 3 properties, got %d", len(doc.Property)) } + if doc.Property[0].String() != "Mail" { + t.Errorf("Expecting 'Mail' as first property, got %s", doc.Property[0]) + } } diff --git a/property.go b/property.go index c16aaa7..c1be961 100644 --- a/property.go +++ b/property.go @@ -1,6 +1,23 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package msoleps -import "github.com/richardlehane/msoleps/types" +import ( + "encoding/binary" + "github.com/richardlehane/msoleps/types" +) type Property struct { Name string @@ -16,15 +33,41 @@ func (p *Property) Type() string { } type propertySetStream struct { - ByteOrder uint16 - Version uint16 + byteOrder uint16 + version uint16 SystemID uint32 CLSID types.Guid - NumPropertySets uint32 - FmtidA types.Guid - OffsetA uint32 - FmtidB types.Guid - OffsetB uint32 + numPropertySets uint32 + fmtidA types.Guid + offsetA uint32 + fmtidB types.Guid // This can be absent (i.e. 
not null) + offsetB uint32 +} + +func makePropertySetStream(b []byte) (*propertySetStream, error) { + if len(b) < 48 { + return nil, ErrFormat + } + ps := &propertySetStream{} + ps.byteOrder = binary.LittleEndian.Uint16(b[:2]) + ps.version = binary.LittleEndian.Uint16(b[2:4]) + ps.SystemID = binary.LittleEndian.Uint32(b[4:8]) + g, _ := types.MakeGuid(b[8:]) + ps.CLSID = g.(types.Guid) + ps.numPropertySets = binary.LittleEndian.Uint32(b[24:28]) + g, _ = types.MakeGuid(b[28:]) + ps.fmtidA, _ = g.(types.Guid) + ps.offsetA = binary.LittleEndian.Uint32(b[44:48]) + if ps.numPropertySets != 2 { + return ps, nil + } + if len(b) < 68 { + return nil, ErrFormat + } + g, _ = types.MakeGuid(b[48:]) + ps.fmtidB = g.(types.Guid) + ps.offsetB = binary.LittleEndian.Uint32(b[64:68]) + return ps, nil } type propertySet struct { diff --git a/sets/sets.go b/sets/sets.go deleted file mode 100644 index 2f71318..0000000 --- a/sets/sets.go +++ /dev/null @@ -1,8 +0,0 @@ -package sets - -import "github.com/richardlehane/msoleps/types" - -type PropertySetDef struct { - FMTID types.Guid - Dict map[uint32]string -} diff --git a/sets/summaryInformation.go b/sets/summaryInformation.go deleted file mode 100644 index b741c40..0000000 --- a/sets/summaryInformation.go +++ /dev/null @@ -1,27 +0,0 @@ -package sets - -import "github.com/richardlehane/msoleps/types" - -var SummaryInformation = PropertySetDef{ - types.MustGuidFromString("{F29F85E0-4FF9-1068-AB91-08002B27B3D9}"), - map[uint32]string{ - 0x00000002: "Title", - 0x00000003: "Subject", - 0x00000004: "Author", - 0x00000005: "Keywords", - 0x00000006: "Comments", - 0x00000007: "Template", - 0x00000008: "LastAuthor", - 0x00000009: "RevNumber", - 0x0000000A: "EditTime", - 0x0000000B: "LastPrinted", - 0x0000000C: "CreateTime", - 0x0000000D: "LastSaveTime", - 0x0000000E: "PageCount", - 0x0000000F: "WordCount", - 0x00000010: "CharCount", - 0x00000011: "Thumbnail", - 0x00000012: "AppName", - 0x00000013: "DocSecurity", - }, -} diff --git 
a/types/currency.go b/types/currency.go index 9f0b646..9be65b2 100644 --- a/types/currency.go +++ b/types/currency.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package types import ( diff --git a/types/date.go b/types/date.go index 6adca11..d161334 100644 --- a/types/date.go +++ b/types/date.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package types import ( diff --git a/types/decimal.go b/types/decimal.go index ab9ce6f..937129f 100644 --- a/types/decimal.go +++ b/types/decimal.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package types import ( diff --git a/types/filetime.go b/types/filetime.go index 120c4ac..34fc310 100644 --- a/types/filetime.go +++ b/types/filetime.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package types import ( diff --git a/types/guid.go b/types/guid.go index e6d5ba5..e43efb2 100644 --- a/types/guid.go +++ b/types/guid.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package types import ( diff --git a/types/numeric.go b/types/numeric.go index 62233b2..246b765 100644 --- a/types/numeric.go +++ b/types/numeric.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package types import ( diff --git a/types/strings.go b/types/strings.go index 951750b..e74b54c 100644 --- a/types/strings.go +++ b/types/strings.go @@ -1,253 +1,267 @@ -package types - -import ( - "encoding/binary" - "strings" - "unicode/utf16" -) - -func nullTerminated(s string) string { - return s[:strings.Index(s, "\x00")] -} - -type UnicodeString struct { - Length uint32 - Chars []uint16 -} - -func (s UnicodeString) Type() string { - return "UnicodeString" -} - -func (s UnicodeString) String() string { - if len(s.Chars) == 0 { - return "" - } - return nullTerminated(string(utf16.Decode(s.Chars))) -} - -func MakeUnicode(b []byte) (Type, error) { - if len(b) < 4 { - return UnicodeString{}, ErrType - } - s := UnicodeString{} - s.Length = binary.LittleEndian.Uint32(b[:4]) - if s.Length == 0 { - return s, nil - } - if len(b) < int(s.Length)+4 { - return UnicodeString{}, ErrType - } - s.Chars = make([]uint16, int(s.Length)) - for i := range s.Chars { - start := i*2 + 4 - s.Chars[i] = binary.LittleEndian.Uint16(b[start : start+2]) - } - return s, nil -} - -type CodeString struct { - id CodePageID - Length uint32 - Chars []byte -} - -func (s *CodeString) SetId(i CodePageID) { - s.id = i -} - 
-func (s *CodeString) Encoding() string { - return CodePageIDs[s.id] -} - -func (s *CodeString) Type() string { - return "CodeString" -} - -func (s *CodeString) String() string { - if len(s.Chars) == 0 { - return "" - } - if s.id == 1200 { - chars := make([]uint16, len(s.Chars)/2) - for i := range chars { - chars[i] = binary.LittleEndian.Uint16(s.Chars[i*2 : i*2+2]) - } - return nullTerminated(string(utf16.Decode(chars))) - } - return nullTerminated(string(s.Chars)) -} - -func MakeCodeString(b []byte) (Type, error) { - if len(b) < 4 { - return &CodeString{}, ErrType - } - s := &CodeString{} - s.Length = binary.LittleEndian.Uint32(b[:4]) - if s.Length == 0 { - return s, nil - } - if len(b) < int(s.Length)+4 { - return s, ErrType - } - s.Chars = make([]byte, int(s.Length)) - copy(s.Chars, b[4:int(s.Length)+4]) - return s, nil -} - -type CodePageID uint16 - -var CodePageIDs map[CodePageID]string = map[CodePageID]string{ - 37: "IBM037 - IBM EBCDIC US-Canada", - 437: "IBM437 - OEM United States", - 500: "IBM500 - IBM EBCDIC International", - 708: "ASMO-708 - Arabic (ASMO 708)", - 709: "Arabic (ASMO-449+, BCON V4)", - 710: "Arabic - Transparent Arabic", - 720: "DOS-720 - Arabic (Transparent ASMO); Arabic (DOS)", - 737: "ibm737 - OEM Greek (formerly 437G); Greek (DOS)", - 775: "ibm775 - OEM Baltic; Baltic (DOS)", - 850: "ibm850 - OEM Multilingual Latin 1; Western European (DOS)", - 852: "ibm852 - OEM Latin 2; Central European (DOS)", - 855: "IBM855 - OEM Cyrillic (primarily Russian)", - 857: "ibm857 - OEM Turkish; Turkish (DOS)", - 858: "IBM00858 - OEM Multilingual Latin 1 + Euro symbol", - 860: "IBM860 - OEM Portuguese; Portuguese (DOS)", - 861: "ibm861 - OEM Icelandic; Icelandic (DOS)", - 862: "DOS-862 - OEM Hebrew; Hebrew (DOS)", - 863: "IBM863 - OEM French Canadian; French Canadian (DOS)", - 864: "IBM864 - OEM Arabic; Arabic (864)", - 865: "IBM865 - OEM Nordic; Nordic (DOS)", - 866: "cp866 - OEM Russian; Cyrillic (DOS)", - 869: "ibm869 - OEM Modern Greek; Greek, 
Modern (DOS)", - 870: "IBM870 - IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2", - 874: "windows-874 - ANSI/OEM Thai (ISO 8859-11); Thai (Windows)", - 875: "cp875 - IBM EBCDIC Greek Modern", - 932: "shift_jis - ANSI/OEM Japanese; Japanese (Shift-JIS)", - 936: "gb2312 - ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)", - 949: "ks_c_5601-1987 - ANSI/OEM Korean (Unified Hangul Code)", - 950: "big5 - ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)", - 1026: "IBM1026 - IBM EBCDIC Turkish (Latin 5)", - 1047: "IBM01047 - BM EBCDIC Latin 1/Open System", - 1140: "IBM01140 - IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)", - 1141: "IBM01141 - IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)", - 1142: "IBM01142 - IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)", - 1143: "IBM01143 - IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)", - 1144: "IBM01144 - IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)", - 1145: "IBM01145 - IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)", - 1146: "IBM01146 - IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)", - 1147: "IBM01147 - IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)", - 1148: "IBM01148 - IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)", - 1149: "IBM01149 - IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)", - 1200: "utf-16 - Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications", - 1201: "unicodeFFFE - Unicode UTF-16, big endian byte order; available only to managed applications", - 1250: "windows-1250 - ANSI Central European; Central European (Windows)", - 1251: "windows-1251 - ANSI Cyrillic; Cyrillic (Windows)", - 1252: "windows-1252 - ANSI Latin 1; Western 
European (Windows)", - 1253: "windows-1253 - ANSI Greek; Greek (Windows)", - 1254: "windows-1254 - ANSI Turkish; Turkish (Windows)", - 1255: "windows-1255 - ANSI Hebrew; Hebrew (Windows)", - 1256: "windows-1256 - ANSI Arabic; Arabic (Windows)", - 1257: "windows-1257 - ANSI Baltic; Baltic (Windows)", - 1258: "windows-1258 - ANSI/OEM Vietnamese; Vietnamese (Windows)", - 1361: "Johab - Korean (Johab)", - 10000: "macintosh - MAC Roman; Western European (Mac)", - 10001: "x-mac-japanese - Japanese (Mac)", - 10002: "x-mac-chinesetrad - MAC Traditional Chinese (Big5); Chinese Traditional (Mac)", - 10003: "x-mac-korean - Korean (Mac)", - 10004: "x-mac-arabic - Arabic (Mac)", - 10005: "x-mac-hebrew - Hebrew (Mac)", - 10006: "x-mac-greek - Greek (Mac)", - 10007: "x-mac-cyrillic - Cyrillic (Mac)", - 10008: "x-mac-chinesesimp - MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)", - 10010: "x-mac-romanian - Romanian (Mac)", - 10017: "x-mac-ukrainian - Ukrainian (Mac)", - 10021: "x-mac-thai - Thai (Mac)", - 10029: "x-mac-ce - MAC Latin 2; Central European (Mac)", - 10079: "x-mac-icelandic - Icelandic (Mac)", - 10081: "x-mac-turkish - Turkish (Mac)", - 10082: "x-mac-croatian - Croatian (Mac)", - 12000: "utf-32 - Unicode UTF-32, little endian byte order; available only to managed applications", - 12001: "utf-32BE - Unicode UTF-32, big endian byte order; available only to managed applications", - 20000: "x-Chinese_CNS - CNS Taiwan; Chinese Traditional (CNS)", - 20001: "x-cp20001 - TCA Taiwan", - 20002: "x_Chinese-Eten - Eten Taiwan; Chinese Traditional (Eten)", - 20003: "x-cp20003 - IBM5550 Taiwan", - 20004: "x-cp20004 - TeleText Taiwan", - 20005: "x-cp20005 - Wang Taiwan", - 20105: "x-IA5 - IA5 (IRV International Alphabet No. 
5, 7-bit); Western European (IA5)", - 20106: "x-IA5-German - IA5 German (7-bit)", - 20107: "x-IA5-Swedish - IA5 Swedish (7-bit)", - 20108: "x-IA5-Norwegian - IA5 Norwegian (7-bit)", - 20127: "us-ascii - US-ASCII (7-bit)", - 20261: "x-cp20261 - T.61", - 20269: "x-cp20269 - ISO 6937 Non-Spacing Accent", - 20273: "IBM273 - IBM EBCDIC Germany", - 20277: "IBM277 - IBM EBCDIC Denmark-Norway", - 20278: "IBM278 - IBM EBCDIC Finland-Sweden", - 20280: "IBM280 - IBM EBCDIC Italy", - 20284: "IBM284 - IBM EBCDIC Latin America-Spain", - 20285: "IBM285 - IBM EBCDIC United Kingdom", - 20290: "IBM290 - IBM EBCDIC Japanese Katakana Extended", - 20297: "IBM297 - IBM EBCDIC France", - 20420: "IBM420 - IBM EBCDIC Arabic", - 20423: "IBM423 - IBM EBCDIC Greek", - 20424: "IBM424 - IBM EBCDIC Hebrew", - 20833: "x-EBCDIC-KoreanExtended - IBM EBCDIC Korean Extended", - 20838: "IBM-Thai - IBM EBCDIC Thai", - 20866: "koi8-r - Russian (KOI8-R); Cyrillic (KOI8-R)", - 20871: "IBM871 - IBM EBCDIC Icelandic", - 20880: "IBM880 - IBM EBCDIC Cyrillic Russian", - 20905: "IBM905 - IBM EBCDIC Turkish", - 20924: "IBM00924 - IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)", - 20932: "EUC-JP - Japanese (JIS 0208-1990 and 0212-1990)", - 20936: "x-cp20936 - Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)", - 20949: "x-cp20949 - Korean Wansung", - 21025: "cp1025 - IBM EBCDIC Cyrillic Serbian-Bulgarian", - 21027: "(deprecated)", - 21866: "koi8-u - Ukrainian (KOI8-U); Cyrillic (KOI8-U)", - 28591: "iso-8859-1 - ISO 8859-1 Latin 1; Western European (ISO)", - 28592: "iso-8859-2 - ISO 8859-2 Central European; Central European (ISO)", - 28593: "iso-8859-3 - ISO 8859-3 Latin 3", - 28594: "iso-8859-4 - ISO 8859-4 Baltic", - 28595: "iso-8859-5 - ISO 8859-5 Cyrillic", - 28596: "iso-8859-6 - ISO 8859-6 Arabic", - 28597: "iso-8859-7 - ISO 8859-7 Greek", - 28598: "iso-8859-8 - ISO 8859-8 Hebrew; Hebrew (ISO-Visual)", - 28599: "iso-8859-9 - ISO 8859-9 Turkish", - 28603: "iso-8859-13 - ISO 8859-13 Estonian", - 
28605: "iso-8859-15 - ISO 8859-15 Latin 9", - 29001: "x-Europa - Europa 3", - 38598: "iso-8859-8-i - ISO 8859-8 Hebrew; Hebrew (ISO-Logical)", - 50220: "iso-2022-jp - ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)", - 50221: "csISO2022JP - ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)", - 50222: "iso-2022-jp - ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)", - 50225: "iso-2022-kr - ISO 2022 Korean", - 50227: "x-cp50227 - ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)", - 50229: "ISO 2022 - Traditional Chinese", - 50930: "EBCDIC - Japanese (Katakana) Extended", - 50931: "EBCDIC - US-Canada and Japanese", - 50933: "EBCDIC - Korean Extended and Korean", - 50935: "EBCDIC - Simplified Chinese Extended and Simplified Chinese", - 50936: "EBCDIC - Simplified Chinese", - 50937: "EBCDIC - US-Canada and Traditional Chinese", - 50939: "EBCDIC - Japanese (Latin) Extended and Japanese", - 51932: "euc-jp - EUC Japanese", - 51936: "EUC-CN - EUC Simplified Chinese; Chinese Simplified (EUC)", - 51949: "euc-kr - EUC Korean", - 51950: "EUC - Traditional Chinese", - 52936: "hz-gb-2312 - HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)", - 54936: "GB18030 - Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)", - 57002: "x-iscii-de - ISCII Devanagari", - 57003: "x-iscii-be - ISCII Bengali", - 57004: "x-iscii-ta - ISCII Tamil", - 57005: "x-iscii-te - ISCII Telugu", - 57006: "x-iscii-as - ISCII Assamese", - 57007: "x-iscii-or - ISCII Oriya", - 57008: "x-iscii-ka - ISCII Kannada", - 57009: "x-iscii-ma - ISCII Malayalam", - 57010: "x-iscii-gu - ISCII Gujarati", - 57011: "x-iscii-pa - ISCII Punjabi", - 65000: "utf-7 - Unicode (UTF-7)", - 65001: "utf-8 - Unicode (UTF-8)", -} +// Copyright 2014 Richard Lehane. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package types + +import ( + "encoding/binary" + "strings" + "unicode/utf16" +) + +func nullTerminated(s string) string { + return s[:strings.Index(s, "\x00")] +} + +type UnicodeString struct { + Length uint32 + Chars []uint16 +} + +func (s UnicodeString) Type() string { + return "UnicodeString" +} + +func (s UnicodeString) String() string { + if len(s.Chars) == 0 { + return "" + } + return nullTerminated(string(utf16.Decode(s.Chars))) +} + +func MakeUnicode(b []byte) (Type, error) { + if len(b) < 4 { + return UnicodeString{}, ErrType + } + s := UnicodeString{} + s.Length = binary.LittleEndian.Uint32(b[:4]) + if s.Length == 0 { + return s, nil + } + if len(b) < int(s.Length)+4 { + return UnicodeString{}, ErrType + } + s.Chars = make([]uint16, int(s.Length)) + for i := range s.Chars { + start := i*2 + 4 + s.Chars[i] = binary.LittleEndian.Uint16(b[start : start+2]) + } + return s, nil +} + +type CodeString struct { + id CodePageID + Length uint32 + Chars []byte +} + +func (s *CodeString) SetId(i CodePageID) { + s.id = i +} + +func (s *CodeString) Encoding() string { + return CodePageIDs[s.id] +} + +func (s *CodeString) Type() string { + return "CodeString" +} + +func (s *CodeString) String() string { + if len(s.Chars) == 0 { + return "" + } + if s.id == 1200 { + chars := make([]uint16, len(s.Chars)/2) + for i := range chars { + chars[i] = binary.LittleEndian.Uint16(s.Chars[i*2 : i*2+2]) + } + return nullTerminated(string(utf16.Decode(chars))) + } + return nullTerminated(string(s.Chars)) +} + +func MakeCodeString(b []byte) (Type, error) { + if 
len(b) < 4 { + return &CodeString{}, ErrType + } + s := &CodeString{} + s.Length = binary.LittleEndian.Uint32(b[:4]) + if s.Length == 0 { + return s, nil + } + if len(b) < int(s.Length)+4 { + return s, ErrType + } + s.Chars = make([]byte, int(s.Length)) + copy(s.Chars, b[4:int(s.Length)+4]) + return s, nil +} + +type CodePageID uint16 + +var CodePageIDs map[CodePageID]string = map[CodePageID]string{ + 37: "IBM037 - IBM EBCDIC US-Canada", + 437: "IBM437 - OEM United States", + 500: "IBM500 - IBM EBCDIC International", + 708: "ASMO-708 - Arabic (ASMO 708)", + 709: "Arabic (ASMO-449+, BCON V4)", + 710: "Arabic - Transparent Arabic", + 720: "DOS-720 - Arabic (Transparent ASMO); Arabic (DOS)", + 737: "ibm737 - OEM Greek (formerly 437G); Greek (DOS)", + 775: "ibm775 - OEM Baltic; Baltic (DOS)", + 850: "ibm850 - OEM Multilingual Latin 1; Western European (DOS)", + 852: "ibm852 - OEM Latin 2; Central European (DOS)", + 855: "IBM855 - OEM Cyrillic (primarily Russian)", + 857: "ibm857 - OEM Turkish; Turkish (DOS)", + 858: "IBM00858 - OEM Multilingual Latin 1 + Euro symbol", + 860: "IBM860 - OEM Portuguese; Portuguese (DOS)", + 861: "ibm861 - OEM Icelandic; Icelandic (DOS)", + 862: "DOS-862 - OEM Hebrew; Hebrew (DOS)", + 863: "IBM863 - OEM French Canadian; French Canadian (DOS)", + 864: "IBM864 - OEM Arabic; Arabic (864)", + 865: "IBM865 - OEM Nordic; Nordic (DOS)", + 866: "cp866 - OEM Russian; Cyrillic (DOS)", + 869: "ibm869 - OEM Modern Greek; Greek, Modern (DOS)", + 870: "IBM870 - IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2", + 874: "windows-874 - ANSI/OEM Thai (ISO 8859-11); Thai (Windows)", + 875: "cp875 - IBM EBCDIC Greek Modern", + 932: "shift_jis - ANSI/OEM Japanese; Japanese (Shift-JIS)", + 936: "gb2312 - ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)", + 949: "ks_c_5601-1987 - ANSI/OEM Korean (Unified Hangul Code)", + 950: "big5 - ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional 
(Big5)", + 1026: "IBM1026 - IBM EBCDIC Turkish (Latin 5)", + 1047: "IBM01047 - BM EBCDIC Latin 1/Open System", + 1140: "IBM01140 - IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)", + 1141: "IBM01141 - IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)", + 1142: "IBM01142 - IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)", + 1143: "IBM01143 - IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)", + 1144: "IBM01144 - IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)", + 1145: "IBM01145 - IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)", + 1146: "IBM01146 - IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)", + 1147: "IBM01147 - IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)", + 1148: "IBM01148 - IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)", + 1149: "IBM01149 - IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)", + 1200: "utf-16 - Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications", + 1201: "unicodeFFFE - Unicode UTF-16, big endian byte order; available only to managed applications", + 1250: "windows-1250 - ANSI Central European; Central European (Windows)", + 1251: "windows-1251 - ANSI Cyrillic; Cyrillic (Windows)", + 1252: "windows-1252 - ANSI Latin 1; Western European (Windows)", + 1253: "windows-1253 - ANSI Greek; Greek (Windows)", + 1254: "windows-1254 - ANSI Turkish; Turkish (Windows)", + 1255: "windows-1255 - ANSI Hebrew; Hebrew (Windows)", + 1256: "windows-1256 - ANSI Arabic; Arabic (Windows)", + 1257: "windows-1257 - ANSI Baltic; Baltic (Windows)", + 1258: "windows-1258 - ANSI/OEM Vietnamese; Vietnamese (Windows)", + 1361: "Johab - Korean (Johab)", + 10000: "macintosh - MAC Roman; Western European (Mac)", + 10001: "x-mac-japanese - Japanese (Mac)", + 10002: "x-mac-chinesetrad 
- MAC Traditional Chinese (Big5); Chinese Traditional (Mac)", + 10003: "x-mac-korean - Korean (Mac)", + 10004: "x-mac-arabic - Arabic (Mac)", + 10005: "x-mac-hebrew - Hebrew (Mac)", + 10006: "x-mac-greek - Greek (Mac)", + 10007: "x-mac-cyrillic - Cyrillic (Mac)", + 10008: "x-mac-chinesesimp - MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)", + 10010: "x-mac-romanian - Romanian (Mac)", + 10017: "x-mac-ukrainian - Ukrainian (Mac)", + 10021: "x-mac-thai - Thai (Mac)", + 10029: "x-mac-ce - MAC Latin 2; Central European (Mac)", + 10079: "x-mac-icelandic - Icelandic (Mac)", + 10081: "x-mac-turkish - Turkish (Mac)", + 10082: "x-mac-croatian - Croatian (Mac)", + 12000: "utf-32 - Unicode UTF-32, little endian byte order; available only to managed applications", + 12001: "utf-32BE - Unicode UTF-32, big endian byte order; available only to managed applications", + 20000: "x-Chinese_CNS - CNS Taiwan; Chinese Traditional (CNS)", + 20001: "x-cp20001 - TCA Taiwan", + 20002: "x_Chinese-Eten - Eten Taiwan; Chinese Traditional (Eten)", + 20003: "x-cp20003 - IBM5550 Taiwan", + 20004: "x-cp20004 - TeleText Taiwan", + 20005: "x-cp20005 - Wang Taiwan", + 20105: "x-IA5 - IA5 (IRV International Alphabet No. 
5, 7-bit); Western European (IA5)", + 20106: "x-IA5-German - IA5 German (7-bit)", + 20107: "x-IA5-Swedish - IA5 Swedish (7-bit)", + 20108: "x-IA5-Norwegian - IA5 Norwegian (7-bit)", + 20127: "us-ascii - US-ASCII (7-bit)", + 20261: "x-cp20261 - T.61", + 20269: "x-cp20269 - ISO 6937 Non-Spacing Accent", + 20273: "IBM273 - IBM EBCDIC Germany", + 20277: "IBM277 - IBM EBCDIC Denmark-Norway", + 20278: "IBM278 - IBM EBCDIC Finland-Sweden", + 20280: "IBM280 - IBM EBCDIC Italy", + 20284: "IBM284 - IBM EBCDIC Latin America-Spain", + 20285: "IBM285 - IBM EBCDIC United Kingdom", + 20290: "IBM290 - IBM EBCDIC Japanese Katakana Extended", + 20297: "IBM297 - IBM EBCDIC France", + 20420: "IBM420 - IBM EBCDIC Arabic", + 20423: "IBM423 - IBM EBCDIC Greek", + 20424: "IBM424 - IBM EBCDIC Hebrew", + 20833: "x-EBCDIC-KoreanExtended - IBM EBCDIC Korean Extended", + 20838: "IBM-Thai - IBM EBCDIC Thai", + 20866: "koi8-r - Russian (KOI8-R); Cyrillic (KOI8-R)", + 20871: "IBM871 - IBM EBCDIC Icelandic", + 20880: "IBM880 - IBM EBCDIC Cyrillic Russian", + 20905: "IBM905 - IBM EBCDIC Turkish", + 20924: "IBM00924 - IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)", + 20932: "EUC-JP - Japanese (JIS 0208-1990 and 0212-1990)", + 20936: "x-cp20936 - Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)", + 20949: "x-cp20949 - Korean Wansung", + 21025: "cp1025 - IBM EBCDIC Cyrillic Serbian-Bulgarian", + 21027: "(deprecated)", + 21866: "koi8-u - Ukrainian (KOI8-U); Cyrillic (KOI8-U)", + 28591: "iso-8859-1 - ISO 8859-1 Latin 1; Western European (ISO)", + 28592: "iso-8859-2 - ISO 8859-2 Central European; Central European (ISO)", + 28593: "iso-8859-3 - ISO 8859-3 Latin 3", + 28594: "iso-8859-4 - ISO 8859-4 Baltic", + 28595: "iso-8859-5 - ISO 8859-5 Cyrillic", + 28596: "iso-8859-6 - ISO 8859-6 Arabic", + 28597: "iso-8859-7 - ISO 8859-7 Greek", + 28598: "iso-8859-8 - ISO 8859-8 Hebrew; Hebrew (ISO-Visual)", + 28599: "iso-8859-9 - ISO 8859-9 Turkish", + 28603: "iso-8859-13 - ISO 8859-13 Estonian", + 
28605: "iso-8859-15 - ISO 8859-15 Latin 9", + 29001: "x-Europa - Europa 3", + 38598: "iso-8859-8-i - ISO 8859-8 Hebrew; Hebrew (ISO-Logical)", + 50220: "iso-2022-jp - ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)", + 50221: "csISO2022JP - ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)", + 50222: "iso-2022-jp - ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)", + 50225: "iso-2022-kr - ISO 2022 Korean", + 50227: "x-cp50227 - ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)", + 50229: "ISO 2022 - Traditional Chinese", + 50930: "EBCDIC - Japanese (Katakana) Extended", + 50931: "EBCDIC - US-Canada and Japanese", + 50933: "EBCDIC - Korean Extended and Korean", + 50935: "EBCDIC - Simplified Chinese Extended and Simplified Chinese", + 50936: "EBCDIC - Simplified Chinese", + 50937: "EBCDIC - US-Canada and Traditional Chinese", + 50939: "EBCDIC - Japanese (Latin) Extended and Japanese", + 51932: "euc-jp - EUC Japanese", + 51936: "EUC-CN - EUC Simplified Chinese; Chinese Simplified (EUC)", + 51949: "euc-kr - EUC Korean", + 51950: "EUC - Traditional Chinese", + 52936: "hz-gb-2312 - HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)", + 54936: "GB18030 - Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)", + 57002: "x-iscii-de - ISCII Devanagari", + 57003: "x-iscii-be - ISCII Bengali", + 57004: "x-iscii-ta - ISCII Tamil", + 57005: "x-iscii-te - ISCII Telugu", + 57006: "x-iscii-as - ISCII Assamese", + 57007: "x-iscii-or - ISCII Oriya", + 57008: "x-iscii-ka - ISCII Kannada", + 57009: "x-iscii-ma - ISCII Malayalam", + 57010: "x-iscii-gu - ISCII Gujarati", + 57011: "x-iscii-pa - ISCII Punjabi", + 65000: "utf-7 - Unicode (UTF-7)", + 65001: "utf-8 - Unicode (UTF-8)", +} diff --git a/types/types.go b/types/types.go index f5fc376..d631afc 100644 --- a/types/types.go +++ b/types/types.go @@ -1,3 +1,17 @@ +// Copyright 2014 Richard Lehane. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package types import (