From bc530e985f85a877733d6b083c86119d5ca3b630 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sat, 18 Aug 2018 23:58:08 -0700 Subject: [PATCH 01/15] Replace date/time types with .NET standard types. --- src/Microsoft.ML.Api/ApiUtils.cs | 2 +- src/Microsoft.ML.Core/Data/ColumnType.cs | 10 +- src/Microsoft.ML.Core/Data/DataKind.cs | 12 +-- src/Microsoft.ML.Data/Data/Conversion.cs | 97 +++++-------------- .../DataLoadSave/Binary/CodecFactory.cs | 2 +- .../DataLoadSave/Binary/Codecs.cs | 88 ++++++----------- .../DataLoadSave/Binary/UnsafeTypeOps.cs | 12 +-- .../DataLoadSave/Text/TextSaver.cs | 30 +++--- src/Microsoft.ML.Parquet/ParquetLoader.cs | 18 ++-- src/Microsoft.ML.Transforms/NAReplaceUtils.cs | 67 +------------ src/Microsoft.ML/Data/TextLoader.cs | 6 +- .../UnitTests/CoreBaseTestClass.cs | 12 +-- .../DataPipe/TestDataPipeBase.cs | 12 +-- 13 files changed, 115 insertions(+), 253 deletions(-) diff --git a/src/Microsoft.ML.Api/ApiUtils.cs b/src/Microsoft.ML.Api/ApiUtils.cs index 96e821f16e..0270f6cfb2 100644 --- a/src/Microsoft.ML.Api/ApiUtils.cs +++ b/src/Microsoft.ML.Api/ApiUtils.cs @@ -23,7 +23,7 @@ private static OpCode GetAssignmentOpCode(Type t) t == typeof(DvBool) || t == typeof(DvText) || t == typeof(string) || t.IsArray || (t.IsGenericType && t.GetGenericTypeDefinition() == typeof(VBuffer<>)) || (t.IsGenericType && t.GetGenericTypeDefinition() == typeof(Nullable<>)) || - t == typeof(DvDateTime) || t == typeof(DvDateTimeZone) || t == typeof(DvTimeSpan) || t == typeof(UInt128)) + t == typeof(DateTime) || t == typeof(DateTimeOffset) || t == typeof(TimeSpan) || t == typeof(UInt128)) { return OpCodes.Stobj; } diff --git a/src/Microsoft.ML.Core/Data/ColumnType.cs b/src/Microsoft.ML.Core/Data/ColumnType.cs index 96764d68f1..372abe31e5 100644 --- a/src/Microsoft.ML.Core/Data/ColumnType.cs +++ b/src/Microsoft.ML.Core/Data/ColumnType.cs @@ -135,7 +135,7 @@ public bool IsTimeSpan } /// - /// Whether this type is a DvDateTime. + /// Whether this type is a DateTime. /// public bool IsDateTime { @@ -150,7 +150,7 @@ public bool IsDateTime } /// - /// Whether this type is a DvDateTimeZone. + /// Whether this type is a DateTimeOffset. /// public bool IsDateTimeZone { @@ -605,7 +605,7 @@ public static DateTimeType Instance } private DateTimeType() - : base(typeof(DvDateTime), DataKind.DT) + : base(typeof(DateTime), DataKind.DT) { } @@ -637,7 +637,7 @@ public static DateTimeZoneType Instance } private DateTimeZoneType() - : base(typeof(DvDateTimeZone), DataKind.DZ) + : base(typeof(DateTimeOffset), DataKind.DZ) { } @@ -672,7 +672,7 @@ public static TimeSpanType Instance } private TimeSpanType() - : base(typeof(DvTimeSpan), DataKind.TS) + : base(typeof(TimeSpan), DataKind.TS) { } diff --git a/src/Microsoft.ML.Core/Data/DataKind.cs b/src/Microsoft.ML.Core/Data/DataKind.cs index 32325f44a1..a8c7fd8180 100644 --- a/src/Microsoft.ML.Core/Data/DataKind.cs +++ b/src/Microsoft.ML.Core/Data/DataKind.cs @@ -165,11 +165,11 @@ public static Type ToType(this DataKind kind) case DataKind.BL: return typeof(DvBool); case DataKind.TS: - return typeof(DvTimeSpan); + return typeof(TimeSpan); case DataKind.DT: - return typeof(DvDateTime); + return typeof(DateTime); case DataKind.DZ: - return typeof(DvDateTimeZone); + return typeof(DateTimeOffset); case DataKind.UG: return typeof(UInt128); } @@ -209,11 +209,11 @@ public static bool TryGetDataKind(this Type type, out DataKind kind) kind = DataKind.TX; else if (type == typeof(DvBool) || type == typeof(bool) || type == typeof(bool?)) kind = DataKind.BL; - else if (type == typeof(DvTimeSpan)) + else if (type == typeof(TimeSpan)) kind = DataKind.TS; - else if (type == typeof(DvDateTime)) + else if (type == typeof(DateTime)) kind = DataKind.DT; - else if (type == typeof(DvDateTimeZone)) + else if (type == typeof(DateTimeOffset)) kind = DataKind.DZ; else if (type == typeof(UInt128)) kind = DataKind.UG; diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 0a9833064a..6a02c33805 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -15,8 +15,8 @@ namespace Microsoft.ML.Runtime.Data.Conversion { using BL = DvBool; - using DT = DvDateTime; - using DZ = DvDateTimeZone; + using DT = DateTime; + using DZ = DateTimeOffset; using I1 = DvInt1; using I2 = DvInt2; using I4 = DvInt4; @@ -28,7 +28,7 @@ namespace Microsoft.ML.Runtime.Data.Conversion using RawI4 = Int32; using RawI8 = Int64; using SB = StringBuilder; - using TS = DvTimeSpan; + using TS = TimeSpan; using TX = DvText; using U1 = Byte; using U2 = UInt16; @@ -252,9 +252,6 @@ private Conversions() AddIsNA(IsNA); AddIsNA(IsNA); AddIsNA(IsNA); - AddIsNA(IsNA); - AddIsNA
(IsNA); - AddIsNA(IsNA); AddGetNA(GetNA); AddGetNA(GetNA); @@ -264,9 +261,6 @@ private Conversions() AddGetNA(GetNA); AddGetNA(GetNA); AddGetNA(GetNA); - AddGetNA(GetNA); - AddGetNA
(GetNA); - AddGetNA(GetNA); AddHasNA(HasNA); AddHasNA(HasNA); @@ -276,9 +270,6 @@ private Conversions() AddHasNA(HasNA); AddHasNA(HasNA); AddHasNA(HasNA); - AddHasNA(HasNA); - AddHasNA
(HasNA); - AddHasNA(HasNA); AddIsDef(IsDefault); AddIsDef(IsDefault); @@ -853,9 +844,6 @@ public ValueGetter GetNAOrDefaultGetter(ColumnType type) private bool IsNA(ref R4 src) => src.IsNA(); private bool IsNA(ref R8 src) => src.IsNA(); private bool IsNA(ref BL src) => src.IsNA; - private bool IsNA(ref TS src) => src.IsNA; - private bool IsNA(ref DT src) => src.IsNA; - private bool IsNA(ref DZ src) => src.IsNA; private bool IsNA(ref TX src) => src.IsNA; #endregion IsNA @@ -867,9 +855,6 @@ public ValueGetter GetNAOrDefaultGetter(ColumnType type) private bool HasNA(ref VBuffer src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA()) return true; } return false; } private bool HasNA(ref VBuffer src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA()) return true; } return false; } private bool HasNA(ref VBuffer src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA) return true; } return false; } - private bool HasNA(ref VBuffer src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA) return true; } return false; } - private bool HasNA(ref VBuffer
src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA) return true; } return false; } - private bool HasNA(ref VBuffer src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA) return true; } return false; } private bool HasNA(ref VBuffer src) { for (int i = 0; i < src.Count; i++) { if (src.Values[i].IsNA) return true; } return false; } #endregion HasNA @@ -907,9 +892,6 @@ public ValueGetter GetNAOrDefaultGetter(ColumnType type) private void GetNA(ref R4 value) => value = R4.NaN; private void GetNA(ref R8 value) => value = R8.NaN; private void GetNA(ref BL value) => value = BL.NA; - private void GetNA(ref TS value) => value = TS.NA; - private void GetNA(ref DT value) => value = DT.NA; - private void GetNA(ref DZ value) => value = DZ.NA; private void GetNA(ref TX value) => value = TX.NA; #endregion GetNA @@ -1041,9 +1023,9 @@ public void Convert(ref BL src, ref SB dst) else if (src.IsTrue) dst.Append("1"); } - public void Convert(ref TS src, ref SB dst) { ClearDst(ref dst); if (!src.IsNA) dst.AppendFormat("{0:c}", (TimeSpan)src); } - public void Convert(ref DT src, ref SB dst) { ClearDst(ref dst); if (!src.IsNA) dst.AppendFormat("{0:o}", (DateTime)src); } - public void Convert(ref DZ src, ref SB dst) { ClearDst(ref dst); if (!src.IsNA) dst.AppendFormat("{0:o}", (DateTimeOffset)src); } + public void Convert(ref TS src, ref SB dst) { ClearDst(ref dst); dst.AppendFormat("{0:c}", src); } + public void Convert(ref DT src, ref SB dst) { ClearDst(ref dst); dst.AppendFormat("{0:o}", src); } + public void Convert(ref DZ src, ref SB dst) { ClearDst(ref dst); dst.AppendFormat("{0:o}", src); } #endregion ToStringBuilder #region FromR4 @@ -1472,61 +1454,37 @@ public bool TryParse(ref TX src, out R8 dst) public bool TryParse(ref TX src, out TS dst) { + dst = default; if (!src.HasChars) - { - if (src.IsNA) - dst = TS.NA; - else - dst = default(TS); return true; - } - TimeSpan res; - if (TimeSpan.TryParse(src.ToString(), CultureInfo.InvariantCulture, out res)) - { - dst = new TS(res); + + if (TimeSpan.TryParse(src.ToString(), CultureInfo.InvariantCulture, out dst)) return true; - } - dst = TS.NA; + return IsStdMissing(ref src); } public bool TryParse(ref TX src, out DT dst) { + dst = default; if (!src.HasChars) - { - if (src.IsNA) - dst = DvDateTime.NA; - else - dst = default(DvDateTime); return true; - } - DateTime res; - if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out res)) - { - dst = new DT(res); + + if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out dst)) return true; - } - dst = DvDateTime.NA; + return IsStdMissing(ref src); } public bool TryParse(ref TX src, out DZ dst) { + dst = default; if (!src.HasChars) - { - if (src.IsNA) - dst = DvDateTimeZone.NA; - else - dst = default(DvDateTimeZone); return true; - } - DateTimeOffset res; - if (DateTimeOffset.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out res)) - { - dst = new DZ(res); + + if (DateTimeOffset.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out dst)) return true; - } - dst = DvDateTimeZone.NA; + return IsStdMissing(ref src); } @@ -1804,21 +1762,10 @@ public void Convert(ref TX src, ref SB dst) src.AddToStringBuilder(dst); } - public void Convert(ref TX span, ref TS value) - { - if (!TryParse(ref span, out value)) - Contracts.Assert(value.IsNA); - } - public void Convert(ref TX span, ref DT value) - { - if (!TryParse(ref span, out value)) - Contracts.Assert(value.IsNA); - } - public void Convert(ref TX span, ref DZ value) - { - if (!TryParse(ref span, out value)) - Contracts.Assert(value.IsNA); - } + public void Convert(ref TX span, ref TS value) => TryParse(ref span, out value); + public void Convert(ref TX span, ref DT value) => TryParse(ref span, out value); + public void Convert(ref TX span, ref DZ value) => TryParse(ref span, out value); + #endregion FromTX #region FromBL diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs index d04adaf099..edfda0a956 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs @@ -54,7 +54,7 @@ public CodecFactory(IHostEnvironment env, MemoryStreamPool memPool = null) RegisterSimpleCodec(new UnsafeTypeCodec(this)); RegisterSimpleCodec(new UnsafeTypeCodec(this)); RegisterSimpleCodec(new UnsafeTypeCodec(this)); - RegisterSimpleCodec(new UnsafeTypeCodec(this)); + RegisterSimpleCodec(new UnsafeTypeCodec(this)); RegisterSimpleCodec(new DvTextCodec(this)); RegisterSimpleCodec(new BoolCodec(this)); RegisterSimpleCodec(new DateTimeCodec(this)); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index f840773872..56ba668e06 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -179,10 +179,10 @@ public override string LoadName } // Gatekeeper to ensure T is a type that is supported by UnsafeTypeCodec. - // Throws an exception if T is neither a DvTimeSpan nor a NumberType. + // Throws an exception if T is neither a TimeSpan nor a NumberType. private static ColumnType UnsafeColumnType(Type type) { - return type == typeof(DvTimeSpan) ? (ColumnType)TimeSpanType.Instance : NumberType.FromType(type); + return type == typeof(TimeSpan) ? (ColumnType)TimeSpanType.Instance : NumberType.FromType(type); } public UnsafeTypeCodec(CodecFactory factory) @@ -598,24 +598,24 @@ public override void Get(ref DvBool value) } } - private sealed class DateTimeCodec : SimpleCodec + private sealed class DateTimeCodec : SimpleCodec { public DateTimeCodec(CodecFactory factory) : base(factory, DateTimeType.Instance) { } - public override IValueWriter OpenWriter(Stream stream) + public override IValueWriter OpenWriter(Stream stream) { return new Writer(this, stream); } - public override IValueReader OpenReader(Stream stream, int items) + public override IValueReader OpenReader(Stream stream, int items) { return new Reader(this, stream, items); } - private sealed class Writer : ValueWriterBase + private sealed class Writer : ValueWriterBase { private long _numWritten; @@ -624,11 +624,9 @@ public Writer(DateTimeCodec codec, Stream stream) { } - public override void Write(ref DvDateTime value) + public override void Write(ref DateTime value) { - var ticks = value.Ticks.RawValue; - Contracts.Assert(ticks == DvInt8.RawNA || (ulong)ticks <= DvDateTime.MaxTicks); - Writer.Write(ticks); + Writer.Write(value.Ticks); _numWritten++; } @@ -643,10 +641,10 @@ public override long GetCommitLengthEstimate() } } - private sealed class Reader : ValueReaderBase + private sealed class Reader : ValueReaderBase { private int _remaining; - private DvDateTime _value; + private DateTime _value; public Reader(DateTimeCodec codec, Stream stream, int items) : base(codec.Factory, stream) @@ -657,74 +655,55 @@ public Reader(DateTimeCodec codec, Stream stream, int items) public override void MoveNext() { Contracts.Assert(_remaining > 0, "already consumed all values"); - var value = Reader.ReadInt64(); - Contracts.CheckDecode(value == DvInt8.RawNA || (ulong)value <= DvDateTime.MaxTicks); - _value = new DvDateTime(value); + _value = new DateTime(Reader.ReadInt64()); _remaining--; } - public override void Get(ref DvDateTime value) + public override void Get(ref DateTime value) { value = _value; } } } - private sealed class DateTimeZoneCodec : SimpleCodec + private sealed class DateTimeZoneCodec : SimpleCodec { - private readonly MadeObjectPool _shortBufferPool; private readonly MadeObjectPool _longBufferPool; public DateTimeZoneCodec(CodecFactory factory) : base(factory, DateTimeZoneType.Instance) { - _shortBufferPool = new MadeObjectPool(() => null); _longBufferPool = new MadeObjectPool(() => null); } - public override IValueWriter OpenWriter(Stream stream) + public override IValueWriter OpenWriter(Stream stream) { return new Writer(this, stream); } - public override IValueReader OpenReader(Stream stream, int items) + public override IValueReader OpenReader(Stream stream, int items) { return new Reader(this, stream, items); } - private sealed class Writer : ValueWriterBase + private sealed class Writer : ValueWriterBase { - private List _offsets; + private List _offsets; private List _ticks; public Writer(DateTimeZoneCodec codec, Stream stream) : base(codec.Factory, stream) { - _offsets = new List(); + _offsets = new List(); _ticks = new List(); } - public override void Write(ref DvDateTimeZone value) + public override void Write(ref DateTimeOffset value) { Contracts.Assert(_offsets != null, "writer was already committed"); - var ticks = value.ClockDateTime.Ticks; - var offset = value.OffsetMinutes; - - _ticks.Add(ticks.RawValue); - if (ticks.IsNA) - { - Contracts.Assert(offset.IsNA); - _offsets.Add(0); - } - else - { - Contracts.Assert( - offset.RawValue >= DvDateTimeZone.MinMinutesOffset && - offset.RawValue <= DvDateTimeZone.MaxMinutesOffset); - Contracts.Assert(0 <= ticks.RawValue && ticks.RawValue <= DvDateTime.MaxTicks); - _offsets.Add(offset.RawValue); - } + _ticks.Add(value.DateTime.Ticks); + _offsets.Add(value.Offset.Ticks); } public override void Commit() @@ -732,7 +711,7 @@ public override void Commit() Contracts.Assert(_offsets != null, "writer was already committed"); Contracts.Assert(Utils.Size(_offsets) == Utils.Size(_ticks)); - Writer.WriteShortStream(_offsets); // Write the offsets. + Writer.WriteLongStream(_offsets); // Write the offsets. Writer.WriteLongStream(_ticks); // Write the tick values. _offsets = null; _ticks = null; @@ -740,16 +719,16 @@ public override void Commit() public override long GetCommitLengthEstimate() { - return (long)_offsets.Count * (sizeof(Int64) + sizeof(Int16)); + return (long)_offsets.Count * (sizeof(Int64) + sizeof(Int64)); } } - private sealed class Reader : ValueReaderBase + private sealed class Reader : ValueReaderBase { private readonly DateTimeZoneCodec _codec; private readonly int _entries; - private short[] _offsets; + private long[] _offsets; private long[] _ticks; private int _index; private bool _disposed; @@ -761,20 +740,15 @@ public Reader(DateTimeZoneCodec codec, Stream stream, int items) _entries = items; _index = -1; - _offsets = _codec._shortBufferPool.Get(); + _offsets = _codec._longBufferPool.Get(); Utils.EnsureSize(ref _offsets, _entries, false); for (int i = 0; i < _entries; i++) - { - _offsets[i] = Reader.ReadInt16(); - Contracts.CheckDecode(DvDateTimeZone.MinMinutesOffset <= _offsets[i] && _offsets[i] <= DvDateTimeZone.MaxMinutesOffset); - } + _offsets[i] = Reader.ReadInt64(); + _ticks = _codec._longBufferPool.Get(); Utils.EnsureSize(ref _ticks, _entries, false); for (int i = 0; i < _entries; i++) - { _ticks[i] = Reader.ReadInt64(); - Contracts.CheckDecode(_ticks[i] == DvInt8.RawNA || (ulong)_ticks[i] <= DvDateTime.MaxTicks); - } } public override void MoveNext() @@ -783,17 +757,17 @@ public override void MoveNext() Contracts.Check(++_index < _entries, "reader already read all values"); } - public override void Get(ref DvDateTimeZone value) + public override void Get(ref DateTimeOffset value) { Contracts.Assert(!_disposed); - value = new DvDateTimeZone(_ticks[_index], _offsets[_index]); + value = new DateTimeOffset(new DateTime(_ticks[_index]), new TimeSpan(_offsets[_index])); } public override void Dispose() { if (!_disposed) { - _codec._shortBufferPool.Return(_offsets); + _codec._longBufferPool.Return(_offsets); _codec._longBufferPool.Return(_ticks); _offsets = null; _ticks = null; diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs index 026228d6be..152eb2a65e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs @@ -46,7 +46,7 @@ static UnsafeTypeOpsFactory() _type2ops[typeof(UInt64)] = new UInt64UnsafeTypeOps(); _type2ops[typeof(Single)] = new SingleUnsafeTypeOps(); _type2ops[typeof(Double)] = new DoubleUnsafeTypeOps(); - _type2ops[typeof(DvTimeSpan)] = new DvTimeSpanUnsafeTypeOps(); + _type2ops[typeof(TimeSpan)] = new TimeSpanUnsafeTypeOps(); _type2ops[typeof(UInt128)] = new UgUnsafeTypeOps(); } @@ -227,17 +227,17 @@ public override unsafe void Apply(Double[] array, Action func) public override Double Read(BinaryReader reader) { return reader.ReadDouble(); } } - private sealed class DvTimeSpanUnsafeTypeOps : UnsafeTypeOps + private sealed class TimeSpanUnsafeTypeOps : UnsafeTypeOps { public override int Size { get { return sizeof(Int64); } } - public override unsafe void Apply(DvTimeSpan[] array, Action func) + public override unsafe void Apply(TimeSpan[] array, Action func) { - fixed (DvTimeSpan* pArray = array) + fixed (TimeSpan* pArray = array) func(new IntPtr(pArray)); } - public override void Write(DvTimeSpan a, BinaryWriter writer) { writer.Write(a.Ticks.RawValue); } - public override DvTimeSpan Read(BinaryReader reader) { return new DvTimeSpan(reader.ReadInt64()); } + public override void Write(TimeSpan a, BinaryWriter writer) { writer.Write(a.Ticks); } + public override TimeSpan Read(BinaryReader reader) { return new TimeSpan(reader.ReadInt64()); } } private sealed class UgUnsafeTypeOps : UnsafeTypeOps diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs index e36f8545b0..a1c2c85a9e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs @@ -99,17 +99,17 @@ protected ValueWriterBase(PrimitiveType type, int source, char sep) } else if (type.IsTimeSpan) { - ValueMapper c = MapTimeSpan; + ValueMapper c = MapTimeSpan; Conv = (ValueMapper)(Delegate)c; } else if (type.IsDateTime) { - ValueMapper c = MapDateTime; + ValueMapper c = MapDateTime; Conv = (ValueMapper)(Delegate)c; } else if (type.IsDateTimeZone) { - ValueMapper c = MapDateTimeZone; + ValueMapper c = MapDateTimeZone; Conv = (ValueMapper)(Delegate)c; } else @@ -125,17 +125,17 @@ protected void MapText(ref DvText src, ref StringBuilder sb) TextSaverUtils.MapText(ref src, ref sb, Sep); } - protected void MapTimeSpan(ref DvTimeSpan src, ref StringBuilder sb) + protected void MapTimeSpan(ref TimeSpan src, ref StringBuilder sb) { TextSaverUtils.MapTimeSpan(ref src, ref sb); } - protected void MapDateTime(ref DvDateTime src, ref StringBuilder sb) + protected void MapDateTime(ref DateTime src, ref StringBuilder sb) { TextSaverUtils.MapDateTime(ref src, ref sb); } - protected void MapDateTimeZone(ref DvDateTimeZone src, ref StringBuilder sb) + protected void MapDateTimeZone(ref DateTimeOffset src, ref StringBuilder sb) { TextSaverUtils.MapDateTimeZone(ref src, ref sb); } @@ -851,34 +851,34 @@ internal static void MapText(ref DvText src, ref StringBuilder sb, char sep) } } - internal static void MapTimeSpan(ref DvTimeSpan src, ref StringBuilder sb) + internal static void MapTimeSpan(ref TimeSpan src, ref StringBuilder sb) { if (sb == null) sb = new StringBuilder(); else sb.Clear(); - if (!src.IsNA) - sb.AppendFormat("\"{0:c}\"", (TimeSpan)src); + + sb.AppendFormat("\"{0:c}\"", src); } - internal static void MapDateTime(ref DvDateTime src, ref StringBuilder sb) + internal static void MapDateTime(ref DateTime src, ref StringBuilder sb) { if (sb == null) sb = new StringBuilder(); else sb.Clear(); - if (!src.IsNA) - sb.AppendFormat("\"{0:o}\"", (DateTime)src); + + sb.AppendFormat("\"{0:o}\"", src); } - internal static void MapDateTimeZone(ref DvDateTimeZone src, ref StringBuilder sb) + internal static void MapDateTimeZone(ref DateTimeOffset src, ref StringBuilder sb) { if (sb == null) sb = new StringBuilder(); else sb.Clear(); - if (!src.IsNA) - sb.AppendFormat("\"{0:o}\"", (DateTimeOffset)src); + + sb.AppendFormat("\"{0:o}\"", src); } } } diff --git a/src/Microsoft.ML.Parquet/ParquetLoader.cs b/src/Microsoft.ML.Parquet/ParquetLoader.cs index 503debae65..ea9198b91a 100644 --- a/src/Microsoft.ML.Parquet/ParquetLoader.cs +++ b/src/Microsoft.ML.Parquet/ParquetLoader.cs @@ -527,9 +527,9 @@ private Delegate CreateGetterDelegate(int col) case DataType.Decimal: return CreateGetterDelegateCore(col, _parquetConversions.Conv); case DataType.DateTimeOffset: - return CreateGetterDelegateCore(col, _parquetConversions.Conv); + return CreateGetterDelegateCore(col, _parquetConversions.Conv); case DataType.Interval: - return CreateGetterDelegateCore(col, _parquetConversions.Conv); + return CreateGetterDelegateCore(col, _parquetConversions.Conv); default: return CreateGetterDelegateCore(col, _parquetConversions.Conv); } @@ -700,7 +700,7 @@ public ParquetConversions(IChannel channel) public void Conv(ref bool? src, ref DvBool dst) => dst = src ?? DvBool.NA; - public void Conv(ref DateTimeOffset src, ref DvDateTimeZone dst) => dst = src; + public void Conv(ref DateTimeOffset src, ref DateTimeOffset dst) => dst = src; public void Conv(ref IList src, ref DvText dst) => dst = new DvText(ConvertListToString(src)); @@ -727,21 +727,21 @@ public void Conv(ref BigInteger src, ref UInt128 dst) } /// - /// Converts a Parquet Interval data type value to a DvTimeSpan data type value. + /// Converts a Parquet Interval data type value to a TimeSpan data type value. /// /// Parquet Interval value (int : months, int : days, int : milliseconds). - /// DvTimeSpan object. - public void Conv(ref Interval src, ref DvTimeSpan dst) + /// TimeSpan object. + public void Conv(ref Interval src, ref TimeSpan dst) { try { - dst = new DvTimeSpan(TimeSpan.FromDays(src.Months * 30 + src.Days) + TimeSpan.FromMilliseconds(src.Millis)); + dst = TimeSpan.FromDays(src.Months * 30 + src.Days) + TimeSpan.FromMilliseconds(src.Millis); } catch (Exception ex) { // Handle TimeSpan OverflowException - _ch.Error("Cannot convert Inteval to DvTimeSpan. Exception : '{0}'", ex.Message); - dst = DvTimeSpan.NA; + _ch.Error("Cannot convert Inteval to TimeSpan. Exception : '{0}'", ex.Message); + dst = default; } } diff --git a/src/Microsoft.ML.Transforms/NAReplaceUtils.cs b/src/Microsoft.ML.Transforms/NAReplaceUtils.cs index 2340f9b413..fe58be6503 100644 --- a/src/Microsoft.ML.Transforms/NAReplaceUtils.cs +++ b/src/Microsoft.ML.Transforms/NAReplaceUtils.cs @@ -14,7 +14,7 @@ public sealed partial class NAReplaceTransform { private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, ReplacementKind? kind, bool bySlot, IRowCursor cursor, int col) { - ch.Assert(type.ItemType.IsNumber || type.ItemType.IsTimeSpan || type.ItemType.IsDateTime); + ch.Assert(type.ItemType.IsNumber); if (!type.IsVector) { // The type is a scalar. @@ -34,10 +34,6 @@ private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, return new R4.MeanAggregatorOne(ch, cursor, col); case DataKind.R8: return new R8.MeanAggregatorOne(ch, cursor, col); - case DataKind.TS: - return new Long.MeanAggregatorOne(ch, type, cursor, col); - case DataKind.DT: - return new Long.MeanAggregatorOne(ch, type, cursor, col); default: break; } @@ -58,10 +54,6 @@ private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, return new R4.MinMaxAggregatorOne(ch, cursor, col, kind == ReplacementKind.Max); case DataKind.R8: return new R8.MinMaxAggregatorOne(ch, cursor, col, kind == ReplacementKind.Max); - case DataKind.TS: - return new Long.MinMaxAggregatorOne(ch, type, cursor, col, kind == ReplacementKind.Max); - case DataKind.DT: - return new Long.MinMaxAggregatorOne(ch, type, cursor, col, kind == ReplacementKind.Max); default: break; } @@ -90,10 +82,6 @@ private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, return new R4.MeanAggregatorBySlot(ch, type, cursor, col); case DataKind.R8: return new R8.MeanAggregatorBySlot(ch, type, cursor, col); - case DataKind.TS: - return new Long.MeanAggregatorBySlot(ch, type, cursor, col); - case DataKind.DT: - return new Long.MeanAggregatorBySlot(ch, type, cursor, col); default: break; } @@ -114,10 +102,6 @@ private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, return new R4.MinMaxAggregatorBySlot(ch, type, cursor, col, kind == ReplacementKind.Max); case DataKind.R8: return new R8.MinMaxAggregatorBySlot(ch, type, cursor, col, kind == ReplacementKind.Max); - case DataKind.TS: - return new Long.MinMaxAggregatorBySlot(ch, type, cursor, col, kind == ReplacementKind.Max); - case DataKind.DT: - return new Long.MinMaxAggregatorBySlot(ch, type, cursor, col, kind == ReplacementKind.Max); default: break; } @@ -142,10 +126,6 @@ private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, return new R4.MeanAggregatorAcrossSlots(ch, cursor, col); case DataKind.R8: return new R8.MeanAggregatorAcrossSlots(ch, cursor, col); - case DataKind.TS: - return new Long.MeanAggregatorAcrossSlots(ch, type, cursor, col); - case DataKind.DT: - return new Long.MeanAggregatorAcrossSlots(ch, type, cursor, col); default: break; } @@ -166,10 +146,6 @@ private static StatAggregator CreateStatAggregator(IChannel ch, ColumnType type, return new R4.MinMaxAggregatorAcrossSlots(ch, cursor, col, kind == ReplacementKind.Max); case DataKind.R8: return new R8.MinMaxAggregatorAcrossSlots(ch, cursor, col, kind == ReplacementKind.Max); - case DataKind.TS: - return new Long.MinMaxAggregatorAcrossSlots(ch, type, cursor, col, kind == ReplacementKind.Max); - case DataKind.DT: - return new Long.MinMaxAggregatorAcrossSlots(ch, type, cursor, col, kind == ReplacementKind.Max); default: break; } @@ -1655,16 +1631,9 @@ private static Converter CreateConverter(ColumnType type) { Contracts.AssertValue(type); Contracts.Assert(typeof(TItem) == type.ItemType.RawType); - Converter converter; - if (type.ItemType.IsTimeSpan) - converter = new TSConverter(); - else if (type.ItemType.IsDateTime) - converter = new DTConverter(); - else - { - Contracts.Assert(type.ItemType.RawKind == DataKind.I8); - converter = new I8Converter(); - } + Contracts.Assert(type.ItemType.RawKind == DataKind.I8); + + Converter converter = new I8Converter(); return (Converter)converter; } @@ -1694,34 +1663,6 @@ public override DvInt8 FromLong(long val) return (DvInt8)val; } } - - private sealed class TSConverter : Converter - { - public override long ToLong(DvTimeSpan val) - { - return val.Ticks.RawValue; - } - - public override DvTimeSpan FromLong(long val) - { - Contracts.Assert(DvInt8.RawNA != val); - return new DvTimeSpan(val); - } - } - - private sealed class DTConverter : Converter - { - public override long ToLong(DvDateTime val) - { - return val.Ticks.RawValue; - } - - public override DvDateTime FromLong(long val) - { - Contracts.Assert(0 <= val && val <= DvDateTime.MaxTicks); - return new DvDateTime(val); - } - } } } } \ No newline at end of file diff --git a/src/Microsoft.ML/Data/TextLoader.cs b/src/Microsoft.ML/Data/TextLoader.cs index 330412185e..b2f891da8f 100644 --- a/src/Microsoft.ML/Data/TextLoader.cs +++ b/src/Microsoft.ML/Data/TextLoader.cs @@ -184,11 +184,11 @@ private static bool TryGetDataKind(Type type, out DataKind kind) kind = DataKind.TX; else if (type == typeof(DvBool) || type == typeof(bool)) kind = DataKind.BL; - else if (type == typeof(DvTimeSpan) || type == typeof(TimeSpan)) + else if (type == typeof(TimeSpan)) kind = DataKind.TS; - else if (type == typeof(DvDateTime) || type == typeof(DateTime)) + else if (type == typeof(DateTime)) kind = DataKind.DT; - else if (type == typeof(DvDateTimeZone) || type == typeof(TimeZoneInfo)) + else if (type == typeof(DateTimeOffset) || type == typeof(TimeZoneInfo)) kind = DataKind.DZ; else if (type == typeof(UInt128)) kind = DataKind.UG; diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/CoreBaseTestClass.cs b/test/Microsoft.ML.Core.Tests/UnitTests/CoreBaseTestClass.cs index 35859783ad..e24e80ae50 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/CoreBaseTestClass.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/CoreBaseTestClass.cs @@ -180,11 +180,11 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Ticks == y.Ticks); case DataKind.DT: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Ticks == y.Ticks); case DataKind.DZ: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.UG: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); } @@ -223,11 +223,11 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Ticks == y.Ticks); case DataKind.DT: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Ticks == y.Ticks); case DataKind.DZ: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.UG: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); } diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs index b53062c1a8..8f7308b581 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs @@ -906,11 +906,11 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Ticks == y.Ticks); case DataKind.DT: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Ticks == y.Ticks); case DataKind.DZ: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.UG: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); } @@ -949,11 +949,11 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Ticks == y.Ticks); case DataKind.DT: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Ticks == y.Ticks); case DataKind.DZ: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.UG: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); } From e0d66b0c6ee72b2bb11b8e4937990a02d4073f06 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 19 Aug 2018 00:00:27 -0700 Subject: [PATCH 02/15] rename DateTimeZone to DateTimeOffset. --- src/Microsoft.ML.Core/Data/ColumnType.cs | 20 +++++++++---------- .../DataLoadSave/Binary/CodecFactory.cs | 2 +- .../DataLoadSave/Binary/Codecs.cs | 12 +++++------ src/Microsoft.ML.Parquet/ParquetLoader.cs | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/Microsoft.ML.Core/Data/ColumnType.cs b/src/Microsoft.ML.Core/Data/ColumnType.cs index 372abe31e5..8928c5a715 100644 --- a/src/Microsoft.ML.Core/Data/ColumnType.cs +++ b/src/Microsoft.ML.Core/Data/ColumnType.cs @@ -156,10 +156,10 @@ public bool IsDateTimeZone { get { - if (!(this is DateTimeZoneType)) + if (!(this is DateTimeOffsetType)) return false; - // DateTimeZoneType is a singleton. - Contracts.Assert(this == DateTimeZoneType.Instance); + // DateTimeOffsetType is a singleton. + Contracts.Assert(this == DateTimeOffsetType.Instance); return true; } } @@ -319,7 +319,7 @@ public static PrimitiveType FromKind(DataKind kind) if (kind == DataKind.DT) return DateTimeType.Instance; if (kind == DataKind.DZ) - return DateTimeZoneType.Instance; + return DateTimeOffsetType.Instance; return NumberType.FromKind(kind); } } @@ -623,20 +623,20 @@ public override string ToString() } } - public sealed class DateTimeZoneType : PrimitiveType + public sealed class DateTimeOffsetType : PrimitiveType { - private static volatile DateTimeZoneType _instance; - public static DateTimeZoneType Instance + private static volatile DateTimeOffsetType _instance; + public static DateTimeOffsetType Instance { get { if (_instance == null) - Interlocked.CompareExchange(ref _instance, new DateTimeZoneType(), null); + Interlocked.CompareExchange(ref _instance, new DateTimeOffsetType(), null); return _instance; } } - private DateTimeZoneType() + private DateTimeOffsetType() : base(typeof(DateTimeOffset), DataKind.DZ) { } @@ -645,7 +645,7 @@ public override bool Equals(ColumnType other) { if (other == this) return true; - Contracts.Assert(!(other is DateTimeZoneType)); + Contracts.Assert(!(other is DateTimeOffsetType)); return false; } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs index edfda0a956..93246bb903 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs @@ -58,7 +58,7 @@ public CodecFactory(IHostEnvironment env, MemoryStreamPool memPool = null) RegisterSimpleCodec(new DvTextCodec(this)); RegisterSimpleCodec(new BoolCodec(this)); RegisterSimpleCodec(new DateTimeCodec(this)); - RegisterSimpleCodec(new DateTimeZoneCodec(this)); + RegisterSimpleCodec(new DateTimeOffsetCodec(this)); RegisterSimpleCodec(new UnsafeTypeCodec(this)); // Register the old boolean reading codec. diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index 56ba668e06..3d90efe33e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -666,12 +666,12 @@ public override void Get(ref DateTime value) } } - private sealed class DateTimeZoneCodec : SimpleCodec + private sealed class DateTimeOffsetCodec : SimpleCodec { private readonly MadeObjectPool _longBufferPool; - public DateTimeZoneCodec(CodecFactory factory) - : base(factory, DateTimeZoneType.Instance) + public DateTimeOffsetCodec(CodecFactory factory) + : base(factory, DateTimeOffsetType.Instance) { _longBufferPool = new MadeObjectPool(() => null); } @@ -691,7 +691,7 @@ private sealed class Writer : ValueWriterBase private List _offsets; private List _ticks; - public Writer(DateTimeZoneCodec codec, Stream stream) + public Writer(DateTimeOffsetCodec codec, Stream stream) : base(codec.Factory, stream) { _offsets = new List(); @@ -725,7 +725,7 @@ public override long GetCommitLengthEstimate() private sealed class Reader : ValueReaderBase { - private readonly DateTimeZoneCodec _codec; + private readonly DateTimeOffsetCodec _codec; private readonly int _entries; private long[] _offsets; @@ -733,7 +733,7 @@ private sealed class Reader : ValueReaderBase private int _index; private bool _disposed; - public Reader(DateTimeZoneCodec codec, Stream stream, int items) + public Reader(DateTimeOffsetCodec codec, Stream stream, int items) : base(codec.Factory, stream) { _codec = codec; diff --git a/src/Microsoft.ML.Parquet/ParquetLoader.cs b/src/Microsoft.ML.Parquet/ParquetLoader.cs index ea9198b91a..cba3b74803 100644 --- a/src/Microsoft.ML.Parquet/ParquetLoader.cs +++ b/src/Microsoft.ML.Parquet/ParquetLoader.cs @@ -358,7 +358,7 @@ private ColumnType ConvertFieldType(DataType parquetType) case DataType.Decimal: return NumberType.R8; case DataType.DateTimeOffset: - return DateTimeZoneType.Instance; + return DateTimeOffsetType.Instance; case DataType.Interval: return TimeSpanType.Instance; default: From e55c78064209aa3b30489324fb596a1e495e2729 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 26 Aug 2018 19:35:36 -0700 Subject: [PATCH 03/15] PR feedback. --- src/Microsoft.ML.Core/Data/ColumnType.cs | 27 +- src/Microsoft.ML.Core/Data/DateTime.cs | 550 ----------------------- src/Microsoft.ML.Data/Data/Conversion.cs | 3 +- src/Microsoft.ML/Data/TextLoader.cs | 2 +- 4 files changed, 11 insertions(+), 571 deletions(-) delete mode 100644 src/Microsoft.ML.Core/Data/DateTime.cs diff --git a/src/Microsoft.ML.Core/Data/ColumnType.cs b/src/Microsoft.ML.Core/Data/ColumnType.cs index 8928c5a715..1656843197 100644 --- a/src/Microsoft.ML.Core/Data/ColumnType.cs +++ b/src/Microsoft.ML.Core/Data/ColumnType.cs @@ -120,47 +120,38 @@ public bool IsBool } /// - /// Whether this type is the standard timespan type. + /// Whether this type is the standard type. /// public bool IsTimeSpan { get { - if (!(this is TimeSpanType)) - return false; - // TimeSpanType is a singleton. - Contracts.Assert(this == TimeSpanType.Instance); - return true; + Contracts.Assert((this == TimeSpanType.Instance) == (this is TimeSpanType)); + return this is TimeSpanType; } } /// - /// Whether this type is a DateTime. + /// Whether this type is a . /// public bool IsDateTime { get { - if (!(this is DateTimeType)) - return false; - // DateTimeType is a singleton. - Contracts.Assert(this == DateTimeType.Instance); - return true; + Contracts.Assert((this == DateTimeType.Instance) == (this is DateTimeType)); + return this is DateTimeType; } } /// - /// Whether this type is a DateTimeOffset. + /// Whether this type is a /// public bool IsDateTimeZone { get { - if (!(this is DateTimeOffsetType)) - return false; - // DateTimeOffsetType is a singleton. - Contracts.Assert(this == DateTimeOffsetType.Instance); - return true; + Contracts.Assert((this == DateTimeOffsetType.Instance) == (this is DateTimeOffsetType)); + return this is DateTimeOffsetType; } } diff --git a/src/Microsoft.ML.Core/Data/DateTime.cs b/src/Microsoft.ML.Core/Data/DateTime.cs deleted file mode 100644 index d11be2a494..0000000000 --- a/src/Microsoft.ML.Core/Data/DateTime.cs +++ /dev/null @@ -1,550 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime.Internal.Utilities; - -namespace Microsoft.ML.Runtime.Data -{ - using Conditional = System.Diagnostics.ConditionalAttribute; - using SysDateTime = System.DateTime; - using SysDateTimeOffset = System.DateTimeOffset; - using SysTimeSpan = System.TimeSpan; - - /// - /// A struct to represent a DateTime column type - /// - public struct DvDateTime : IEquatable, IComparable - { - public const long MaxTicks = 3155378975999999999; - private readonly DvInt8 _ticks; - - /// - /// This ctor initializes _ticks to the value of sdt.Ticks, and ignores its DateTimeKind value. - /// - public DvDateTime(SysDateTime sdt) - { - _ticks = sdt.Ticks; - AssertValid(); - } - - /// - /// This ctor accepts any value for ticks, but produces an NA if ticks is out of the legal range. - /// - public DvDateTime(DvInt8 ticks) - { - if ((ulong)ticks.RawValue > MaxTicks) - _ticks = DvInt8.NA; - else - _ticks = ticks; - AssertValid(); - } - - [Conditional("DEBUG")] - internal void AssertValid() - { - Contracts.Assert((ulong)_ticks.RawValue <= MaxTicks || _ticks.IsNA); - } - - public DvInt8 Ticks - { - get - { - AssertValid(); - return _ticks; - } - } - - // REVIEW: Add more System.DateTime members returning their corresponding 'Dv' types (task 4255). - /// - /// Gets the date component of this object. - /// - public DvDateTime Date - { - get - { - AssertValid(); - if (IsNA) - return NA; - return new DvDateTime(GetSysDateTime().Date); - } - } - - /// - /// Gets a DvDateTime object representing the current UTC date and time. - /// - public static DvDateTime UtcNow { get { return new DvDateTime(SysDateTime.UtcNow); } } - - public bool IsNA - { - get - { - AssertValid(); - return (ulong)_ticks.RawValue > MaxTicks; - } - } - - public static DvDateTime NA - { - get { return new DvDateTime(DvInt8.NA); } - } - - public static explicit operator SysDateTime?(DvDateTime dvDt) - { - if (dvDt.IsNA) - return null; - return dvDt.GetSysDateTime(); - } - - /// - /// Creates a new DvDateTime with the same number of ticks as in sdt, ignoring its DateTimeKind value. - /// - public static implicit operator DvDateTime(SysDateTime sdt) - { - return new DvDateTime(sdt); - } - - public static implicit operator DvDateTime(SysDateTime? sdt) - { - if (sdt == null) - return DvDateTime.NA; - return new DvDateTime(sdt.Value); - } - - public override string ToString() - { - AssertValid(); - if (IsNA) - return ""; - return GetSysDateTime().ToString("o"); - } - - internal SysDateTime GetSysDateTime() - { - AssertValid(); - Contracts.Assert(!IsNA); - return new SysDateTime(_ticks.RawValue); - } - - public bool Equals(DvDateTime other) - { - return _ticks.RawValue == other._ticks.RawValue; - } - - public override bool Equals(object obj) - { - return obj is DvDateTime && Equals((DvDateTime)obj); - } - - public int CompareTo(DvDateTime other) - { - if (_ticks.RawValue == other._ticks.RawValue) - return 0; - return _ticks.RawValue < other._ticks.RawValue ? -1 : 1; - } - - public override int GetHashCode() - { - return _ticks.GetHashCode(); - } - } - - /// - /// A struct to represent a DateTimeZone column type. - /// - public struct DvDateTimeZone : IEquatable, IComparable - { - public const long TicksPerMinute = 600000000; - public const long MaxMinutesOffset = 840; - public const long MinMinutesOffset = -840; - - // Stores the UTC date-time (convert to clock time by adding the offset). - private readonly DvDateTime _dateTime; - // Store the offset in minutes. - private readonly DvInt2 _offset; - - // This assumes (and asserts) that the dt/offset combination is valid. - // Callers should do the validation. - private DvDateTimeZone(DvDateTime dt, DvInt2 offset) - { - _dateTime = dt; - _offset = offset; - AssertValid(); - } - - /// - /// Given a number of ticks for the date time portion and a number of minutes for - /// the time zone offset, this constructs a new DvDateTimeZone. If anything is invalid, - /// it produces NA. - /// - /// The number of clock ticks in the date time portion - /// The time zone offset in minutes - public DvDateTimeZone(DvInt8 ticks, DvInt2 offset) - { - var dt = new DvDateTime(ticks); - if (dt.IsNA || offset.IsNA || MinMinutesOffset > offset.RawValue || offset.RawValue > MaxMinutesOffset) - { - _dateTime = DvDateTime.NA; - _offset = DvInt2.NA; - } - else - { - _offset = offset; - _dateTime = ValidateDate(dt, ref _offset); - } - AssertValid(); - } - - public DvDateTimeZone(SysDateTimeOffset dto) - { - // Since it is constructed from a SysDateTimeOffset, all the validations should work. - var success = TryValidateOffset(dto.Offset.Ticks, out _offset); - Contracts.Assert(success); - _dateTime = ValidateDate(new DvDateTime(dto.DateTime), ref _offset); - Contracts.Assert(!_dateTime.IsNA); - Contracts.Assert(!_offset.IsNA); - AssertValid(); - } - - /// - /// Constructs a DvDateTimeZone from a clock date-time and a time zone offset from UTC. - /// - /// The clock time - /// The offset - public DvDateTimeZone(DvDateTime dt, DvTimeSpan offset) - { - if (dt.IsNA || offset.IsNA || !TryValidateOffset(offset.Ticks, out _offset)) - { - _dateTime = DvDateTime.NA; - _offset = DvInt2.NA; - } - else - _dateTime = ValidateDate(dt, ref _offset); - AssertValid(); - } - - /// - /// This method takes a DvDateTime representing clock time, and a TimeSpan representing an offset, - /// validates that both the clock time and the UTC time (which is the clock time minus the offset) - /// are within the valid range, and returns a DvDateTime representing the UTC time (dateTime-offset). - /// - /// The clock time - /// The offset. This value is assumed to be validated as a legal offset: - /// a value in whole minutes, between -14 and 14 hours. - /// The UTC DvDateTime representing the input clock time minus the offset - private static DvDateTime ValidateDate(DvDateTime dateTime, ref DvInt2 offset) - { - Contracts.Assert(!dateTime.IsNA); - Contracts.Assert(!offset.IsNA); - - // Validate that both the UTC and clock times are legal. - Contracts.Assert(MinMinutesOffset <= offset.RawValue && offset.RawValue <= MaxMinutesOffset); - var offsetTicks = offset.RawValue * TicksPerMinute; - // This operation cannot overflow because offset should have already been validated to be within - // 14 hours and the DateTime instance is more than that distance from the boundaries of Int64. - long utcTicks = dateTime.Ticks.RawValue - offsetTicks; - var dvdt = new DvDateTime(utcTicks); - if (dvdt.IsNA) - offset = DvInt2.NA; - return dvdt; - } - - /// - /// This method takes a TimeSpan offset, validates that it is a legal offset for DvDateTimeZone (i.e. - /// in whole minutes, and between -14 and 14 hours), and returns the offset in number of minutes. - /// - /// - /// - /// - private static bool TryValidateOffset(DvInt8 offsetTicks, out DvInt2 offset) - { - if (offsetTicks.IsNA || offsetTicks.RawValue % TicksPerMinute != 0) - { - offset = DvInt2.NA; - return false; - } - - long mins = offsetTicks.RawValue / TicksPerMinute; - short res = (short)mins; - if (res != mins || res > MaxMinutesOffset || res < MinMinutesOffset) - { - offset = DvInt2.NA; - return false; - } - offset = res; - Contracts.Assert(!offset.IsNA); - return true; - } - - [Conditional("DEBUG")] - private void AssertValid() - { - _dateTime.AssertValid(); - if (_dateTime.IsNA) - Contracts.Assert(_offset.IsNA); - else - { - Contracts.Assert(MinMinutesOffset <= _offset.RawValue && _offset.RawValue <= MaxMinutesOffset); - Contracts.Assert((ulong)(_dateTime.Ticks.RawValue + _offset.RawValue * TicksPerMinute) - <= (ulong)DvDateTime.MaxTicks); - } - } - - public DvDateTime ClockDateTime - { - get - { - AssertValid(); - if (_dateTime.IsNA) - return DvDateTime.NA; - var res = new DvDateTime(_dateTime.Ticks.RawValue + _offset.RawValue * TicksPerMinute); - Contracts.Assert(!res.IsNA); - return res; - } - } - - /// - /// Gets the UTC date and time. - /// - public DvDateTime UtcDateTime - { - get - { - AssertValid(); - if (IsNA) - return DvDateTime.NA; - return _dateTime; - } - } - - /// - /// Gets the offset as a time span. - /// - public DvTimeSpan Offset - { - get - { - AssertValid(); - if (_offset.IsNA) - return DvTimeSpan.NA; - return new DvTimeSpan(_offset.RawValue * TicksPerMinute); - } - } - - /// - /// Gets the offset in minutes. - /// - public DvInt2 OffsetMinutes - { - get - { - AssertValid(); - return _offset; - } - } - - // REVIEW: Add more System.DateTimeOffset members returning their corresponding 'Dv' types (task 4255). - - /// - /// Gets the date component of the ClockDateTime. - /// - public DvDateTime ClockDate - { - get - { - AssertValid(); - if (IsNA) - return DvDateTime.NA; - return ClockDateTime.Date; - } - } - - /// - /// Gets the date component of the UtcDateTime. - /// - public DvDateTime UtcDate - { - get - { - AssertValid(); - if (IsNA) - return DvDateTime.NA; - return _dateTime.Date; - } - } - - /// - /// Gets a DvDateTimeZone object representing the current UTC date and time (with offset=0). - /// - public static DvDateTimeZone UtcNow { get { return new DvDateTimeZone(SysDateTimeOffset.UtcNow); } } - - public bool IsNA - { - get - { - AssertValid(); - return _dateTime.IsNA; - } - } - - // The missing value for DvDateTimeZone is represented by a DvDateTimeZone with _dateTime = DvDateTime.NA - // and _offset = 0. - public static DvDateTimeZone NA - { - get { return new DvDateTimeZone(DvDateTime.NA, DvInt2.NA); } - } - - public static explicit operator SysDateTimeOffset?(DvDateTimeZone dvDto) - { - if (dvDto.IsNA) - return null; - return dvDto.GetSysDateTimeOffset(); - } - - public static implicit operator DvDateTimeZone(SysDateTimeOffset sdto) - { - return new DvDateTimeZone(sdto); - } - - public static implicit operator DvDateTimeZone(SysDateTimeOffset? sdto) - { - if (sdto == null) - return DvDateTimeZone.NA; - return new DvDateTimeZone(sdto.Value); - } - - public override string ToString() - { - AssertValid(); - if (IsNA) - return ""; - - return GetSysDateTimeOffset().ToString("o"); - } - - private DateTimeOffset GetSysDateTimeOffset() - { - AssertValid(); - Contracts.Assert(!IsNA); - return new SysDateTimeOffset(ClockDateTime.GetSysDateTime(), new TimeSpan(0, _offset.RawValue, 0)); - } - - /// - /// Compare two values for equality. Note that this differs from System.DateTimeOffset's - /// definition of Equals, which only compares the UTC values, not the offsets. - /// - public bool Equals(DvDateTimeZone other) - { - return _offset.RawValue == other._offset.RawValue && _dateTime.Equals(other._dateTime); - } - - public override bool Equals(object obj) - { - return obj is DvDateTimeZone && Equals((DvDateTimeZone)obj); - } - - /// - /// Compare two values for ordering. Note that this differs from System.DateTimeOffset's - /// definition of CompareTo, which only compares the UTC values, not the offsets. - /// - public int CompareTo(DvDateTimeZone other) - { - AssertValid(); - other.AssertValid(); - - int res = _dateTime.CompareTo(other._dateTime); - if (res != 0) - return res; - if (_offset.RawValue == other._offset.RawValue) - return 0; - return _offset.RawValue < other._offset.RawValue ? -1 : 1; - } - - public override int GetHashCode() - { - return Hashing.CombineHash(_dateTime.GetHashCode(), _offset.GetHashCode()); - } - } - - /// - /// A struct to represent a DateTime column type - /// - public struct DvTimeSpan : IEquatable, IComparable - { - private readonly DvInt8 _ticks; - - public DvInt8 Ticks { get { return _ticks; } } - - public DvTimeSpan(DvInt8 ticks) - { - _ticks = ticks; - } - - public DvTimeSpan(SysTimeSpan sts) - { - _ticks = sts.Ticks; - } - - public DvTimeSpan(SysTimeSpan? sts) - { - _ticks = sts != null ? sts.GetValueOrDefault().Ticks : DvInt8.NA; - } - - public bool IsNA - { - get { return _ticks.IsNA; } - } - - public static DvTimeSpan NA - { - get { return new DvTimeSpan(DvInt8.NA); } - } - - public static explicit operator SysTimeSpan?(DvTimeSpan ts) - { - if (ts.IsNA) - return null; - return new SysTimeSpan(ts._ticks.RawValue); - } - - public static implicit operator DvTimeSpan(SysTimeSpan sts) - { - return new DvTimeSpan(sts); - } - - public static implicit operator DvTimeSpan(SysTimeSpan? sts) - { - return new DvTimeSpan(sts); - } - - public override string ToString() - { - if (IsNA) - return ""; - return new SysTimeSpan(_ticks.RawValue).ToString("c"); - } - - public bool Equals(DvTimeSpan other) - { - return _ticks.RawValue == other._ticks.RawValue; - } - - public override bool Equals(object obj) - { - return obj is DvTimeSpan && Equals((DvTimeSpan)obj); - } - - public int CompareTo(DvTimeSpan other) - { - if (_ticks.RawValue == other._ticks.RawValue) - return 0; - return _ticks.RawValue < other._ticks.RawValue ? -1 : 1; - } - - public override int GetHashCode() - { - return _ticks.GetHashCode(); - } - } -} diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 6a02c33805..8fbcaa2696 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -1467,8 +1467,7 @@ public bool TryParse(ref TX src, out TS dst) public bool TryParse(ref TX src, out DT dst) { dst = default; - if (!src.HasChars) - return true; + Contracts.Check(src.HasChars, "Missing or empty valyes cannot be converted to boolean value."); if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out dst)) return true; diff --git a/src/Microsoft.ML/Data/TextLoader.cs b/src/Microsoft.ML/Data/TextLoader.cs index b2f891da8f..27d4a9e319 100644 --- a/src/Microsoft.ML/Data/TextLoader.cs +++ b/src/Microsoft.ML/Data/TextLoader.cs @@ -188,7 +188,7 @@ private static bool TryGetDataKind(Type type, out DataKind kind) kind = DataKind.TS; else if (type == typeof(DateTime)) kind = DataKind.DT; - else if (type == typeof(DateTimeOffset) || type == typeof(TimeZoneInfo)) + else if (type == typeof(DateTimeOffset)) kind = DataKind.DZ; else if (type == typeof(UInt128)) kind = DataKind.UG; From 53a84d165642e22d73b7a370f157cf3e290722c9 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 26 Aug 2018 21:30:55 -0700 Subject: [PATCH 04/15] PR feedback. --- src/Microsoft.ML.Data/Data/Conversion.cs | 21 +++++++++++--- .../DataLoadSave/Binary/Codecs.cs | 28 ++++++++++++------- .../DataPipe/TestDataPipeBase.cs | 8 +++--- 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 8fbcaa2696..3a6c8fa329 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -1456,35 +1456,48 @@ public bool TryParse(ref TX src, out TS dst) { dst = default; if (!src.HasChars) + { + Contracts.Check(!src.IsNA, "Missing values cannot be converted to boolean value."); return true; + } if (TimeSpan.TryParse(src.ToString(), CultureInfo.InvariantCulture, out dst)) return true; - return IsStdMissing(ref src); + Contracts.Check(!IsStdMissing(ref src), "Missing values cannot be converted to boolean value."); + return true; } public bool TryParse(ref TX src, out DT dst) { dst = default; - Contracts.Check(src.HasChars, "Missing or empty valyes cannot be converted to boolean value."); + if (!src.HasChars) + { + Contracts.Check(!src.IsNA, "Missing values cannot be converted to boolean value."); + return true; + } if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out dst)) return true; - return IsStdMissing(ref src); + Contracts.Check(!IsStdMissing(ref src), "Missing values cannot be converted to boolean value."); + return true; } public bool TryParse(ref TX src, out DZ dst) { dst = default; if (!src.HasChars) + { + Contracts.Check(!src.IsNA, "Missing values cannot be converted to boolean value."); return true; + } if (DateTimeOffset.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out dst)) return true; - return IsStdMissing(ref src); + Contracts.Check(!IsStdMissing(ref src), "Missing values cannot be converted to boolean value."); + return true; } // These map unparsable and overflow values to "NA", which is the value Ix.MinValue. Note that this NA diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index 3d90efe33e..044e5f5c31 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -669,11 +669,13 @@ public override void Get(ref DateTime value) private sealed class DateTimeOffsetCodec : SimpleCodec { private readonly MadeObjectPool _longBufferPool; + private readonly MadeObjectPool _shortBufferPool; public DateTimeOffsetCodec(CodecFactory factory) : base(factory, DateTimeOffsetType.Instance) { _longBufferPool = new MadeObjectPool(() => null); + _shortBufferPool = new MadeObjectPool(() => null); } public override IValueWriter OpenWriter(Stream stream) @@ -688,13 +690,13 @@ public override IValueReader OpenReader(Stream stream, int items private sealed class Writer : ValueWriterBase { - private List _offsets; + private List _offsets; private List _ticks; public Writer(DateTimeOffsetCodec codec, Stream stream) : base(codec.Factory, stream) { - _offsets = new List(); + _offsets = new List(); _ticks = new List(); } @@ -703,7 +705,13 @@ public override void Write(ref DateTimeOffset value) Contracts.Assert(_offsets != null, "writer was already committed"); _ticks.Add(value.DateTime.Ticks); - _offsets.Add(value.Offset.Ticks); + + //DateTimeOffset exposes its offset as a TimeSpan, but internally it uses short and in minutes. + //https://github.com/dotnet/coreclr/blob/9499b08eefd895158c3f3c7834e185a73619128d/src/System.Private.CoreLib/shared/System/DateTimeOffset.cs#L51-L53 + //https://github.com/dotnet/coreclr/blob/9499b08eefd895158c3f3c7834e185a73619128d/src/System.Private.CoreLib/shared/System/DateTimeOffset.cs#L286-L292 + //From everything we find online(ISO8601, RFC3339, SQL Server doc, + //the offset supports the range -14 to 14 hours, and only supports minute precision. + _offsets.Add((short)(value.Offset.Ticks / TimeSpan.TicksPerMinute)); } public override void Commit() @@ -711,7 +719,7 @@ public override void Commit() Contracts.Assert(_offsets != null, "writer was already committed"); Contracts.Assert(Utils.Size(_offsets) == Utils.Size(_ticks)); - Writer.WriteLongStream(_offsets); // Write the offsets. + Writer.WriteShortStream(_offsets); // Write the offsets. Writer.WriteLongStream(_ticks); // Write the tick values. _offsets = null; _ticks = null; @@ -719,7 +727,7 @@ public override void Commit() public override long GetCommitLengthEstimate() { - return (long)_offsets.Count * (sizeof(Int64) + sizeof(Int64)); + return (long)_offsets.Count * (sizeof(short) + sizeof(Int64)); } } @@ -728,7 +736,7 @@ private sealed class Reader : ValueReaderBase private readonly DateTimeOffsetCodec _codec; private readonly int _entries; - private long[] _offsets; + private short[] _offsets; private long[] _ticks; private int _index; private bool _disposed; @@ -740,10 +748,10 @@ public Reader(DateTimeOffsetCodec codec, Stream stream, int items) _entries = items; _index = -1; - _offsets = _codec._longBufferPool.Get(); + _offsets = _codec._shortBufferPool.Get(); Utils.EnsureSize(ref _offsets, _entries, false); for (int i = 0; i < _entries; i++) - _offsets[i] = Reader.ReadInt64(); + _offsets[i] = Reader.ReadInt16(); _ticks = _codec._longBufferPool.Get(); Utils.EnsureSize(ref _ticks, _entries, false); @@ -760,14 +768,14 @@ public override void MoveNext() public override void Get(ref DateTimeOffset value) { Contracts.Assert(!_disposed); - value = new DateTimeOffset(new DateTime(_ticks[_index]), new TimeSpan(_offsets[_index])); + value = new DateTimeOffset(new DateTime(_ticks[_index]), new TimeSpan(_offsets[_index] * TimeSpan.TicksPerMinute)); } public override void Dispose() { if (!_disposed) { - _codec._longBufferPool.Return(_offsets); + _codec._shortBufferPool.Return(_offsets); _codec._longBufferPool.Return(_ticks); _offsets = null; _ticks = null; diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs index 8f7308b581..77b4170086 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs @@ -906,9 +906,9 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerOne(r1, r2, col, (x, y) => x.Ticks == y.Ticks); + return GetComparerOne(r1, r2, col, (x, y) => x == y); case DataKind.DT: - return GetComparerOne(r1, r2, col, (x, y) => x.Ticks == y.Ticks); + return GetComparerOne(r1, r2, col, (x, y) => x == y); case DataKind.DZ: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.UG: @@ -949,9 +949,9 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Ticks == y.Ticks); + return GetComparerVec(r1, r2, col, size, (x, y) => x == y); case DataKind.DT: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Ticks == y.Ticks); + return GetComparerVec(r1, r2, col, size, (x, y) => x == y); case DataKind.DZ: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.UG: From b7d4c6b4b6a5b035a6ec889c6735b2e45dc91d67 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 26 Aug 2018 21:36:55 -0700 Subject: [PATCH 05/15] PR feedback. --- src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index 044e5f5c31..768d72da2e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -709,8 +709,7 @@ public override void Write(ref DateTimeOffset value) //DateTimeOffset exposes its offset as a TimeSpan, but internally it uses short and in minutes. //https://github.com/dotnet/coreclr/blob/9499b08eefd895158c3f3c7834e185a73619128d/src/System.Private.CoreLib/shared/System/DateTimeOffset.cs#L51-L53 //https://github.com/dotnet/coreclr/blob/9499b08eefd895158c3f3c7834e185a73619128d/src/System.Private.CoreLib/shared/System/DateTimeOffset.cs#L286-L292 - //From everything we find online(ISO8601, RFC3339, SQL Server doc, - //the offset supports the range -14 to 14 hours, and only supports minute precision. + //From everything online(ISO8601, RFC3339, SQL Server doc, the offset supports the range -14 to 14 hours, and only supports minute precision. _offsets.Add((short)(value.Offset.Ticks / TimeSpan.TicksPerMinute)); } From dbaac06bba06b66a707bac4a4038abcb5fefc245 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 28 Aug 2018 01:46:33 -0700 Subject: [PATCH 06/15] IDV test. --- src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs | 4 +++- src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs | 7 +++++-- src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs | 6 +++++- .../SingleDebug/Command/Datatypes-datatypes.txt | 2 +- .../SingleRelease/Command/Datatypes-datatypes.txt | 2 +- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs index 93246bb903..dbddfad9b9 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/CodecFactory.cs @@ -64,8 +64,10 @@ public CodecFactory(IHostEnvironment env, MemoryStreamPool memPool = null) // Register the old boolean reading codec. var oldBool = new OldBoolCodec(this); RegisterOtherCodec(oldBool.LoadName, oldBool.GetCodec); - RegisterOtherCodec("VBuffer", GetVBufferCodec); + RegisterOtherCodec("DvDateTimeZone", new DateTimeOffsetCodec(this).GetCodec); + RegisterOtherCodec("DvDateTime", new DateTimeCodec(this).GetCodec); + RegisterOtherCodec("DvTimeSpan", new UnsafeTypeCodec(this).GetCodec); RegisterOtherCodec("Key", GetKeyCodec); } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index 768d72da2e..2e6282be46 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -655,7 +655,8 @@ public Reader(DateTimeCodec codec, Stream stream, int items) public override void MoveNext() { Contracts.Assert(_remaining > 0, "already consumed all values"); - _value = new DateTime(Reader.ReadInt64()); + var ticks = Reader.ReadInt64(); + _value = new DateTime(ticks == long.MinValue ? default : ticks); _remaining--; } @@ -767,7 +768,9 @@ public override void MoveNext() public override void Get(ref DateTimeOffset value) { Contracts.Assert(!_disposed); - value = new DateTimeOffset(new DateTime(_ticks[_index]), new TimeSpan(_offsets[_index] * TimeSpan.TicksPerMinute)); + var ticks = _ticks[_index]; + var offset = _offsets[_index]; + value = new DateTimeOffset(new DateTime(ticks == long.MinValue ? default : ticks), new TimeSpan(0, offset == short.MinValue ? default : offset, 0)); } public override void Dispose() diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs index 152eb2a65e..1da16662c9 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/UnsafeTypeOps.cs @@ -237,7 +237,11 @@ public override unsafe void Apply(TimeSpan[] array, Action func) } public override void Write(TimeSpan a, BinaryWriter writer) { writer.Write(a.Ticks); } - public override TimeSpan Read(BinaryReader reader) { return new TimeSpan(reader.ReadInt64()); } + public override TimeSpan Read(BinaryReader reader) + { + var ticks = reader.ReadInt64(); + return new TimeSpan(ticks == long.MinValue ? default : ticks); + } } private sealed class UgUnsafeTypeOps : UnsafeTypeOps diff --git a/test/BaselineOutput/SingleDebug/Command/Datatypes-datatypes.txt b/test/BaselineOutput/SingleDebug/Command/Datatypes-datatypes.txt index e7d128e400..e37863c293 100644 --- a/test/BaselineOutput/SingleDebug/Command/Datatypes-datatypes.txt +++ b/test/BaselineOutput/SingleDebug/Command/Datatypes-datatypes.txt @@ -16,4 +16,4 @@ bl i1 i2 i4 i8 ts dto dt tx 1 -127 -32767 -2147483647 -9223372036854775807 "7.00:00:00" "2008-11-30T00:00:00.0000000+00:00" "2013-08-05T00:00:00.0000000" xyz "7.00:00:00" "2008-11-30T00:00:00.0000000+00:00" "2013-08-05T00:00:00.0000000" 9 0:0 - + "00:00:00" "0001-01-01T00:00:00.0000000+00:00" "0001-01-01T00:00:00.0000000" diff --git a/test/BaselineOutput/SingleRelease/Command/Datatypes-datatypes.txt b/test/BaselineOutput/SingleRelease/Command/Datatypes-datatypes.txt index e7d128e400..e37863c293 100644 --- a/test/BaselineOutput/SingleRelease/Command/Datatypes-datatypes.txt +++ b/test/BaselineOutput/SingleRelease/Command/Datatypes-datatypes.txt @@ -16,4 +16,4 @@ bl i1 i2 i4 i8 ts dto dt tx 1 -127 -32767 -2147483647 -9223372036854775807 "7.00:00:00" "2008-11-30T00:00:00.0000000+00:00" "2013-08-05T00:00:00.0000000" xyz "7.00:00:00" "2008-11-30T00:00:00.0000000+00:00" "2013-08-05T00:00:00.0000000" 9 0:0 - + "00:00:00" "0001-01-01T00:00:00.0000000+00:00" "0001-01-01T00:00:00.0000000" From 53b4e57022dd93cdae6c31eb3c5720b585a9aa8c Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 28 Aug 2018 09:29:52 -0700 Subject: [PATCH 07/15] add test for parquet loader. --- .../SavePipe/TestParquetNull-Data.txt | 9 ++++++++ .../SavePipe/TestParquetNull-Schema.txt | 4 ++++ .../TestParquetPrimitiveDataTypes-Data.txt | 21 ++++++++++++++++++ .../TestParquetPrimitiveDataTypes-Schema.txt | 16 +++++++++++++ .../SavePipe/TestParquetNull-Data.txt | 9 ++++++++ .../SavePipe/TestParquetNull-Schema.txt | 4 ++++ .../TestParquetPrimitiveDataTypes-Data.txt | 21 ++++++++++++++++++ .../TestParquetPrimitiveDataTypes-Schema.txt | 16 +++++++++++++ .../DataPipe/TestDataPipe.cs | 20 +++++++++++++++++ .../Microsoft.ML.TestFramework.csproj | 1 + .../TestInitialization.cs | 1 + test/data/Parquet/alltypes.parquet | Bin 0 -> 1419 bytes test/data/Parquet/test-null.parquet | Bin 0 -> 349 bytes 13 files changed, 122 insertions(+) create mode 100644 test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Data.txt create mode 100644 test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Schema.txt create mode 100644 test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt create mode 100644 test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt create mode 100644 test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Data.txt create mode 100644 test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Schema.txt create mode 100644 test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt create mode 100644 test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt create mode 100644 test/data/Parquet/alltypes.parquet create mode 100644 test/data/Parquet/test-null.parquet diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Data.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Data.txt new file mode 100644 index 0000000000..c7049cd12a --- /dev/null +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Data.txt @@ -0,0 +1,9 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=foo:I4:0 +#@ col=bar:I4:1 +#@ } +foo bar +1 2 +1 diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Schema.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Schema.txt new file mode 100644 index 0000000000..8fa619c171 --- /dev/null +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetNull-Schema.txt @@ -0,0 +1,4 @@ +---- ParquetLoader ---- +2 columns: + foo: I4 + bar: I4 diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt new file mode 100644 index 0000000000..bbdd0c18ab --- /dev/null +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt @@ -0,0 +1,21 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=Id:I4:0 +#@ col=Timestamp:DZ:1 +#@ col=Message:TX:2 +#@ col=Data:TX:3 +#@ col=IsDeleted:BL:4 +#@ col=Amount:R4:5 +#@ col=TotalAmount:R8:6 +#@ col=Counter:I8:7 +#@ col=Amount2:R8:8 +#@ col=Flag:U1:9 +#@ col=Flag2:I1:10 +#@ col=Flag3:I2:11 +#@ col=Flag4:U2:12 +#@ col=Flag5:TS:13 +#@ } +Id Timestamp Message Data IsDeleted Amount TotalAmount Counter Amount2 Flag Flag2 Flag3 Flag4 Flag5 +1 "2000-01-01T01:01:01.0000000+00:00" Record1 SomeData3 0 125.4 400 300000 3331313 3 -3 -600 600 "3100.00:00:00.1000000" +1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt new file mode 100644 index 0000000000..213ef605e6 --- /dev/null +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt @@ -0,0 +1,16 @@ +---- ParquetLoader ---- +14 columns: + Id: I4 + Timestamp: DateTimeZone + Message: Text + Data: Text + IsDeleted: Bool + Amount: R4 + TotalAmount: R8 + Counter: I8 + Amount2: R8 + Flag: U1 + Flag2: I1 + Flag3: I2 + Flag4: U2 + Flag5: TimeSpan diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Data.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Data.txt new file mode 100644 index 0000000000..c7049cd12a --- /dev/null +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Data.txt @@ -0,0 +1,9 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=foo:I4:0 +#@ col=bar:I4:1 +#@ } +foo bar +1 2 +1 diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Schema.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Schema.txt new file mode 100644 index 0000000000..8fa619c171 --- /dev/null +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetNull-Schema.txt @@ -0,0 +1,4 @@ +---- ParquetLoader ---- +2 columns: + foo: I4 + bar: I4 diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt new file mode 100644 index 0000000000..bbdd0c18ab --- /dev/null +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt @@ -0,0 +1,21 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=Id:I4:0 +#@ col=Timestamp:DZ:1 +#@ col=Message:TX:2 +#@ col=Data:TX:3 +#@ col=IsDeleted:BL:4 +#@ col=Amount:R4:5 +#@ col=TotalAmount:R8:6 +#@ col=Counter:I8:7 +#@ col=Amount2:R8:8 +#@ col=Flag:U1:9 +#@ col=Flag2:I1:10 +#@ col=Flag3:I2:11 +#@ col=Flag4:U2:12 +#@ col=Flag5:TS:13 +#@ } +Id Timestamp Message Data IsDeleted Amount TotalAmount Counter Amount2 Flag Flag2 Flag3 Flag4 Flag5 +1 "2000-01-01T01:01:01.0000000+00:00" Record1 SomeData3 0 125.4 400 300000 3331313 3 -3 -600 600 "3100.00:00:00.1000000" +1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt new file mode 100644 index 0000000000..213ef605e6 --- /dev/null +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt @@ -0,0 +1,16 @@ +---- ParquetLoader ---- +14 columns: + Id: I4 + Timestamp: DateTimeZone + Message: Text + Data: Text + IsDeleted: Bool + Amount: R4 + TotalAmount: R8 + Counter: I8 + Amount2: R8 + Flag: U1 + Flag2: I1 + Flag3: I2 + Flag4: U2 + Flag5: TimeSpan diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index c598879795..ba0749318b 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -145,4 +145,24 @@ public void TestLdaTransformEmptyDocumentException() Assert.True(false, "The LDA transform does not throw expected error on empty documents."); } } + + public sealed partial class TestDataPipe : TestDataPipeBase + { + + [Fact] + public void TestParquetPrimitiveDataTypes() + { + string pathData = GetDataPath(@"..\data\Parquet", "alltypes.parquet"); + TestCore(pathData, false, new[] { "loader=Parquet{bigIntDates=+}" }); + Done(); + } + + [Fact] + public void TestParquetNull() + { + string pathData = GetDataPath(@"..\data\Parquet", "test-null.parquet"); + TestCore(pathData, false, new[] { "loader=Parquet{bigIntDates=+}" }, forceDense: true); + Done(); + } + } } diff --git a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj index a80b2df83e..454d1d7a31 100644 --- a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj +++ b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj @@ -9,6 +9,7 @@ + diff --git a/test/Microsoft.ML.TestFramework/TestInitialization.cs b/test/Microsoft.ML.TestFramework/TestInitialization.cs index ebe0eb0a79..50d54c9f21 100644 --- a/test/Microsoft.ML.TestFramework/TestInitialization.cs +++ b/test/Microsoft.ML.TestFramework/TestInitialization.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Xunit; using Xunit.Abstractions; namespace Microsoft.ML.Runtime.RunTests diff --git a/test/data/Parquet/alltypes.parquet b/test/data/Parquet/alltypes.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e47e6b6e5587d5fb458fee85eab343b29e55af16 GIT binary patch literal 1419 zcmaKsPe>GD6u_V3=kB_RDfNcg8ft@PDYI?cU16aPnpR4twn+yKnNg=QT>tG@2ZIRG zFfBre5W-WGMG^D|%#&G!9y*kjvlH4Omt8*)AF(6KWSfC-}Cur zcm3T`_p32+9Rwavp7olE9Ol?30$iEn-lmvV7DjBXJssa~OneHRzX|UK$|b7-`B-Ty z*U6Cj=FcOn?hpkzzv{>M(}Q!Vf&io5)+5$#ytvmkwIk5qnZh>Q`ZyAXzijDsmboo2 z7TfA|vV3ZA?6*dj(-N83Fztf%*N&H4NepxClTM_7)Y%{7%-XAI3kyz&ghnbY&4mHm zYSq6czwLte?twyP)>}>Nk*8iTxO3lm*%AYoZ(H$@BFn`Eqf1Lm51k4W{fs;?Muzh& zt}VtErLDDH)ppi5YfbbQuREp26chnD2?VaBo(7j5(kmNcYFtxdEr1ykCpZtQ@wjqQ zm1uDFm=!6SLNlzZCGjG)QPosRby4gnh&8u2X@HsNMG)3hHfu_w`A%}yar0&26v)jmk-hH-j`i2uhohqLAcGio%khFDB4phMOnl0M9(4XV&Kuk@Ms^ zT;S1hNUFVZwOU<6G1Y@xD~_bh=FVQ zHX?+FQ$>Wt8p~ZWoUME*VRE{dh%U>qI%fz$zKR$L( OkUtDF)b$Gb4F3ax_YjN# literal 0 HcmV?d00001 diff --git a/test/data/Parquet/test-null.parquet b/test/data/Parquet/test-null.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a4c8a943b312b4b0d238e5b4dac44accbe799e04 GIT binary patch literal 349 zcmZXQu};G<6h)oHR1p$P*s>)L9v}j#iDJbu8M?4BRvj1^knF~lQUwgA<;(apegHdk zh{WKl>-Wy{yYl*BBM?9g@g;)6EKtq}OaK6Xih~1}G7Xw#u^dm}t`M0I#6E#hag_7x zX&{M;Qo4GC-nj#U;c}CbZ0Fp`8SMdvl+k68vp}{vSkMVTDTeq3n<8KR%Tw%UYvO+7f>txfkjW$~&yGClcwS9L7enNzG_}E|F7&hPl literal 0 HcmV?d00001 From 48b4c08d84c5f4ecd56546ff6550794f66c2d42e Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 28 Aug 2018 09:56:18 -0700 Subject: [PATCH 08/15] Update parquet tests. --- .../SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt | 2 +- .../SavePipe/TestParquetPrimitiveDataTypes-Data.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt index bbdd0c18ab..bfa6a34ae1 100644 --- a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt @@ -18,4 +18,4 @@ #@ } Id Timestamp Message Data IsDeleted Amount TotalAmount Counter Amount2 Flag Flag2 Flag3 Flag4 Flag5 1 "2000-01-01T01:01:01.0000000+00:00" Record1 SomeData3 0 125.4 400 300000 3331313 3 -3 -600 600 "3100.00:00:00.1000000" -1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 +1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 "00:00:00" diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt index bbdd0c18ab..bfa6a34ae1 100644 --- a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt @@ -18,4 +18,4 @@ #@ } Id Timestamp Message Data IsDeleted Amount TotalAmount Counter Amount2 Flag Flag2 Flag3 Flag4 Flag5 1 "2000-01-01T01:01:01.0000000+00:00" Record1 SomeData3 0 125.4 400 300000 3331313 3 -3 -600 600 "3100.00:00:00.1000000" -1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 +1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 "00:00:00" From 7f55c59d0b8340683a991cc216eb6f2877d7455d Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 28 Aug 2018 12:59:54 -0700 Subject: [PATCH 09/15] fix build. --- test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index ba0749318b..314bff1b77 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -152,7 +152,7 @@ public sealed partial class TestDataPipe : TestDataPipeBase [Fact] public void TestParquetPrimitiveDataTypes() { - string pathData = GetDataPath(@"..\data\Parquet", "alltypes.parquet"); + string pathData = GetDataPath(@"Parquet", "alltypes.parquet"); TestCore(pathData, false, new[] { "loader=Parquet{bigIntDates=+}" }); Done(); } @@ -160,7 +160,7 @@ public void TestParquetPrimitiveDataTypes() [Fact] public void TestParquetNull() { - string pathData = GetDataPath(@"..\data\Parquet", "test-null.parquet"); + string pathData = GetDataPath(@"Parquet", "test-null.parquet"); TestCore(pathData, false, new[] { "loader=Parquet{bigIntDates=+}" }, forceDense: true); Done(); } From 71f87017c313ada0b90e5fdb8ce81a4afba0f8b5 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 28 Aug 2018 13:12:33 -0700 Subject: [PATCH 10/15] PR feedback. --- src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index 2e6282be46..09bfc5636b 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -711,7 +711,7 @@ public override void Write(ref DateTimeOffset value) //https://github.com/dotnet/coreclr/blob/9499b08eefd895158c3f3c7834e185a73619128d/src/System.Private.CoreLib/shared/System/DateTimeOffset.cs#L51-L53 //https://github.com/dotnet/coreclr/blob/9499b08eefd895158c3f3c7834e185a73619128d/src/System.Private.CoreLib/shared/System/DateTimeOffset.cs#L286-L292 //From everything online(ISO8601, RFC3339, SQL Server doc, the offset supports the range -14 to 14 hours, and only supports minute precision. - _offsets.Add((short)(value.Offset.Ticks / TimeSpan.TicksPerMinute)); + _offsets.Add((short)(value.Offset.TotalMinutes)); } public override void Commit() From 1ccc26349cc15beeabe73b9685b29866b1d79ed2 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 28 Aug 2018 21:44:35 -0700 Subject: [PATCH 11/15] clean up. --- test/Microsoft.ML.TestFramework/TestCommandBase.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index 943ef77b0b..d048da6d68 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2033,10 +2033,6 @@ public void CommandTrainingBinaryFieldAwareFactorizationMachineWithValidationAnd [Fact] public void DataTypes() { - //Skip for linux because DATE/TIME format is different. - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - return; - string idvPath = GetDataPath("datatypes.idv"); OutputPath textOutputPath = CreateOutputPath("datatypes.txt"); TestCore("savedata", idvPath, "loader=binary", "saver=text", textOutputPath.Arg("dout")); From 0a1ba0589e21f01dfbcdcc80a4adf0712097299f Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Thu, 30 Aug 2018 14:34:58 -0700 Subject: [PATCH 12/15] PR feedback. --- test/BaselineOutput/SingleDebug/Command/DataTypes-1-out.txt | 1 + test/BaselineOutput/SingleDebug/Command/DataTypes-2-out.txt | 1 + .../BaselineOutput/SingleRelease/Command/DataTypes-1-out.txt | 1 + .../BaselineOutput/SingleRelease/Command/DataTypes-2-out.txt | 1 + test/Microsoft.ML.TestFramework/TestCommandBase.cs | 5 +++++ 5 files changed, 9 insertions(+) create mode 100644 test/BaselineOutput/SingleDebug/Command/DataTypes-1-out.txt create mode 100644 test/BaselineOutput/SingleDebug/Command/DataTypes-2-out.txt create mode 100644 test/BaselineOutput/SingleRelease/Command/DataTypes-1-out.txt create mode 100644 test/BaselineOutput/SingleRelease/Command/DataTypes-2-out.txt diff --git a/test/BaselineOutput/SingleDebug/Command/DataTypes-1-out.txt b/test/BaselineOutput/SingleDebug/Command/DataTypes-1-out.txt new file mode 100644 index 0000000000..fe04f014c2 --- /dev/null +++ b/test/BaselineOutput/SingleDebug/Command/DataTypes-1-out.txt @@ -0,0 +1 @@ +Wrote 5 rows across 9 columns in %Time% diff --git a/test/BaselineOutput/SingleDebug/Command/DataTypes-2-out.txt b/test/BaselineOutput/SingleDebug/Command/DataTypes-2-out.txt new file mode 100644 index 0000000000..a2aaab4439 --- /dev/null +++ b/test/BaselineOutput/SingleDebug/Command/DataTypes-2-out.txt @@ -0,0 +1 @@ +Wrote 5 rows of length 9 diff --git a/test/BaselineOutput/SingleRelease/Command/DataTypes-1-out.txt b/test/BaselineOutput/SingleRelease/Command/DataTypes-1-out.txt new file mode 100644 index 0000000000..fe04f014c2 --- /dev/null +++ b/test/BaselineOutput/SingleRelease/Command/DataTypes-1-out.txt @@ -0,0 +1 @@ +Wrote 5 rows across 9 columns in %Time% diff --git a/test/BaselineOutput/SingleRelease/Command/DataTypes-2-out.txt b/test/BaselineOutput/SingleRelease/Command/DataTypes-2-out.txt new file mode 100644 index 0000000000..a2aaab4439 --- /dev/null +++ b/test/BaselineOutput/SingleRelease/Command/DataTypes-2-out.txt @@ -0,0 +1 @@ +Wrote 5 rows of length 9 diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index d048da6d68..a14677494b 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2034,8 +2034,13 @@ public void CommandTrainingBinaryFieldAwareFactorizationMachineWithValidationAnd public void DataTypes() { string idvPath = GetDataPath("datatypes.idv"); + OutputPath intermediateData = CreateOutputPath("intermediateDatatypes.idv"); OutputPath textOutputPath = CreateOutputPath("datatypes.txt"); TestCore("savedata", idvPath, "loader=binary", "saver=text", textOutputPath.Arg("dout")); + _step++; + TestCore("savedata", idvPath, "loader=binary", "saver=binary", intermediateData.ArgOnly("dout")); + _step++; + TestCore("savedata", intermediateData.Path, "loader=binary", "saver=text", textOutputPath.Arg("dout")); Done(); } } From 401fa6451f6827df3eeb7bb1f817758225aa1199 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Thu, 30 Aug 2018 14:42:46 -0700 Subject: [PATCH 13/15] rebuild. --- .../DataPipe/TestDataPipeBase.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs index d189e627f5..6166da4c49 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs @@ -1010,11 +1010,11 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x == y); case DataKind.DT: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x == y); case DataKind.DZ: - return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); + return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case DataKind.UG: return GetComparerOne(r1, r2, col, (x, y) => x.Equals(y)); case (DataKind)0: @@ -1056,11 +1056,11 @@ protected Func GetColumnComparer(IRow r1, IRow r2, int col, ColumnType typ case DataKind.Bool: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.TimeSpan: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x == y); case DataKind.DT: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x == y); case DataKind.DZ: - return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); + return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); case DataKind.UG: return GetComparerVec(r1, r2, col, size, (x, y) => x.Equals(y)); } From 7cc7d9a9383dc98ee712c4722a37607413b66cb8 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Mon, 3 Sep 2018 17:19:18 -0700 Subject: [PATCH 14/15] merge parquet test. --- src/Microsoft.ML.Parquet/ParquetLoader.cs | 11 +------- .../TestParquetPrimitiveDataTypes-Data.txt | 28 ++++++++----------- .../TestParquetPrimitiveDataTypes-Schema.txt | 24 ++++++---------- .../TestParquetPrimitiveDataTypes-Data.txt | 28 ++++++++----------- .../TestParquetPrimitiveDataTypes-Schema.txt | 24 ++++++---------- .../DataPipe/TestDataPipe.cs | 20 ------------- 6 files changed, 41 insertions(+), 94 deletions(-) diff --git a/src/Microsoft.ML.Parquet/ParquetLoader.cs b/src/Microsoft.ML.Parquet/ParquetLoader.cs index cba3b74803..e7d7df0c0d 100644 --- a/src/Microsoft.ML.Parquet/ParquetLoader.cs +++ b/src/Microsoft.ML.Parquet/ParquetLoader.cs @@ -733,16 +733,7 @@ public void Conv(ref BigInteger src, ref UInt128 dst) /// TimeSpan object. public void Conv(ref Interval src, ref TimeSpan dst) { - try - { - dst = TimeSpan.FromDays(src.Months * 30 + src.Days) + TimeSpan.FromMilliseconds(src.Millis); - } - catch (Exception ex) - { - // Handle TimeSpan OverflowException - _ch.Error("Cannot convert Inteval to TimeSpan. Exception : '{0}'", ex.Message); - dst = default; - } + dst = TimeSpan.FromDays(src.Months * 30 + src.Days) + TimeSpan.FromMilliseconds(src.Millis); } private string ConvertListToString(IList list) diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt index bfa6a34ae1..af1e19e1cc 100644 --- a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Data.txt @@ -1,21 +1,15 @@ #@ TextLoader{ #@ header+ #@ sep=tab -#@ col=Id:I4:0 -#@ col=Timestamp:DZ:1 -#@ col=Message:TX:2 -#@ col=Data:TX:3 -#@ col=IsDeleted:BL:4 -#@ col=Amount:R4:5 -#@ col=TotalAmount:R8:6 -#@ col=Counter:I8:7 -#@ col=Amount2:R8:8 -#@ col=Flag:U1:9 -#@ col=Flag2:I1:10 -#@ col=Flag3:I2:11 -#@ col=Flag4:U2:12 -#@ col=Flag5:TS:13 +#@ col=sbyte:I1:0 +#@ col=short:I2:1 +#@ col=int:I4:2 +#@ col=long:I8:3 +#@ col=bool:BL:4 +#@ col=DateTimeOffset:DZ:5 +#@ col=Interval:TS:6 +#@ col=string:TX:7 #@ } -Id Timestamp Message Data IsDeleted Amount TotalAmount Counter Amount2 Flag Flag2 Flag3 Flag4 Flag5 -1 "2000-01-01T01:01:01.0000000+00:00" Record1 SomeData3 0 125.4 400 300000 3331313 3 -3 -600 600 "3100.00:00:00.1000000" -1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 "00:00:00" +sbyte short int long bool DateTimeOffset Interval string + 1 "2018-09-01T19:53:18.2910000+00:00" "31.00:00:00.0010000" "" +127 32767 2147483647 9223372036854775807 0 "2018-09-01T19:53:18.3110000+00:00" "31.00:00:00.0010000" """""" diff --git a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt index 213ef605e6..505df9d958 100644 --- a/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt +++ b/test/BaselineOutput/SingleDebug/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt @@ -1,16 +1,10 @@ ---- ParquetLoader ---- -14 columns: - Id: I4 - Timestamp: DateTimeZone - Message: Text - Data: Text - IsDeleted: Bool - Amount: R4 - TotalAmount: R8 - Counter: I8 - Amount2: R8 - Flag: U1 - Flag2: I1 - Flag3: I2 - Flag4: U2 - Flag5: TimeSpan +8 columns: + sbyte: I1 + short: I2 + int: I4 + long: I8 + bool: Bool + DateTimeOffset: DateTimeZone + Interval: TimeSpan + string: Text diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt index bfa6a34ae1..af1e19e1cc 100644 --- a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Data.txt @@ -1,21 +1,15 @@ #@ TextLoader{ #@ header+ #@ sep=tab -#@ col=Id:I4:0 -#@ col=Timestamp:DZ:1 -#@ col=Message:TX:2 -#@ col=Data:TX:3 -#@ col=IsDeleted:BL:4 -#@ col=Amount:R4:5 -#@ col=TotalAmount:R8:6 -#@ col=Counter:I8:7 -#@ col=Amount2:R8:8 -#@ col=Flag:U1:9 -#@ col=Flag2:I1:10 -#@ col=Flag3:I2:11 -#@ col=Flag4:U2:12 -#@ col=Flag5:TS:13 +#@ col=sbyte:I1:0 +#@ col=short:I2:1 +#@ col=int:I4:2 +#@ col=long:I8:3 +#@ col=bool:BL:4 +#@ col=DateTimeOffset:DZ:5 +#@ col=Interval:TS:6 +#@ col=string:TX:7 #@ } -Id Timestamp Message Data IsDeleted Amount TotalAmount Counter Amount2 Flag Flag2 Flag3 Flag4 Flag5 -1 "2000-01-01T01:01:01.0000000+00:00" Record1 SomeData3 0 125.4 400 300000 3331313 3 -3 -600 600 "3100.00:00:00.1000000" -1 "2000-12-31T23:59:59.9990000+00:00" Record2 SomeData4 0 126.4 500 400000 4331313 4 -4 -700 700 "00:00:00" +sbyte short int long bool DateTimeOffset Interval string + 1 "2018-09-01T19:53:18.2910000+00:00" "31.00:00:00.0010000" "" +127 32767 2147483647 9223372036854775807 0 "2018-09-01T19:53:18.3110000+00:00" "31.00:00:00.0010000" """""" diff --git a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt index 213ef605e6..505df9d958 100644 --- a/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt +++ b/test/BaselineOutput/SingleRelease/SavePipe/TestParquetPrimitiveDataTypes-Schema.txt @@ -1,16 +1,10 @@ ---- ParquetLoader ---- -14 columns: - Id: I4 - Timestamp: DateTimeZone - Message: Text - Data: Text - IsDeleted: Bool - Amount: R4 - TotalAmount: R8 - Counter: I8 - Amount2: R8 - Flag: U1 - Flag2: I1 - Flag3: I2 - Flag4: U2 - Flag5: TimeSpan +8 columns: + sbyte: I1 + short: I2 + int: I4 + long: I8 + bool: Bool + DateTimeOffset: DateTimeZone + Interval: TimeSpan + string: Text diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index 4fa03376d2..3abff3e560 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -243,24 +243,4 @@ public void TestLdaTransformEmptyDocumentException() Assert.True(false, "The LDA transform does not throw expected error on empty documents."); } } - - public sealed partial class TestDataPipe : TestDataPipeBase - { - - [Fact] - public void TestParquetPrimitiveDataTypes() - { - string pathData = GetDataPath(@"Parquet", "alltypes.parquet"); - TestCore(pathData, false, new[] { "loader=Parquet{bigIntDates=+}" }); - Done(); - } - - [Fact] - public void TestParquetNull() - { - string pathData = GetDataPath(@"Parquet", "test-null.parquet"); - TestCore(pathData, false, new[] { "loader=Parquet{bigIntDates=+}" }, forceDense: true); - Done(); - } - } } From ea943d628548761c540f653b67a977b1a1fab6fe Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Fri, 7 Sep 2018 12:00:34 -0700 Subject: [PATCH 15/15] merge master. --- src/Microsoft.ML.Data/StaticPipe/StaticSchemaShape.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/StaticPipe/StaticSchemaShape.cs b/src/Microsoft.ML.Data/StaticPipe/StaticSchemaShape.cs index 39cc6cd316..9f8890f732 100644 --- a/src/Microsoft.ML.Data/StaticPipe/StaticSchemaShape.cs +++ b/src/Microsoft.ML.Data/StaticPipe/StaticSchemaShape.cs @@ -168,7 +168,7 @@ private static Type GetTypeOrNull(SchemaShape.Column col) pt == NumberType.I1 || pt == NumberType.I2 || pt == NumberType.I4 || pt == NumberType.I4 || pt == NumberType.U1 || pt == NumberType.U2 || pt == NumberType.U4 || pt == NumberType.U4 || pt == NumberType.R4 || pt == NumberType.R8 || pt == NumberType.UG || pt == BoolType.Instance || - pt == DateTimeType.Instance || pt == DateTimeZoneType.Instance || pt == TimeSpanType.Instance || + pt == DateTimeType.Instance || pt == DateTimeOffsetType.Instance || pt == TimeSpanType.Instance || pt == TextType.Instance)) { return (vecType ?? typeof(Scalar<>)).MakeGenericType(physType); @@ -312,7 +312,7 @@ private static Type GetTypeOrNull(IColumn col) pt == NumberType.I1 || pt == NumberType.I2 || pt == NumberType.I4 || pt == NumberType.I8 || pt == NumberType.U1 || pt == NumberType.U2 || pt == NumberType.U4 || pt == NumberType.U8 || pt == NumberType.R4 || pt == NumberType.R8 || pt == NumberType.UG || pt == BoolType.Instance || - pt == DateTimeType.Instance || pt == DateTimeZoneType.Instance || pt == TimeSpanType.Instance || + pt == DateTimeType.Instance || pt == DateTimeOffsetType.Instance || pt == TimeSpanType.Instance || pt == TextType.Instance)) { return (vecType ?? typeof(Scalar<>)).MakeGenericType(physType);