diff --git a/src/main/java/com/epam/parso/impl/SasDateFormat.java b/src/main/java/com/epam/parso/impl/SasDateFormat.java new file mode 100644 index 0000000..27577c2 --- /dev/null +++ b/src/main/java/com/epam/parso/impl/SasDateFormat.java @@ -0,0 +1,101 @@ +package com.epam.parso.impl; + +import static com.epam.parso.impl.SasFileConstants.SECONDS_IN_DAY; + +/** + * SAS supports wide family of date formats. + * It is reasonable to keep all SAS date related features separately. + * See more about SAS dates: + * - https://v8doc.sas.com/sashtml/lrcon/zenid-63.htm + * - https://v8doc.sas.com/sashtml/lgref/z0197923.htm + * - https://v8doc.sas.com/sashtml/ets/chap2/sect7.htm + */ +final class SasDateFormat { + /** + * Private constructor for utility class. + */ + private SasDateFormat() { + } + + /** + * First time when a leap day is removed from the SAS calendar. + * In seconds since 1960-01-01 + */ + private static final double SAS_SECONDS_29FEB4000 = 64381305600D; + + /** + * Second time when a leap day is removed from the SAS calendar. + * In seconds since 1960-01-01 + */ + private static final double SAS_SECONDS_29FEB8000 = 190609027200D; + + /** + * SAS removes leap day every 4000 year. + * It removes these days: + * - 29FEB4000 + * - 29FEB8000 + * This guy proposed such approach many years ago: https://en.wikipedia.org/wiki/John_Herschel + *
+ * The peculiarities of the SAS calendar have been discussed publicly: + * - https://blogs.sas.com/content/sasdummy/2010/04/05/in-the-year-9999/ + * - https://communities.sas.com/t5/SAS-Programming/Leap-Years-divisible-by-4000/td-p/663467 + *
+ * See the SAS program and its output: + * ```shell + * data test; + * dtime = '28FEB4000:00:00:00'dt; + * put dtime; *out: 64381219200 + *
+ * dtime = '29FEB4000:00:00:00'dt; + * put dtime; *err: ERROR: Invalid date/time/datetime constant '29FEB4000:00:00:00'dt. + *
+ * dtime = '01MAR4000:00:00:00'dt; + * put dtime; *out: 64381305600 + *
+ * dtime = '31DEC4000:00:00:00'dt; + * put dtime; *out: 64407657600 + *
+ * dtime = '28FEB8000:00:00:00'dt; + * put dtime; *out: 190608940800 + *
+ * dtime = '29FEB8000:00:00:00'dt; + * put dtime; *err: ERROR: Invalid date/time/datetime constant '29FEB8000:00:00:00'dt. + *
+ * dtime = '01MAR8000:00:00:00'dt; + * put dtime; * out: 190609027200 + *
+ * dtime = '31DEC8000:00:00:00'dt; + * put dtime; *out: 190635379200 + *
+ * dtime = '31DEC9999:00:00:00'dt; + * put dtime; *out: 253717660800 + * run; + * ``` + * As you can see, SAS does not accept a leap day in the years 4000 and 8000 + * and removes those days from the SAS calendar entirely. + *
+ * At the same time, these leap days are valid in other environments: + * - Java: `LocalDateTime.of(4000, 2, 29, 0, 0).toEpochSecond(ZoneOffset.UTC)` + * outputs 64065686400 + * - JavaScript: `Date.parse('4000-02-29')` + * outputs 64065686400000 + * - GNU date: `date --utc --date '4000-02-29' +%s` + * outputs 64065686400 + * and so on. + *
+ * So, in order to parse SAS dates correctly, + * we need to restore removed leap days + * + * @param sasSeconds SAS date representation in seconds since 1960-01-01 + * @return seconds with restored leap days + */ + public static double sasLeapDaysFix(double sasSeconds) { + if (sasSeconds >= SAS_SECONDS_29FEB4000) { + if (sasSeconds >= SAS_SECONDS_29FEB8000) { + sasSeconds += SECONDS_IN_DAY; //restore Y8K leap day + } + sasSeconds += SECONDS_IN_DAY; //restore Y4K leap day + } + return sasSeconds; + } +} diff --git a/src/main/java/com/epam/parso/impl/SasFileConstants.java b/src/main/java/com/epam/parso/impl/SasFileConstants.java index 8261616..7f328e0 100644 --- a/src/main/java/com/epam/parso/impl/SasFileConstants.java +++ b/src/main/java/com/epam/parso/impl/SasFileConstants.java @@ -1077,12 +1077,16 @@ public interface SasFileConstants { */ int START_DATES_DAYS_DIFFERENCE = DAYS_IN_YEAR * 10 + 3; + /** + * The number of seconds in a day. + */ + int SECONDS_IN_DAY = SECONDS_IN_MINUTE * MINUTES_IN_HOUR * HOURS_IN_DAY; + /** * The difference in seconds between 01/01/1960 (the dates starting point in SAS) and 01/01/1970 (the dates starting * point in Java). */ - int START_DATES_SECONDS_DIFFERENCE = SECONDS_IN_MINUTE * MINUTES_IN_HOUR * HOURS_IN_DAY - * START_DATES_DAYS_DIFFERENCE; + int START_DATES_SECONDS_DIFFERENCE = SECONDS_IN_DAY * START_DATES_DAYS_DIFFERENCE; /** * The offset to the pointer for the bitwise representation of deleted records in MIX pages in x64. diff --git a/src/main/java/com/epam/parso/impl/SasFileParser.java b/src/main/java/com/epam/parso/impl/SasFileParser.java index 62f81db..157dcf8 100644 --- a/src/main/java/com/epam/parso/impl/SasFileParser.java +++ b/src/main/java/com/epam/parso/impl/SasFileParser.java @@ -1069,8 +1069,12 @@ private String bytesToString(byte[] bytes, int offset, int length) */ private Date bytesToDateTime(byte[] bytes) { double doubleSeconds = bytesToDouble(bytes); - return Double.isNaN(doubleSeconds) ? 
null : new Date((long) ((doubleSeconds - START_DATES_SECONDS_DIFFERENCE) - * MILLISECONDS_IN_SECONDS)); + if (Double.isNaN(doubleSeconds)) { + return null; + } else { + double seconds = SasDateFormat.sasLeapDaysFix(doubleSeconds) - START_DATES_SECONDS_DIFFERENCE; + return new Date((long) (seconds * MILLISECONDS_IN_SECONDS)); + } } /** @@ -1083,8 +1087,12 @@ private Date bytesToDateTime(byte[] bytes) { */ private Date bytesToDate(byte[] bytes) { double doubleDays = bytesToDouble(bytes); - return Double.isNaN(doubleDays) ? null : new Date((long) ((doubleDays - START_DATES_DAYS_DIFFERENCE) - * SECONDS_IN_MINUTE * MINUTES_IN_HOUR * HOURS_IN_DAY * MILLISECONDS_IN_SECONDS)); + if (Double.isNaN(doubleDays)) { + return null; + } else { + double seconds = SasDateFormat.sasLeapDaysFix(doubleDays * SECONDS_IN_DAY) - START_DATES_SECONDS_DIFFERENCE; + return new Date((long) (seconds * MILLISECONDS_IN_SECONDS)); + } } /** diff --git a/src/test/java/com/epam/parso/BugsTest.java b/src/test/java/com/epam/parso/BugsTest.java index 8022178..4af49b6 100644 --- a/src/test/java/com/epam/parso/BugsTest.java +++ b/src/test/java/com/epam/parso/BugsTest.java @@ -25,13 +25,15 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Paths; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Date; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; public class BugsTest { - @Test public void testOOM() throws Exception { try (InputStream is = this.getClass().getResourceAsStream("/bugs/mixed_data_one.sas7bdat.oom")) { @@ -216,4 +218,31 @@ public void testInfinityLoopUnbufferedIssue58() throws Exception { assertThat(sasFileReader.getSasFileProperties().getRowCount()).isEqualTo(0); } } + + /** + * Converts year, month and day to UTC Date. 
+ */ + private static Date dateOf(int year, int month, int day) { + return Date.from(LocalDateTime.of(year, month, day, 0, 0).toInstant(ZoneOffset.UTC)); + } + + @Test + public void testLeapDayFixIssue81() throws Exception { + try (InputStream is = this.getClass().getResourceAsStream("/bugs/81-dates.sas7bdat")) { + SasFileReader sasFileReader = new SasFileReaderImpl(is); + + Object[][] result = sasFileReader.readAll(); + assertThat(result.length).isEqualTo(10); + assertThat(result[0][1]).isEqualTo(dateOf(9999, 12, 31)); + assertThat(result[1][1]).isEqualTo(dateOf(2049, 12, 31)); + assertThat(result[2][1]).isEqualTo(dateOf(2099, 12, 31)); + assertThat(result[3][1]).isEqualTo(dateOf(4000, 2, 28)); + assertThat(result[4][1]).isEqualTo(dateOf(4000, 3, 1)); + assertThat(result[5][1]).isEqualTo(dateOf(4000, 12, 31)); + assertThat(result[6][1]).isEqualTo(dateOf(8000, 2, 28)); + assertThat(result[7][1]).isEqualTo(dateOf(8000, 3, 1)); + assertThat(result[8][1]).isEqualTo(dateOf(8000, 12, 31)); + assertThat(result[9][1]).isEqualTo(dateOf(8001, 2, 21)); + } + } } diff --git a/src/test/resources/bugs/81-dates.sas7bdat b/src/test/resources/bugs/81-dates.sas7bdat new file mode 100644 index 0000000..86f53bf Binary files /dev/null and b/src/test/resources/bugs/81-dates.sas7bdat differ diff --git a/src/test/resources/csv/dates_leap_days.csv b/src/test/resources/csv/dates_leap_days.csv new file mode 100644 index 0000000..fbf8af3 --- /dev/null +++ b/src/test/resources/csv/dates_leap_days.csv @@ -0,0 +1,15 @@ +d, dt +28Feb2000,28Feb2000:00:00:00.00 +29Feb2000,29Feb2000:00:00:00.00 +01Mar2000,01Mar2000:00:00:00.00 +31Dec2000,31Dec2000:00:00:00.00 +28Feb4000,28Feb4000:00:00:00.00 +01Mar4000,01Mar4000:00:00:00.00 +31Dec4000,31Dec4000:00:00:00.00 +28Feb6000,28Feb6000:00:00:00.00 +29Feb6000,29Feb6000:00:00:00.00 +01Mar6000,01Mar6000:00:00:00.00 +31Dec6000,31Dec6000:00:00:00.00 +28Feb8000,28Feb8000:00:00:00.00 +01Mar8000,01Mar8000:00:00:00.00 +31Dec8000,31Dec8000:00:00:00.00 diff --git 
a/src/test/resources/csv/dates_leap_days_meta.csv b/src/test/resources/csv/dates_leap_days_meta.csv new file mode 100644 index 0000000..9e18b06 --- /dev/null +++ b/src/test/resources/csv/dates_leap_days_meta.csv @@ -0,0 +1,23 @@ +Number,Name,Type,Data length,Format,Label +1,d,Numeric,8,DATE9., +2,dt,Numeric,8,DATETIME20., +Bitness: x64 +Compressed: null +Endianness: LITTLE_ENDIANNESS +Encoding: ISO-8859-1 +Name: DATES_LEAP_DAYS +File type: DATA +File label: Leap days dataset +Date created: Fri Jan 01 13:53:59 MSK 2021 +Date modified: Fri Jan 01 13:53:59 MSK 2021 +SAS release: 9.0401M5 +SAS server type: Linux +OS name: x86_64 +OS type: 3.10.0-1160.2.1. +Header Length: 4096 +Page Length: 4096 +Page Count: 1 +Row Length: 16 +Row Count: 14 +Mix Page Row Count: 124 +Columns Count: 2 diff --git a/src/test/resources/sas7bdat/dates_leap_days.sas b/src/test/resources/sas7bdat/dates_leap_days.sas new file mode 100644 index 0000000..a3cbb33 --- /dev/null +++ b/src/test/resources/sas7bdat/dates_leap_days.sas @@ -0,0 +1,57 @@ +/* + SAS program to generate sas7bdat file with two types of columns: date and datetime. + Both columns contain data around leap days. + Years 4000 and 8000 don't have leap days in terms of SAS. + Years 2000 and 6000 have it. + All of them necessary for unit tests. 
+*/ + +options bufsize=4096 pagesize=15; + +data dev.dates_leap_days(label='Leap days dataset'); + format d date9.; + format dt datetime20.; + + d='28FEB2000'd; + dt='28FEB2000:00:00:00'dt; + output; + d='29FEB2000'd; + dt='29FEB2000:00:00:00'dt; + output; + d='01MAR2000'd; + dt='01MAR2000:00:00:00'dt; + output; + d='31DEC2000'd; + dt='31DEC2000:00:00:00'dt; + output; + d='28FEB4000'd; + dt='28FEB4000:00:00:00'dt; + output; + d='01MAR4000'd; + dt='01MAR4000:00:00:00'dt; + output; + d='31DEC4000'd; + dt='31DEC4000:00:00:00'dt; + output; + d='28FEB6000'd; + dt='28FEB6000:00:00:00'dt; + output; + d='29FEB6000'd; + dt='29FEB6000:00:00:00'dt; + output; + d='01MAR6000'd; + dt='01MAR6000:00:00:00'dt; + output; + d='31DEC6000'd; + dt='31DEC6000:00:00:00'dt; + output; + d='28FEB8000'd; + dt='28FEB8000:00:00:00'dt; + output; + d='01MAR8000'd; + dt='01MAR8000:00:00:00'dt; + output; + d='31DEC8000'd; + dt='31DEC8000:00:00:00'dt; + output; +run; diff --git a/src/test/resources/sas7bdat/dates_leap_days.sas7bdat b/src/test/resources/sas7bdat/dates_leap_days.sas7bdat new file mode 100644 index 0000000..bd701ba Binary files /dev/null and b/src/test/resources/sas7bdat/dates_leap_days.sas7bdat differ