-
Notifications
You must be signed in to change notification settings - Fork 28.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-29486][SQL] CalendarInterval should have 3 fields: months, days and microseconds #26134
Changes from 21 commits
44d66d6
4979e1e
f09b529
0db5b7a
ce879c2
4ee8354
3d954d6
05042e8
4c31401
064be74
211543f
0778f9a
3a6518a
4d8383c
1ac157e
aac92bd
076ce42
3d62a24
2edd8a0
4e97bc0
0e87e2d
2f90189
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
package org.apache.spark.unsafe.types; | ||
|
||
import java.io.Serializable; | ||
import java.util.Objects; | ||
|
||
/** | ||
* The internal representation of interval type. | ||
|
@@ -31,45 +32,50 @@ public final class CalendarInterval implements Serializable { | |
public static final long MICROS_PER_WEEK = MICROS_PER_DAY * 7; | ||
|
||
public final int months; | ||
public final int days; | ||
public final long microseconds; | ||
|
||
public long milliseconds() { | ||
return this.microseconds / MICROS_PER_MILLI; | ||
} | ||
|
||
public CalendarInterval(int months, long microseconds) { | ||
public CalendarInterval(int months, int days, long microseconds) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @LinhongLiu Could you send out a follow-up PR to document why we need There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Sure, will do. |
||
this.months = months; | ||
this.days = days; | ||
this.microseconds = microseconds; | ||
} | ||
|
||
public CalendarInterval add(CalendarInterval that) { | ||
int months = this.months + that.months; | ||
int days = this.days + that.days; | ||
long microseconds = this.microseconds + that.microseconds; | ||
return new CalendarInterval(months, microseconds); | ||
return new CalendarInterval(months, days, microseconds); | ||
} | ||
|
||
public CalendarInterval subtract(CalendarInterval that) { | ||
int months = this.months - that.months; | ||
int days = this.days - that.days; | ||
long microseconds = this.microseconds - that.microseconds; | ||
return new CalendarInterval(months, microseconds); | ||
return new CalendarInterval(months, days, microseconds); | ||
} | ||
|
||
public CalendarInterval negate() { | ||
return new CalendarInterval(-this.months, -this.microseconds); | ||
return new CalendarInterval(-this.months, -this.days, -this.microseconds); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object other) { | ||
if (this == other) return true; | ||
if (other == null || !(other instanceof CalendarInterval)) return false; | ||
|
||
CalendarInterval o = (CalendarInterval) other; | ||
return this.months == o.months && this.microseconds == o.microseconds; | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
CalendarInterval that = (CalendarInterval) o; | ||
return months == that.months && | ||
days == that.days && | ||
microseconds == that.microseconds; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return 31 * months + (int) microseconds; | ||
return Objects.hash(months, days, microseconds); | ||
} | ||
|
||
@Override | ||
|
@@ -81,12 +87,13 @@ public String toString() { | |
appendUnit(sb, months % 12, "month"); | ||
} | ||
|
||
if (days != 0) { | ||
appendUnit(sb, days / 7, "week"); | ||
appendUnit(sb, days % 7, "day"); | ||
} | ||
|
||
if (microseconds != 0) { | ||
long rest = microseconds; | ||
appendUnit(sb, rest / MICROS_PER_WEEK, "week"); | ||
rest %= MICROS_PER_WEEK; | ||
appendUnit(sb, rest / MICROS_PER_DAY, "day"); | ||
rest %= MICROS_PER_DAY; | ||
appendUnit(sb, rest / MICROS_PER_HOUR, "hour"); | ||
rest %= MICROS_PER_HOUR; | ||
appendUnit(sb, rest / MICROS_PER_MINUTE, "minute"); | ||
|
@@ -96,7 +103,7 @@ public String toString() { | |
appendUnit(sb, rest / MICROS_PER_MILLI, "millisecond"); | ||
rest %= MICROS_PER_MILLI; | ||
appendUnit(sb, rest, "microsecond"); | ||
} else if (months == 0) { | ||
} else if (months == 0 && days == 0) { | ||
sb.append(" 0 microseconds"); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe | |
import org.apache.spark.sql.catalyst.expressions.codegen.Block._ | ||
import org.apache.spark.sql.catalyst.util.IntervalUtils | ||
import org.apache.spark.sql.types._ | ||
import org.apache.spark.unsafe.types.CalendarInterval | ||
|
||
case class TimeWindow( | ||
timeColumn: Expression, | ||
|
@@ -107,7 +108,7 @@ object TimeWindow { | |
throw new IllegalArgumentException( | ||
s"Intervals greater than a month is not supported ($interval).") | ||
} | ||
cal.microseconds | ||
cal.days * CalendarInterval.MICROS_PER_DAY + cal.microseconds | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @cloud-fan Here we still use a "1 day = 24 hours" assumption. Is this OK? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here we use the interval as a duration, and it's safe to use |
||
} | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2610,25 +2610,33 @@ object Sequence { | |
override val defaultStep: DefaultStep = new DefaultStep( | ||
(dt.ordering.lteq _).asInstanceOf[LessThanOrEqualFn], | ||
CalendarIntervalType, | ||
new CalendarInterval(0, MICROS_PER_DAY)) | ||
new CalendarInterval(0, 1, 0)) | ||
|
||
private val backedSequenceImpl = new IntegralSequenceImpl[T](dt) | ||
private val microsPerMonth = 28 * CalendarInterval.MICROS_PER_DAY | ||
private val microsPerDay = 24 * CalendarInterval.MICROS_PER_HOUR | ||
// We choose a minimum days(28) in one month to calculate the `intervalStepInMicros` | ||
// in order to make sure the estimated array length is long enough | ||
private val microsPerMonth = 28 * microsPerDay | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not related to this PR, but it is interesting: why 28 days per month here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code first uses microsPerMonth to estimate the array length. Using the minimum number of days in a month makes sure the array is long enough (fewer days means a smaller step in microseconds, which means a longer estimated array length). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not related to this PR, but it would be great if we could add a comment in the code to explain it. |
||
|
||
override def eval(input1: Any, input2: Any, input3: Any): Array[T] = { | ||
val start = input1.asInstanceOf[T] | ||
val stop = input2.asInstanceOf[T] | ||
val step = input3.asInstanceOf[CalendarInterval] | ||
val stepMonths = step.months | ||
val stepDays = step.days | ||
val stepMicros = step.microseconds | ||
|
||
if (stepMonths == 0) { | ||
backedSequenceImpl.eval(start, stop, fromLong(stepMicros / scale)) | ||
if (stepMonths == 0 && stepMicros == 0 && scale == MICROS_PER_DAY) { | ||
backedSequenceImpl.eval(start, stop, fromLong(stepDays)) | ||
|
||
} else if (stepMonths == 0 && stepDays == 0 && scale == 1) { | ||
backedSequenceImpl.eval(start, stop, fromLong(stepMicros)) | ||
|
||
} else { | ||
// To estimate the resulted array length we need to make assumptions | ||
// about a month length in microseconds | ||
val intervalStepInMicros = stepMicros + stepMonths * microsPerMonth | ||
// about a month length in days and a day length in microseconds | ||
val intervalStepInMicros = | ||
stepMicros + stepMonths * microsPerMonth + stepDays * microsPerDay | ||
val startMicros: Long = num.toLong(start) * scale | ||
val stopMicros: Long = num.toLong(stop) * scale | ||
val maxEstimatedArrayLength = | ||
|
@@ -2643,7 +2651,8 @@ object Sequence { | |
while (t < exclusiveItem ^ stepSign < 0) { | ||
arr(i) = fromLong(t / scale) | ||
i += 1 | ||
t = timestampAddInterval(startMicros, i * stepMonths, i * stepMicros, zoneId) | ||
t = timestampAddInterval( | ||
startMicros, i * stepMonths, i * stepDays, i * stepMicros, zoneId) | ||
} | ||
|
||
// truncate array to the correct length | ||
|
@@ -2659,6 +2668,7 @@ object Sequence { | |
arr: String, | ||
elemType: String): String = { | ||
val stepMonths = ctx.freshName("stepMonths") | ||
val stepDays = ctx.freshName("stepDays") | ||
val stepMicros = ctx.freshName("stepMicros") | ||
val stepScaled = ctx.freshName("stepScaled") | ||
val intervalInMicros = ctx.freshName("intervalInMicros") | ||
|
@@ -2673,18 +2683,21 @@ object Sequence { | |
|
||
val sequenceLengthCode = | ||
s""" | ||
|final long $intervalInMicros = $stepMicros + $stepMonths * ${microsPerMonth}L; | ||
|final long $intervalInMicros = | ||
| $stepMicros + $stepMonths * ${microsPerMonth}L + $stepDays * ${microsPerDay}L; | ||
|${genSequenceLengthCode(ctx, startMicros, stopMicros, intervalInMicros, arrLength)} | ||
""".stripMargin | ||
|
||
s""" | ||
|final int $stepMonths = $step.months; | ||
|final int $stepDays = $step.days; | ||
|final long $stepMicros = $step.microseconds; | ||
| | ||
|if ($stepMonths == 0) { | ||
| final $elemType $stepScaled = ($elemType) ($stepMicros / ${scale}L); | ||
| ${backedSequenceImpl.genCode(ctx, start, stop, stepScaled, arr, elemType)}; | ||
|if ($stepMonths == 0 && $stepMicros == 0 && ${scale}L == ${MICROS_PER_DAY}L) { | ||
| ${backedSequenceImpl.genCode(ctx, start, stop, stepDays, arr, elemType)}; | ||
| | ||
|} else if ($stepMonths == 0 && $stepDays == 0 && ${scale}L == 1) { | ||
| ${backedSequenceImpl.genCode(ctx, start, stop, stepMicros, arr, elemType)}; | ||
|} else { | ||
| final long $startMicros = $start * ${scale}L; | ||
| final long $stopMicros = $stop * ${scale}L; | ||
|
@@ -2702,7 +2715,7 @@ object Sequence { | |
| $arr[$i] = ($elemType) ($t / ${scale}L); | ||
| $i += 1; | ||
| $t = org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampAddInterval( | ||
| $startMicros, $i * $stepMonths, $i * $stepMicros, $zid); | ||
| $startMicros, $i * $stepMonths, $i * $stepDays, $i * $stepMicros, $zid); | ||
| } | ||
| | ||
| if ($arr.length > $i) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The method will return different values because days are now excluded from microseconds.