-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-32106][SQL]Implement SparkScriptTransformationExec in sql/core #29085
Changes from 1 commit
dfcec3c
e53744b
a693722
5bfa669
ec754e2
a2b12a1
c3dc66b
cb19b7b
ce8a0a5
d37ef86
fce25ff
f3e05c6
5c049b5
04684a8
6811721
fc96e1f
ed901af
a6f1e7d
e367c05
e74d04c
4ef4d76
22d223c
72b2155
a3628ac
e16c136
858f4e5
cfecc90
43d0f24
9e18fa8
9537d9b
5227441
670f21b
ce8184a
4615733
08d97c8
33923b6
f5ec656
7916d72
a769aa7
d93f7fa
be80c27
7f3cff8
03d3409
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,187 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.execution | ||
|
||
import java.io._ | ||
import java.nio.charset.StandardCharsets | ||
|
||
import scala.collection.JavaConverters._ | ||
import scala.util.control.NonFatal | ||
|
||
import org.apache.hadoop.conf.Configuration | ||
|
||
import org.apache.spark.TaskContext | ||
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} | ||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema | ||
import org.apache.spark.sql.types.DataType | ||
import org.apache.spark.util.{CircularBuffer, RedirectThread} | ||
|
||
/** | ||
* Transforms the input by forking and running the specified script. | ||
* | ||
* @param input the set of expression that should be passed to the script. | ||
* @param script the command that should be executed. | ||
* @param output the attributes that are produced by the script. | ||
*/ | ||
/**
 * Transforms the input by forking and running the specified script.
 *
 * @param input the set of expressions that should be passed to the script.
 * @param script the command that should be executed.
 * @param output the attributes that are produced by the script.
 * @param child the child plan whose rows are fed to the script's stdin.
 * @param ioschema the row-format properties of the TRANSFORM clause.
 */
case class SparkScriptTransformationExec(
    input: Seq[Expression],
    script: String,
    output: Seq[Attribute],
    child: SparkPlan,
    ioschema: SparkScriptIOSchema)
  extends BaseScriptTransformationExec {

  override def processIterator(inputIterator: Iterator[InternalRow], hadoopConf: Configuration)
    : Iterator[InternalRow] = {
    // Run the user script through bash so shell syntax (pipes, env vars, quoting) works.
    val cmd = List("/bin/bash", "-c", script)
    val builder = new ProcessBuilder(cmd.asJava)

    val proc = builder.start()
    val inputStream = proc.getInputStream
    val outputStream = proc.getOutputStream
    val errorStream = proc.getErrorStream

    // In order to avoid deadlocks, we need to consume the error output of the child process.
    // To avoid issues caused by large error output, we use a circular buffer to limit the amount
    // of error output that we retain. See SPARK-7862 for more discussion of the deadlock / hang
    // that motivates this.
    val stderrBuffer = new CircularBuffer(2048)
    new RedirectThread(
      errorStream,
      stderrBuffer,
      "Thread-ScriptTransformation-STDERR-Consumer").start()

    // Projects the child's output columns down to the expressions listed in the TRANSFORM clause.
    val outputProjection = new InterpretedProjection(input, child.output)

    // This new thread will consume the ScriptTransformation's input rows and write them to the
    // external process. That process's output will be read by this current thread.
    val writerThread = new ScriptTransformationWriterThread(
      inputIterator.map(outputProjection),
      input.map(_.dataType),
      ioschema,
      outputStream,
      proc,
      stderrBuffer,
      TaskContext.get(),
      hadoopConf
    )

    val reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))
    val outputIterator: Iterator[InternalRow] = new Iterator[InternalRow] {
      // The next unconsumed line of script output; null means "not fetched yet / exhausted".
      var curLine: String = null

      override def hasNext: Boolean = {
        try {
          if (curLine == null) {
            curLine = reader.readLine()
            if (curLine == null) {
              // Script closed its stdout: surface any writer/script failure before finishing.
              checkFailureAndPropagate(writerThread, null, proc, stderrBuffer)
              return false
            }
          }
          true
        } catch {
          case NonFatal(e) =>
            // If this exception is due to abrupt / unclean termination of `proc`,
            // then detect it and propagate a better exception message for end users
            checkFailureAndPropagate(writerThread, e, proc, stderrBuffer)

            throw e
        }
      }

      override def next(): InternalRow = {
        if (!hasNext) {
          throw new NoSuchElementException
        }
        val prevLine = curLine
        curLine = reader.readLine()
        if (!ioschema.schemaLess) {
          // Split on the configured field delimiter (TOK_TABLEROWFORMATFIELD).
          new GenericInternalRow(
            prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"))
              .map(CatalystTypeConverters.convertToCatalyst))
        } else {
          // Schema-less output is modeled as two columns: split at the first delimiter only.
          new GenericInternalRow(
            prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"), 2)
              .map(CatalystTypeConverters.convertToCatalyst))
        }
      }
    }

    writerThread.start()

    outputIterator
  }
}
|
||
/**
 * Writer thread that pipes the parent plan's projected rows into the external
 * script's stdin, delegating the row formatting to the serde-less base path.
 */
private class ScriptTransformationWriterThread(
    iter: Iterator[InternalRow],
    inputSchema: Seq[DataType],
    ioSchema: SparkScriptIOSchema,
    outputStream: OutputStream,
    proc: Process,
    stderrBuffer: CircularBuffer,
    taskContext: TaskContext,
    conf: Configuration)
  extends BaseScriptTransformationWriterThread(
    iter,
    inputSchema,
    ioSchema,
    outputStream,
    proc,
    stderrBuffer,
    taskContext,
    conf) {

  // Daemon thread: a hung external script must not keep the executor JVM alive.
  setDaemon(true)

  override def processRows(): Unit = processRowsWithoutSerde()
}
|
||
object SparkScriptIOSchema {

  /**
   * Builds a [[SparkScriptIOSchema]] by copying every field from the logical
   * plan's [[ScriptInputOutputSchema]] one-to-one.
   */
  def apply(input: ScriptInputOutputSchema): SparkScriptIOSchema = {
    SparkScriptIOSchema(
      inputRowFormat = input.inputRowFormat,
      outputRowFormat = input.outputRowFormat,
      inputSerdeClass = input.inputSerdeClass,
      outputSerdeClass = input.outputSerdeClass,
      inputSerdeProps = input.inputSerdeProps,
      outputSerdeProps = input.outputSerdeProps,
      recordReaderClass = input.recordReaderClass,
      recordWriterClass = input.recordWriterClass,
      schemaLess = input.schemaLess)
  }
}
|
||
/**
 * The wrapper class of Spark script transformation input and output schema properties.
 *
 * @param inputRowFormat  row-format (key, value) properties for rows written to the script.
 * @param outputRowFormat row-format (key, value) properties for rows read back from the script.
 * @param inputSerdeClass  optional SerDe class name for input; presumably unused by the
 *                         serde-less Spark path — TODO confirm against the planner guard.
 * @param outputSerdeClass optional SerDe class name for output (same caveat as above).
 * @param inputSerdeProps  properties passed to the input SerDe, if any.
 * @param outputSerdeProps properties passed to the output SerDe, if any.
 * @param recordReaderClass optional record reader class name.
 * @param recordWriterClass optional record writer class name.
 * @param schemaLess whether the TRANSFORM clause omitted an output schema (output is then
 *                   split into at most two fields; see SparkScriptTransformationExec).
 */
case class SparkScriptIOSchema (
    inputRowFormat: Seq[(String, String)],
    outputRowFormat: Seq[(String, String)],
    inputSerdeClass: Option[String],
    outputSerdeClass: Option[String],
    inputSerdeProps: Seq[(String, String)],
    outputSerdeProps: Seq[(String, String)],
    recordReaderClass: Option[String],
    recordWriterClass: Option[String],
    schemaLess: Boolean) extends BaseScriptTransformIOSchema
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -532,6 +532,21 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { | |
} | ||
} | ||
|
||
/**
 * Plans `ScriptTransformation` logical nodes with Spark's own (serde-less)
 * script transformation operator. Nodes that request a SerDe class are left
 * for another strategy to plan.
 */
object SparkScripts extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case logical.ScriptTransformation(input, script, output, child, ioschema)
        if ioschema.inputSerdeClass.isEmpty && ioschema.outputSerdeClass.isEmpty =>
      val exec = SparkScriptTransformationExec(
        input, script, output, planLater(child), SparkScriptIOSchema(ioschema))
      Seq(exec)
    case _ =>
      Nil
  }
}
|
||
/** | ||
* This strategy is just for explaining `Dataset/DataFrame` created by `spark.readStream`. | ||
* It won't affect the execution, because `StreamingRelation` will be replaced with | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems like the implementation of
processIterator
is pretty similar to the Hive one. Could we share the code between them more?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea, working on this