-
Notifications
You must be signed in to change notification settings - Fork 707
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REPL: Add toIterator (and related methods) #929
Changes from 24 commits
b768ce4
c2ed7b3
6e365d4
9121d4a
9848c61
5e2801d
b886748
5cb3605
906fd18
81f8088
4962ac0
d88284e
2624481
ccebbb0
37a6c09
9fc9a53
b5ade0e
52fc649
0f16fa2
e442240
64331ba
26d9b69
de55cb3
5aa2a12
326e9ad
ea6971b
0006f73
044e5ca
dcf97e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* | ||
Copyright 2014 Twitter, Inc. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package com.twitter.scalding.source | ||
|
||
import cascading.tuple.Fields | ||
import com.twitter.scalding._ | ||
import com.twitter.scalding.SequenceFile | ||
|
||
/** | ||
* SequenceFile with explicit types. Useful for debugging flows using the Typed API. | ||
* Not to be used for permanent storage: uses Kryo serialization which may not be | ||
* consistent across JVM instances. Use Thrift sources instead. | ||
*/ | ||
class TypedSequenceFile[T](path: String) extends SequenceFile(path, Fields.FIRST) with Mappable[T] with TypedSink[T] { | ||
override def converter[U >: T] = | ||
TupleConverter.asSuperConverter[T, U](TupleConverter.singleConverter[T]) | ||
override def setter[U <: T] = | ||
TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) | ||
} | ||
|
||
object TypedSequenceFile { | ||
def apply[T](path: String): TypedSequenceFile[T] = new TypedSequenceFile[T](path) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,54 +15,105 @@ | |
|
||
package com.twitter.scalding | ||
|
||
import cascading.flow.Flow | ||
import cascading.flow.FlowDef | ||
import cascading.pipe.Pipe | ||
import java.util.UUID | ||
import com.twitter.scalding.ReplImplicits._ | ||
|
||
import cascading.flow.FlowDef | ||
import cascading.tuple.Fields | ||
import com.twitter.scalding.typed._ | ||
import scala.collection.JavaConverters._ | ||
import com.twitter.scalding.source.TypedSequenceFile | ||
|
||
/** | ||
* Enrichment on TypedPipes allowing them to be run locally, independent of the overall flow. | ||
* @param pipe to wrap | ||
*/ | ||
class ShellTypedPipe[T](pipe: TypedPipe[T]) { | ||
import Dsl.flowDefToRichFlowDef | ||
import ReplImplicits._ | ||
|
||
/** | ||
* Shorthand for .write(dest).run | ||
*/ | ||
def save(dest: TypedSink[T] with Mappable[T]): TypedPipe[T] = { | ||
def save(dest: TypedSink[T] with Mappable[T])(implicit fd: FlowDef, md: Mode): TypedPipe[T] = { | ||
|
||
val p = pipe.toPipe(dest.sinkFields)(dest.setter) | ||
|
||
val localFlow = flowDef.onlyUpstreamFrom(p) | ||
dest.writeFrom(p)(localFlow, mode) | ||
run(localFlow) | ||
val localFlow = fd.onlyUpstreamFrom(p) | ||
dest.writeFrom(p)(localFlow, md) | ||
run(localFlow, md) | ||
|
||
TypedPipe.from(dest) | ||
TypedPipe.from(dest)(fd, md) | ||
} | ||
|
||
/** | ||
* Save snapshot of a typed pipe to a temporary sequence file. | ||
* @return A TypedPipe to a new Source, reading from the sequence file. | ||
*/ | ||
def snapshot: TypedPipe[T] = { | ||
|
||
// come up with unique temporary filename | ||
// TODO: refactor into TemporarySequenceFile class | ||
val tmpSeq = "/tmp/scalding-repl/snapshot-" + UUID.randomUUID() + ".seq" | ||
val dest = SequenceFile(tmpSeq, 'record) | ||
val p = pipe.toPipe('record) | ||
|
||
val localFlow = flowDef.onlyUpstreamFrom(p) | ||
dest.writeFrom(p)(localFlow, mode) | ||
run(localFlow) | ||
def snapshot(implicit fd: FlowDef, md: Mode): TypedPipe[T] = { | ||
val p = pipe.toPipe(0) | ||
val localFlow = fd.onlyUpstreamFrom(p) | ||
md match { | ||
case _: CascadingLocal => // Local or Test mode | ||
val dest = new MemorySink[T] | ||
dest.writeFrom(p)(localFlow, md) | ||
run(localFlow, md) | ||
TypedPipe.from(dest.readResults)(fd, md) | ||
case _: HadoopMode => | ||
// come up with unique temporary filename | ||
// TODO: refactor into TemporarySequenceFile class | ||
val tmpSeq = "/tmp/scalding-repl/snapshot-" + UUID.randomUUID + ".seq" | ||
val dest = TypedSequenceFile[T](tmpSeq) | ||
dest.writeFrom(p)(localFlow, md) | ||
run(localFlow, md) | ||
TypedPipe.from(dest)(fd, md) | ||
} | ||
} | ||
|
||
TypedPipe.fromSingleField[T](SequenceFile(tmpSeq)) | ||
/** | ||
* Create a (local) iterator over the pipe. For non-trivial pipes (anything except | ||
* a head-pipe reading from a source), a snapshot is automatically created and | ||
* iterated over. | ||
* @return local iterator | ||
*/ | ||
def toIterator(implicit fd: FlowDef, md: Mode): Iterator[T] = pipe match { | ||
// if this is just a Converter on a head pipe | ||
// (true for the first pipe on a source, e.g. a snapshot pipe) | ||
case TypedPipeInst(p, fields, Converter(conv)) if p.getPrevious.isEmpty => | ||
val srcs = fd.getSources | ||
if (srcs.containsKey(p.getName)) { | ||
val tap = srcs.get(p.getName) | ||
md.openForRead(tap).asScala.map(tup => conv(tup.selectEntry(fields))) | ||
} else { | ||
sys.error("Invalid head: pipe has no previous, but there is no registered source.") | ||
} | ||
// if it's already just a wrapped iterable (MemorySink), just return it | ||
case IterablePipe(iter, _, _) => iter.toIterator | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we add the EmptyPipe case? |
||
// otherwise, snapshot the pipe and get an iterator on that | ||
case _ => | ||
pipe.snapshot.toIterator | ||
} | ||
|
||
// TODO: add back `toList` based on `snapshot` this time | ||
/** | ||
* Create a list from the pipe in memory. Uses `ShellTypedPipe.toIterator`. | ||
* Warning: user must ensure that the results will actually fit in memory. | ||
*/ | ||
def toList(implicit fd: FlowDef, md: Mode): List[T] = toIterator.toList | ||
|
||
// TODO: add `dump` to view contents without reading into memory | ||
/** | ||
* Print the contents of a pipe to stdout. Uses `ShellTypedPipe.toIterator`. | ||
*/ | ||
def dump(implicit fd: FlowDef, md: Mode): Unit = toIterator.foreach(println(_)) | ||
|
||
} | ||
|
||
class ShellValuePipe[T](vp: ValuePipe[T]) { | ||
import ReplImplicits.typedPipeToShellTypedPipe | ||
def toOption(implicit fd: FlowDef, md: Mode): Option[T] = vp match { | ||
case _: EmptyValue => None | ||
case LiteralValue(v) => Some(v) | ||
case ComputedValue(tp) => tp.snapshot.toList match { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
is is a bit safer, in the case there is some bug. It will still error, but it won't blow up the memory if there are 2 or more items in the list. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually, why not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is an error to have more than one value in a ValuePipe, right? In the match, should I do a |
||
case Nil => None | ||
case v :: Nil => Some(v) | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why aren't these
implicit def flowDef: FlowDef = ...
.I like to minimize the vars. Can we just make the ones in ReplImplicit vars?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, definitely didn't intend to make them "var".