Skip to content

Commit

Permalink
Check the size reported by the tap first, then fall back to calculating it ourselves
Browse files Browse the repository at this point in the history
  • Loading branch information
dieu committed Feb 22, 2017
1 parent 6f11ced commit 5e41f89
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import com.twitter.scalding.commons.datastores.VersionedStore;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
Expand Down Expand Up @@ -90,6 +91,15 @@ public String getSinkPath(JobConf conf) {
}
}

/**
 * Reports the size of this tap's source as the length given by the file
 * system's content summary for the source path.
 *
 * @param conf job configuration used to resolve the source path and the file system
 * @return the content-summary length of the source path, in bytes
 * @throws IOException propagated from the file system lookup or the content-summary call
 */
@Override
public long getSize(JobConf conf) throws IOException {
  // Resolve the path before the file system, matching the original evaluation order.
  final Path sourcePath = new Path(getSourcePath(conf));
  final FileSystem fs = getFileSystem(conf);
  return fs.getContentSummary(sourcePath).getLength();
}

@Override
public void sourceConfInit(FlowProcess<JobConf> process, JobConf conf) {
super.sourceConfInit(process, conf);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,15 @@ object Common {
* pattern in its path, so we must be ready to handle that case.
*/
def size(f: Hfs, conf: JobConf): Long = {
  // Ask the tap first: it is a single call, and taps that override getSize
  // can report their own total without us walking the file system.
  val reported = f.getSize(conf)
  if (reported > 0) {
    reported
  } else {
    // The tap gave no usable size (zero or negative), so expand the path,
    // which may contain a glob pattern, and sum the content-summary length
    // of every match ourselves.
    val path = f.getPath
    val fs = path.getFileSystem(conf)
    // globStatus returns null (not an empty array) when a non-glob path does
    // not exist; treat that as zero bytes instead of failing with an NPE.
    Option(fs.globStatus(path)).fold(0L) { matches =>
      matches.iterator.map(s => fs.getContentSummary(s.getPath).getLength).sum
    }
  }
}

def inputSizes(step: FlowStep[JobConf]): Seq[(String, Long)] = {
Expand Down

0 comments on commit 5e41f89

Please sign in to comment.