Skip to content

Commit

Permalink
remove table title from tabula extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
Vi-dot committed Apr 5, 2018
1 parent 69af68a commit 9ac30b1
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
14 changes: 13 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.grobid.core.engines.Engine;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.GraphicObject;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.BoundingBoxCalculator;
import org.grobid.core.utilities.LayoutTokensUtil;
Expand Down Expand Up @@ -42,6 +43,16 @@ public Table() {
content = new StringBuilder();
label = new StringBuilder();
}

/**
 * Height of the header region at the top of this table; defaults to 0.
 * tabulaExtract adds it to the table's top Y coordinate when selecting the
 * page area to extract.
 * NOTE(review): presumably expressed in the same units as getY()/getHeight() - confirm.
 */
private double headerHeight = 0;

/**
 * Records the height of the header region at the top of this table.
 * The stored value is later added to the table's top Y coordinate by
 * tabulaExtract, which moves the top edge of the extracted area down.
 *
 * @param height header height to store; no validation is performed here.
 *               NOTE(review): presumably in the same units as getY()/getHeight() - confirm.
 */
public void setHeaderHeight(double height) {
    this.headerHeight = height;
}

/**
 * Returns the header height currently stored for this table.
 *
 * @return the value last passed to setHeaderHeight, or 0 if it was never set
 */
public double getHeaderHeight() {
    return this.headerHeight;
}

@Override
public String toTEI(GrobidAnalysisConfig config) {
Expand Down Expand Up @@ -134,10 +145,11 @@ private String cleanString(String input) {
private String[][] tabulaRes = null;

public void tabulaExtract(File pdfFile) throws IOException {

PDDocument document = PDDocument.load(pdfFile);
ObjectExtractor objectExtractor = new ObjectExtractor(document);
technology.tabula.Page page = objectExtractor.extract(getPage());
technology.tabula.Page pageArea = page.getArea((float)getY(), (float)getX(), (float)(getY()+getHeight()), (float)(getX()+getWidth()));
technology.tabula.Page pageArea = page.getArea((float)(getY()+getHeaderHeight()), (float)getX(), (float)(getY()+getHeight()), (float)(getX()+getWidth()));

BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();
technology.tabula.Table table = bea.extract(pageArea).get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ private Table getExtractionResult(List<LayoutToken> tokenizations, String result
} else if (clusterLabel.equals(TBL_CONTENT)) {
table.appendContent(clusterContent);
table.getContentTokens().addAll(tokens);
BoundingBox contentBBox = BoundingBoxCalculator.calculateOneBox(tokens, true);
table.setHeaderHeight(contentBBox.getY()-bbox.getY());
} else {
LOGGER.error("Warning: unexpected table model label - " + clusterLabel + " for " + clusterContent);
}
Expand Down

0 comments on commit 9ac30b1

Please sign in to comment.