From 9ac30b18333bea9e5a7a2f8a0b146f78cfe5e6b6 Mon Sep 17 00:00:00 2001 From: Vi-dot Date: Thu, 5 Apr 2018 15:06:44 +0200 Subject: [PATCH] remove table title from tabula extraction --- .../src/main/java/org/grobid/core/data/Table.java | 14 +++++++++++++- .../java/org/grobid/core/engines/TableParser.java | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 2d5ee4910c..2a31228c8e 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -8,6 +8,7 @@ import org.grobid.core.engines.Engine; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.layout.BoundingBox; +import org.grobid.core.layout.GraphicObject; import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.BoundingBoxCalculator; import org.grobid.core.utilities.LayoutTokensUtil; @@ -42,6 +43,16 @@ public Table() { content = new StringBuilder(); label = new StringBuilder(); } + + private double headerHeight = 0; + + public void setHeaderHeight(double height) { + headerHeight = height; + } + + public double getHeaderHeight() { + return headerHeight; + } @Override public String toTEI(GrobidAnalysisConfig config) { @@ -134,10 +145,11 @@ private String cleanString(String input) { private String[][] tabulaRes = null; public void tabulaExtract(File pdfFile) throws IOException { + PDDocument document = PDDocument.load(pdfFile); ObjectExtractor objectExtractor = new ObjectExtractor(document); technology.tabula.Page page = objectExtractor.extract(getPage()); - technology.tabula.Page pageArea = page.getArea((float)getY(), (float)getX(), (float)(getY()+getHeight()), (float)(getX()+getWidth())); + technology.tabula.Page pageArea = page.getArea((float)(getY()+getHeaderHeight()), (float)getX(), (float)(getY()+getHeight()), (float)(getX()+getWidth())); BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); technology.tabula.Table table = bea.extract(pageArea).get(0); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java index d33e48031c..6cb0ef27d6 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java @@ -90,6 +90,8 @@ private Table getExtractionResult(List tokenizations, String result } else if (clusterLabel.equals(TBL_CONTENT)) { table.appendContent(clusterContent); table.getContentTokens().addAll(tokens); + BoundingBox contentBBox = BoundingBoxCalculator.calculateOneBox(tokens, true); + table.setHeaderHeight(contentBBox.getY()-bbox.getY()); } else { LOGGER.error("Warning: unexpected table model label - " + clusterLabel + " for " + clusterContent); }