From 69af68af1d805c3b9d598ea54e82338f2cde8a72 Mon Sep 17 00:00:00 2001 From: Vi-dot Date: Wed, 4 Apr 2018 17:57:21 +0200 Subject: [PATCH] tabula to TEI with BasicExtractionAlgorithm --- .../main/java/org/grobid/core/data/Table.java | 63 ++++++++++++++++++- .../grobid/core/engines/FullTextParser.java | 8 ++- .../org/grobid/core/engines/TableParser.java | 27 ++++++-- 3 files changed, 89 insertions(+), 9 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 3232079e66..2d5ee4910c 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -3,6 +3,7 @@ import nu.xom.Attribute; import nu.xom.Element; import org.apache.commons.lang3.StringUtils; +import org.apache.pdfbox.pdmodel.PDDocument; import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.Engine; import org.grobid.core.engines.config.GrobidAnalysisConfig; @@ -13,9 +14,14 @@ import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.engines.counters.TableRejectionCounters; +import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import technology.tabula.*; +import technology.tabula.extractors.BasicExtractionAlgorithm; + /** * Class for representing a table. * @@ -68,8 +74,13 @@ public String toTEI(GrobidAnalysisConfig config) { XmlBuilderUtils.addCoords(descEl, LayoutTokensUtil.getCoordsString(getFullDescriptionTokens())); } - Element contentEl = XmlBuilderUtils.teiElement("table"); - contentEl.appendChild(LayoutTokensUtil.toText(getContentTokens())); + Element contentEl; + if (tabulaRes != null) + contentEl = tabulaResToTEI(); + else { + contentEl = XmlBuilderUtils.teiElement("table"); + contentEl.appendChild(LayoutTokensUtil.toText(getContentTokens())); + } if ((config.getGenerateTeiCoordinates() != null) && (config.getGenerateTeiCoordinates().contains("figure"))) { XmlBuilderUtils.addCoords(contentEl, LayoutTokensUtil.getCoordsStringForOneBox(getContentTokens())); } @@ -120,6 +131,54 @@ private String cleanString(String input) { return input.replace("\n", " ").replace(" ", " ").trim(); } + private String[][] tabulaRes = null; + + public void tabulaExtract(File pdfFile) throws IOException { + PDDocument document = PDDocument.load(pdfFile); + ObjectExtractor objectExtractor = new ObjectExtractor(document); + technology.tabula.Page page = objectExtractor.extract(getPage()); + technology.tabula.Page pageArea = page.getArea((float)getY(), (float)getX(), (float)(getY()+getHeight()), (float)(getX()+getWidth())); + + BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(); + technology.tabula.Table table = bea.extract(pageArea).get(0); + + + List> tableRows = table.getRows(); + int maxColCount = 0; + for (int i = 0; i < tableRows.size(); i++) { + List row = tableRows.get(i); + if (maxColCount < row.size()) { + maxColCount = row.size(); + } + } + + tabulaRes = new String[tableRows.size()][maxColCount]; + for (int i=0; i row = tableRows.get(i); + for (int j=0; j") || openFigure) { */ private List processTables(String rese, List tokenizations, - Document doc) { + Document doc, + File pdfFile) { List
results = new ArrayList<>(); TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, rese, tokenizations, true); @@ -1785,7 +1786,8 @@ private List
processTables(String rese, List tokenizationTable = cluster.concatTokens(); Table result = parsers.getTableParser().processing( tokenizationTable, - cluster.getFeatureBlock() + cluster.getFeatureBlock(), + pdfFile ); SortedSet blockPtrs = new TreeSet<>(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java index 80dd1d69f0..d33e48031c 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java @@ -5,6 +5,7 @@ import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.tagging.GenericTaggerUtils; import org.grobid.core.exceptions.GrobidException; +import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.LayoutToken; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; @@ -15,6 +16,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.io.IOException; import java.util.Collections; import java.util.List; @@ -33,7 +36,7 @@ protected TableParser() { /** * The processing here is called from the full text parser in cascade. */ - public Table processing(List tokenizationTable, String featureVector) { + public Table processing(List tokenizationTable, String featureVector, File pdfFile) { String res; try { res = label(featureVector); @@ -44,14 +47,20 @@ public Table processing(List tokenizationTable, String featureVecto return null; } // List> labeled = GenericTaggerUtils.getTokensAndLabels(res); - return getExtractionResult(tokenizationTable, res); + return getExtractionResult(tokenizationTable, res, pdfFile); } - private Table getExtractionResult(List tokenizations, String result) { + private Table getExtractionResult(List tokenizations, String result, File pdfFile) { // System.out.println("-----------------"); // System.out.println(result); Table table = new Table(); - table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(tokenizations, true))); + BoundingBox bbox = BoundingBoxCalculator.calculateOneBox(tokenizations, true); + table.setTextArea(Collections.singletonList(bbox)); + table.setPage(bbox.getPage()); + table.setX(bbox.getX()); + table.setY(bbox.getY()); + table.setWidth(bbox.getWidth()); + table.setHeight(bbox.getHeight()); TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.TABLE, result, tokenizations); List clusters = clusteror.cluster(); @@ -86,6 +95,16 @@ private Table getExtractionResult(List tokenizations, String result } } + + if (pdfFile != null) { + try { + table.tabulaExtract(pdfFile); + } catch (IOException e) { + LOGGER.error("Warning: can't extract with tabula - " + e.getMessage()); + } + + } + return table; }