Skip to content

Commit

Permalink
tabula to TEI with BasicExtractionAlgorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
Vi-dot committed Apr 4, 2018
1 parent d9d1a93 commit 69af68a
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 9 deletions.
63 changes: 61 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import nu.xom.Attribute;
import nu.xom.Element;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.engines.Engine;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
Expand All @@ -13,9 +14,14 @@
import org.grobid.core.utilities.counters.CntManager;
import org.grobid.core.engines.counters.TableRejectionCounters;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import technology.tabula.*;
import technology.tabula.extractors.BasicExtractionAlgorithm;

/**
* Class for representing a table.
*
Expand Down Expand Up @@ -68,8 +74,13 @@ public String toTEI(GrobidAnalysisConfig config) {
XmlBuilderUtils.addCoords(descEl, LayoutTokensUtil.getCoordsString(getFullDescriptionTokens()));
}

Element contentEl = XmlBuilderUtils.teiElement("table");
contentEl.appendChild(LayoutTokensUtil.toText(getContentTokens()));
Element contentEl;
if (tabulaRes != null)
contentEl = tabulaResToTEI();
else {
contentEl = XmlBuilderUtils.teiElement("table");
contentEl.appendChild(LayoutTokensUtil.toText(getContentTokens()));
}
if ((config.getGenerateTeiCoordinates() != null) && (config.getGenerateTeiCoordinates().contains("figure"))) {
XmlBuilderUtils.addCoords(contentEl, LayoutTokensUtil.getCoordsStringForOneBox(getContentTokens()));
}
Expand Down Expand Up @@ -120,6 +131,54 @@ private String cleanString(String input) {
return input.replace("\n", " ").replace(" ", " ").trim();
}

private String[][] tabulaRes = null;

/**
 * Run tabula's basic table-detection over this table's bounding box in the
 * source PDF and cache the cell text in {@code tabulaRes} for use by
 * {@code tabulaResToTEI()}. If tabula finds no table in the region,
 * {@code tabulaRes} is left {@code null} so the TEI serialization falls back
 * to the raw token text.
 *
 * @param pdfFile the original PDF document this table was extracted from
 * @throws IOException if the PDF cannot be loaded or parsed
 */
public void tabulaExtract(File pdfFile) throws IOException {
    // try-with-resources: the previous version leaked the PDDocument on every
    // call (and on any exception thrown during extraction)
    try (PDDocument document = PDDocument.load(pdfFile)) {
        ObjectExtractor objectExtractor = new ObjectExtractor(document);
        technology.tabula.Page page = objectExtractor.extract(getPage());
        // getArea takes (top, left, bottom, right)
        technology.tabula.Page pageArea = page.getArea((float) getY(), (float) getX(),
                (float) (getY() + getHeight()), (float) (getX() + getWidth()));

        BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();
        List<technology.tabula.Table> candidates = bea.extract(pageArea);
        if (candidates.isEmpty()) {
            // guard: the old code called get(0) unconditionally and threw
            // IndexOutOfBoundsException when no table was detected
            return;
        }
        technology.tabula.Table table = candidates.get(0);

        // first pass: find the widest row so the result matrix is rectangular
        List<List<RectangularTextContainer>> tableRows = table.getRows();
        int maxColCount = 0;
        for (int i = 0; i < tableRows.size(); i++) {
            List<RectangularTextContainer> row = tableRows.get(i);
            if (maxColCount < row.size()) {
                maxColCount = row.size();
            }
        }

        // second pass: copy cell text; short rows leave trailing nulls
        tabulaRes = new String[tableRows.size()][maxColCount];
        for (int i = 0; i < tableRows.size(); i++) {
            List<RectangularTextContainer> row = tableRows.get(i);
            for (int j = 0; j < row.size(); j++) {
                tabulaRes[i][j] = table.getCell(i, j).getText();
            }
        }
    }
}

/**
 * Serialize the cached tabula extraction result as a TEI {@code <table>}
 * element: one {@code <tr>} per row, one {@code <td>} per cell. When no
 * tabula result is available ({@code tabulaRes == null}) an empty
 * {@code <table>} element is returned.
 *
 * @return a TEI table element, possibly empty
 */
public Element tabulaResToTEI() {
    Element tableElement = XmlBuilderUtils.teiElement("table");

    if (tabulaRes == null) {
        return tableElement;
    }

    for (String[] row : tabulaRes) {
        Element rowElement = XmlBuilderUtils.teiElement("tr");
        for (String cellText : row) {
            rowElement.appendChild(XmlBuilderUtils.teiElement("td", cellText));
        }
        tableElement.appendChild(rowElement);
    }

    return tableElement;
}

// if an extracted table passes some validations rules

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ public Document processing(DocumentSource documentSource,
//LOGGER.info(rese);
// we apply now the figure and table models based on the fulltext labeled output
figures = processFigures(rese, layoutTokenization.getTokenization(), doc);
tables = processTables(rese, layoutTokenization.getTokenization(), doc);
tables = processTables(rese, layoutTokenization.getTokenization(), doc, documentSource.getPdfFile());
equations = processEquations(rese, layoutTokenization.getTokenization(), doc);
} else {
LOGGER.debug("Fulltext model: The featured body is empty");
Expand Down Expand Up @@ -1776,7 +1776,8 @@ else if (label.equals("I-<figure>") || openFigure) {
*/
private List<Table> processTables(String rese,
List<LayoutToken> tokenizations,
Document doc) {
Document doc,
File pdfFile) {
List<Table> results = new ArrayList<>();
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, rese, tokenizations, true);

Expand All @@ -1785,7 +1786,8 @@ private List<Table> processTables(String rese,
List<LayoutToken> tokenizationTable = cluster.concatTokens();
Table result = parsers.getTableParser().processing(
tokenizationTable,
cluster.getFeatureBlock()
cluster.getFeatureBlock(),
pdfFile
);

SortedSet<Integer> blockPtrs = new TreeSet<>();
Expand Down
27 changes: 23 additions & 4 deletions grobid-core/src/main/java/org/grobid/core/engines/TableParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
Expand All @@ -15,6 +16,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;

Expand All @@ -33,7 +36,7 @@ protected TableParser() {
/**
* The processing here is called from the full text parser in cascade.
*/
public Table processing(List<LayoutToken> tokenizationTable, String featureVector) {
public Table processing(List<LayoutToken> tokenizationTable, String featureVector, File pdfFile) {
String res;
try {
res = label(featureVector);
Expand All @@ -44,14 +47,20 @@ public Table processing(List<LayoutToken> tokenizationTable, String featureVecto
return null;
}
// List<Pair<String, String>> labeled = GenericTaggerUtils.getTokensAndLabels(res);
return getExtractionResult(tokenizationTable, res);
return getExtractionResult(tokenizationTable, res, pdfFile);
}

private Table getExtractionResult(List<LayoutToken> tokenizations, String result) {
private Table getExtractionResult(List<LayoutToken> tokenizations, String result, File pdfFile) {
// System.out.println("-----------------");
// System.out.println(result);
Table table = new Table();
table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(tokenizations, true)));
BoundingBox bbox = BoundingBoxCalculator.calculateOneBox(tokenizations, true);
table.setTextArea(Collections.singletonList(bbox));
table.setPage(bbox.getPage());
table.setX(bbox.getX());
table.setY(bbox.getY());
table.setWidth(bbox.getWidth());
table.setHeight(bbox.getHeight());

TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.TABLE, result, tokenizations);
List<TaggingTokenCluster> clusters = clusteror.cluster();
Expand Down Expand Up @@ -86,6 +95,16 @@ private Table getExtractionResult(List<LayoutToken> tokenizations, String result
}

}

if (pdfFile != null) {
try {
table.tabulaExtract(pdfFile);
} catch (IOException e) {
LOGGER.error("Warning: can't extract with tabula - " + e.getMessage());
}

}

return table;
}

Expand Down

0 comments on commit 69af68a

Please sign in to comment.