Skip to content

Commit

Permalink
BXC-4768 ignore pdf children (#114)
Browse files Browse the repository at this point in the history
* ignore children of doc-pdf cpd objects, assign doc-pdf type info

* add test resources

* remove pdf children
  • Loading branch information
krwong authored Nov 8, 2024
1 parent 47892c4 commit 11bb596
Show file tree
Hide file tree
Showing 5 changed files with 626 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.slf4j.Logger;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
Expand All @@ -32,6 +37,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
Expand All @@ -54,6 +60,7 @@ public class CdmIndexService {
// Values for the entry type field of an indexed record.
// NOTE(review): where grouped_work, cpd_object and cpd_child are assigned is not visible here — confirm.
public static final String ENTRY_TYPE_GROUPED_WORK = "grouped_work";
public static final String ENTRY_TYPE_COMPOUND_OBJECT = "cpd_object";
public static final String ENTRY_TYPE_COMPOUND_CHILD = "cpd_child";
// Entry type written to the record of a Document-PDF compound object (see ASSIGN_PARENT_PDF_TEMPLATE)
public static final String ENTRY_TYPE_DOCUMENT_PDF = "doc_pdf";
// Names of the migration-specific columns: parent id, entry type and child order.
// Presumably these are added to the index alongside the exported CDM fields — confirm against makeInsertTemplate.
public static final List<String> MIGRATION_FIELDS = Arrays.asList(
PARENT_ID_FIELD, ENTRY_TYPE_FIELD, CHILD_ORDER_FIELD);
// Matches any control character except carriage return, newline and tab
private static final Pattern CONTROL_PATTERN = Pattern.compile("[\\p{Cntrl}&&[^\r\n\t]]");
Expand Down Expand Up @@ -87,6 +94,7 @@ public void indexAll() throws IOException {
recordInsertSqlTemplate = makeInsertTemplate(allFields);

var cpdToIdMap = new HashMap<String, String>();
var pdfIds = new HashSet<String>();

var descAllPath = CdmFileRetrievalService.getDescAllPath(project);
try (
Expand All @@ -102,7 +110,7 @@ public void indexAll() throws IOException {
// reached the end of a record
if (line.contains(CLOSE_CDM_ID_TAG)) {
Document doc = buildDocument(recordBuilder.toString());
// Store details about where info about compound children can be found
// Store details about where info about compound children and pdf objects can be found
recordIfCompoundObject(doc, cpdToIdMap);
indexDocument(doc, conn, fieldInfo);
// reset the record builder for the next record
Expand All @@ -114,8 +122,9 @@ public void indexAll() throws IOException {
throw new MigrationException("Failed to parse desc.all file, incomplete record with body:\n" +
recordBuilder);
}
// Assign type information to objects, based on compound object status
assignObjectTypeDetails(conn, cpdToIdMap);
// Assign type information to objects, based on compound/pdf object status
assignObjectTypeDetails(conn, cpdToIdMap, pdfIds);
assignPdfObjectTypeDetails(conn, pdfIds);
} catch (IOException e) {
throw new MigrationException("Failed to read export files", e);
} catch (SQLException e) {
Expand Down Expand Up @@ -223,13 +232,18 @@ private void recordIfCompoundObject(Document doc, Map<String, String> cpdToIdMap
+ PARENT_ID_FIELD + " = ?,"
+ CHILD_ORDER_FIELD + " = ?"
+ " where " + CdmFieldInfo.CDM_ID + " = ?";
/**
* Parameterized statement which deletes the index row whose CDM id equals the single bound parameter.
* Used to remove the page-level child records of Document-PDF compound objects.
*/
public static final String DELETE_PDF_CHILDREN_TEMPLATE =
"delete from " + TB_NAME + " where " + CdmFieldInfo.CDM_ID + " = ?";
/**
* Parameterized statement which sets the entry type of the row with the bound CDM id
* to {@link #ENTRY_TYPE_DOCUMENT_PDF}.
*/
public static final String ASSIGN_PARENT_PDF_TEMPLATE =
"update " + TB_NAME + " set " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF
+ "' where " + CdmFieldInfo.CDM_ID + " = ?";

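/**
* Add additional information to records to indicate if they are compound objects or children of one.
* Children of Document-PDF compound objects are not assigned to their parent; they are deleted from
* the index, and the id of the Document-PDF object is added to pdfIds.
* @param dbConn connection to the index database
* @param cpdToIdMap mapping from CPD filename to the id of the compound object it describes
* @param pdfIds set which receives the ids of Document-PDF compound objects encountered
*/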
private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdToIdMap) {
private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdToIdMap, HashSet<String> pdfIds) {
SAXBuilder builder = SecureXMLFactory.createSAXBuilder();
var cpdsPath = CdmFileRetrievalService.getExportedCpdsPath(project);
cpdToIdMap.forEach((cpdFilename, cpdId) -> {
Expand All @@ -246,17 +260,29 @@ private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdT
if (Objects.equals(cpdRoot.getChildTextTrim("type"), "Monograph")) {
childRoot = cpdRoot.getChild("node");
}
// Assign each child object to its parent compound
int orderId = 0;
for (var pageEl : childRoot.getChildren("page")) {
var childId = pageEl.getChildTextTrim("pageptr");
try (var childStmt = dbConn.prepareStatement(ASSIGN_CHILD_INFO_TEMPLATE)) {
childStmt.setString(1, cpdId);
childStmt.setInt(2, orderId);
childStmt.setString(3, childId);
childStmt.executeUpdate();
// Delete children of document-pdf objects
if (Objects.equals(cpdRoot.getChildTextTrim("type"), "Document-PDF")) {
pdfIds.add(cpdId);
for (var pageEl : childRoot.getChildren("page")) {
var childId = pageEl.getChildTextTrim("pageptr");
try (var deleteStmt = dbConn.prepareStatement(DELETE_PDF_CHILDREN_TEMPLATE)) {
deleteStmt.setString(1, childId);
deleteStmt.executeUpdate();
}
}
} else {
// Assign each child object to its parent compound
int orderId = 0;
for (var pageEl : childRoot.getChildren("page")) {
var childId = pageEl.getChildTextTrim("pageptr");
try (var childStmt = dbConn.prepareStatement(ASSIGN_CHILD_INFO_TEMPLATE)) {
childStmt.setString(1, cpdId);
childStmt.setInt(2, orderId);
childStmt.setString(3, childId);
childStmt.executeUpdate();
}
orderId++;
}
orderId++;
}

} catch (FileNotFoundException e) {
Expand All @@ -271,6 +297,23 @@ private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdT
});
}

/**
 * Mark records as document-pdf objects by setting their entry type to
 * {@link #ENTRY_TYPE_DOCUMENT_PDF}.
 *
 * @param dbConn open connection to the index database
 * @param pdfIds CDM ids of the records to mark as document-pdf objects
 * @throws MigrationException if the update statement cannot be prepared or executed
 */
private void assignPdfObjectTypeDetails(Connection dbConn, HashSet<String> pdfIds) {
    // Nothing to update, so do not bother preparing a statement
    if (pdfIds.isEmpty()) {
        return;
    }
    // Prepare the statement once and rebind the id for each object, rather than
    // re-preparing the same SQL on every iteration
    try (var parentTypeStmt = dbConn.prepareStatement(ASSIGN_PARENT_PDF_TEMPLATE)) {
        for (var pdfId : pdfIds) {
            try {
                // Assign document-pdf object type to parent object
                parentTypeStmt.setString(1, pdfId);
                parentTypeStmt.executeUpdate();
            } catch (SQLException e) {
                // Keep the failing id in the message so the bad record can be identified
                throw new MigrationException("Failed to update type information for " + pdfId, e);
            }
        }
    } catch (SQLException e) {
        throw new MigrationException("Failed to update type information for document-pdf objects", e);
    }
}

private List<String> listFieldValues(Element objEl, List<String> exportFields) {
return exportFields.stream()
.map(exportField -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,47 @@ public void indexExportWithMonographCompoundObjectsTest() throws Exception {
}
}

/**
 * Indexes an export containing a Document-PDF compound object and verifies that only the
 * parent record remains in the index, typed as a document-pdf entry with no parent or order.
 */
@Test
public void indexExportWithPdfCompoundObjectsTest() throws Exception {
    Files.copy(Paths.get("src/test/resources/descriptions/pdf/index/description/desc.all"),
            CdmFileRetrievalService.getDescAllPath(project));
    Files.createDirectories(CdmFileRetrievalService.getExportedCpdsPath(project));
    Files.copy(Paths.get("src/test/resources/descriptions/pdf/image/17941.cpd"),
            CdmFileRetrievalService.getExportedCpdsPath(project).resolve("17941.cpd"));
    Files.copy(Paths.get("src/test/resources/pdf_fields.csv"), project.getFieldsPath());
    setExportedDate();
    CdmIndexOptions options = new CdmIndexOptions();
    options.setForce(false);

    service.createDatabase(options);
    service.indexAll();

    assertDateIndexedPresent();
    // The page-level children listed in the CPD file must have been removed
    assertRowCount(1);

    CdmFieldInfo fieldInfo = fieldService.loadFieldsFromProject(project);
    List<String> allFields = fieldInfo.listAllExportFields();
    allFields.addAll(CdmIndexService.MIGRATION_FIELDS);
    var joinedFields = "\"" + String.join("\",\"", allFields) + "\"";

    Connection conn = service.openDbConnection();
    // Statement and result set are closed before the connection is released in finally
    try (Statement stmt = conn.createStatement();
            ResultSet rs = stmt.executeQuery("select " + joinedFields
                    + " from " + CdmIndexService.TB_NAME + " order by " + CdmFieldInfo.CDM_ID + " asc")) {
        // Fail with a clear message instead of an SQLException when no row is returned
        if (!rs.next()) {
            throw new AssertionError("Expected one indexed record but the result set was empty");
        }
        assertEquals(17940, rs.getInt(CdmFieldInfo.CDM_ID));
        assertEquals("2014-04-29", rs.getString(CdmFieldInfo.CDM_CREATED));
        assertEquals("2014-04-29", rs.getString(CdmFieldInfo.CDM_MODIFIED));
        assertEquals("Folder 5: Forum Meetings, January 1992-December 1996: PDF", rs.getString("title"));
        assertEquals(CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF, rs.getString(CdmIndexService.ENTRY_TYPE_FIELD));
        assertNull(rs.getString(CdmIndexService.PARENT_ID_FIELD));
        assertNull(rs.getString(CdmIndexService.CHILD_ORDER_FIELD));
    } finally {
        CdmIndexService.closeDbConnection(conn);
    }
}

@Test
public void indexExportReservedWordFieldTest() throws Exception {
Files.copy(Paths.get("src/test/resources/descriptions/03883/index/description/desc.all"),
Expand Down
19 changes: 19 additions & 0 deletions src/test/resources/descriptions/pdf/image/17941.cpd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="utf-8"?>
<cpd>
<type>Document-PDF</type>
<page>
<pagetitle>Page 1</pagetitle>
<pagefile>17927.pdfpage</pagefile>
<pageptr>17926</pageptr>
</page>
<page>
<pagetitle>Page 2</pagetitle>
<pagefile>17928.pdfpage</pagefile>
<pageptr>17927</pageptr>
</page>
<page>
<pagetitle>Page 3</pagetitle>
<pagefile>17929.pdfpage</pagefile>
<pageptr>17928</pageptr>
</page>
</cpd>
Loading

0 comments on commit 11bb596

Please sign in to comment.