Skip to content

Commit

Permalink
BXC-4769 export pdfs from cdm (#115)
Browse files Browse the repository at this point in the history
* add downloading_pdf to export progress states

* add downloadPdfFiles method to download pdf files

* export unmapped pdf files from different location

* add/modify tests and test resources

* remove unused code

* use object.equals(), modify retrieveSourceFileNameAndEntryTypeField to return both values, add doc_pdf entry type to countIndexedFileObjects

* fix test

* fix pdfsPath

* add doc_pdf entry type to queries

* add cdmid to pdf filename to prevent overwriting index.pdf files
  • Loading branch information
krwong authored Nov 21, 2024
1 parent 11bb596 commit ca98dfe
Show file tree
Hide file tree
Showing 20 changed files with 182 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ public List<String> generateCollectionNumbersList(DestinationMappingOptions opti
+ " from " + CdmIndexService.TB_NAME
+ " where " + " ("+ CdmIndexService.ENTRY_TYPE_FIELD + " != '"
+ CdmIndexService.ENTRY_TYPE_COMPOUND_CHILD + "'" +
" OR " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF + "'" +
" OR " + CdmIndexService.ENTRY_TYPE_FIELD + " is null)");
while (rs.next()) {
if (!rs.getString(1).isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import static edu.unc.lib.boxc.migration.cdm.util.CLIConstants.outputLogger;
Expand Down Expand Up @@ -59,6 +61,7 @@ public String exportUnmapped() throws IOException {
// Have to make reference to connection final so it can be used inside the download block
final var dbConn = conn;
var imageDir = fileRetrievalService.getSshCollectionPath().resolve(CdmFileRetrievalService.IMAGE_SUBPATH);
var pdfDir = fileRetrievalService.getSshCollectionPath().resolve(CdmFileRetrievalService.PDF_SUBPATH);

fileRetrievalService.executeDownloadBlock((scpClient -> {
try {
Expand All @@ -75,9 +78,21 @@ public String exportUnmapped() throws IOException {

currentUnmapped++;
// Figure out name of associated file and download it
var filename = retrieveSourceFileName(dbConn, origMapping);
var filePath = imageDir.resolve(filename).toString();
var fileInfo = retrieveSourceFileNameAndEntryTypeField(dbConn, origMapping);
var entryTypeField = fileInfo.get(1);
String filename;
String filePath;
// Pdf and image cpd objects are located in different places
if (CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF.equals(entryTypeField)) {
// add cdmid to filename to prevent overwriting
filename = origMapping.getCdmId() + "_index.pdf";
filePath = pdfDir.resolve(origMapping.getCdmId() + "/index.pdf").toString();
} else {
filename = fileInfo.get(0);
filePath = imageDir.resolve(filename).toString();
}
var destPath = exportSourceFilesPath.resolve(filename);

try {
scpClient.download(filePath, destPath);
} catch (IOException e) {
Expand Down Expand Up @@ -126,14 +141,18 @@ private int calculateTotalUnmapped(Path originalPath) throws IOException {
}

private static final String FILENAME_QUERY =
"select find from " + CdmIndexService.TB_NAME + " where " + CdmFieldInfo.CDM_ID + " = ?";
"select find, " + CdmIndexService.ENTRY_TYPE_FIELD + " from "
+ CdmIndexService.TB_NAME + " where " + CdmFieldInfo.CDM_ID + " = ?";

private String retrieveSourceFileName(Connection conn, SourceFileMapping mapping) throws SQLException {
private List<String> retrieveSourceFileNameAndEntryTypeField(Connection conn, SourceFileMapping mapping)
throws SQLException {
try (var filenameStmt = conn.prepareStatement(FILENAME_QUERY)) {
filenameStmt.setString(1, mapping.getCdmId());
var resultSet = filenameStmt.executeQuery();
if (resultSet.next()) {
return resultSet.getString(1);
String sourceFilename = resultSet.getString(1);
String entryTypeField = resultSet.getString(2);
return Arrays.asList(sourceFilename, entryTypeField);
} else {
throw new MigrationException("No record found in index for mapped id " + mapping.getCdmId());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ public void exportAll(CdmExportOptions options) throws IOException {
fileRetrievalService.downloadCpdFiles();
project.getProjectProperties().setExportedDate(Instant.now());
ProjectPropertiesSerialization.write(project);
exportStateService.transitionToDownloadingPdf();
}

if (exportStateService.inStateOrNotResuming(ProgressState.DOWNLOADING_PDF)) {
fileRetrievalService.downloadPdfFiles();
project.getProjectProperties().setExportedDate(Instant.now());
ProjectPropertiesSerialization.write(project);
}

exportStateService.exportingCompleted();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public class CdmFileRetrievalService {
private static final String CPD_SUBPATH = IMAGE_SUBPATH + "/*.cpd";
public static final String CPD_EXPORT_PATH = "cpds";
public static final String EXPORTED_SOURCE_FILES_DIR = "source_files";
public static final String PDF_SUBPATH = "supp";
public static final String PDF_EXPORT_SUBPATH = PDF_SUBPATH + "/*/index.pdf";

private String sshUsername;
private String sshPassword;
Expand Down Expand Up @@ -80,6 +82,27 @@ public static Path getExportedCpdsPath(MigrationProject project) {
return project.getExportPath().resolve(CPD_EXPORT_PATH);
}

/**
 * Downloads the remote files matching {@code supp/*&#47;index.pdf} beneath the SSH collection
 * path into the local directory returned by {@link #getExportedCpdsPath(MigrationProject)}.
 * The local directory is created first if it is missing.
 *
 * NOTE(review): the destination is the CPD export directory and every remote file is named
 * index.pdf -- confirm that sharing this directory (and any resulting name collisions) is intended.
 *
 * @throws MigrationException if the local directory cannot be created or the download fails
 */
public void downloadPdfFiles() {
    final var localExportDir = getExportedCpdsPath(project);
    // The destination has to exist before any transfer is attempted
    try {
        Files.createDirectories(localExportDir);
    } catch (IOException e) {
        throw new MigrationException("Failed to create PDF export directory", e);
    }
    executeDownloadBlock(scpClient -> {
        // Remote source: <ssh collection path>/supp/*/index.pdf
        final var remotePdfPattern = getSshCollectionPath().resolve(PDF_EXPORT_SUBPATH).toString();
        try {
            scpClient.download(remotePdfPattern, localExportDir);
        } catch (IOException e) {
            throw new MigrationException("Failed to download PDF files", e);
        }
    });
}

/**
* Perform the provided download operations with a ScpClient
* @param downloadBlock method containing download operations
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package edu.unc.lib.boxc.migration.cdm.services;

import static edu.unc.lib.boxc.common.xml.SecureXMLFactory.createXMLInputFactory;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_FIELD;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_GROUPED_WORK;
import static edu.unc.lib.boxc.model.api.xml.JDOMNamespaceUtil.MODS_V3_NS;

import java.io.ByteArrayInputStream;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ private void generateMultipleGroupMapping(GroupMappingOptions options, Statement
ResultSet groupRs = stmt.executeQuery("select " + multipleGroups
+ " from " + CdmIndexService.TB_NAME
+ " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF + "'"
+ " group by " + multipleGroups
+ " having count(*) > 1");
while (groupRs.next()) {
Expand All @@ -128,6 +129,7 @@ private void generateMultipleGroupMapping(GroupMappingOptions options, Statement
ResultSet rs = stmt.executeQuery("select " + CdmFieldInfo.CDM_ID + ", " + multipleGroups
+ " from " + CdmIndexService.TB_NAME
+ " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF + "'"
+ " order by " + CdmFieldInfo.CDM_ID + " ASC");
while (rs.next()) {
String cdmId = rs.getString(1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_COMPOUND_CHILD;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_COMPOUND_OBJECT;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_FIELD;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.PARENT_ID_FIELD;
import static org.slf4j.LoggerFactory.getLogger;
Expand Down Expand Up @@ -64,6 +65,7 @@ private String buildRemainderQuery(IndexFilteringOptions options) {
return "select count(*) from ( " +
"select dmrecord from " + CdmIndexService.TB_NAME + " where " +
" (" + ENTRY_TYPE_FIELD + " != '" + ENTRY_TYPE_COMPOUND_CHILD + "'" +
" OR " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF + "'" +
" OR " + ENTRY_TYPE_FIELD + " is null)" +
" AND " + queryFilters +
" UNION " +
Expand Down Expand Up @@ -142,6 +144,7 @@ private String buildDeleteObjectsQuery(IndexFilteringOptions options) {
var queryFilters = buildQueryFilters(true, options);
return "delete from " + CdmIndexService.TB_NAME + " where " +
" (" + ENTRY_TYPE_FIELD + " != '" + ENTRY_TYPE_COMPOUND_CHILD + "'" +
" OR " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF + "'" +
" OR " + ENTRY_TYPE_FIELD + " is null)" +
" AND " + queryFilters;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public class PermissionsService {
+ " from " + CdmIndexService.TB_NAME
+ " where " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_GROUPED_WORK + "'"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_COMPOUND_OBJECT + "'"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF + "'"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
+ " and " + CdmIndexService.PARENT_ID_FIELD + " is null";

Expand Down Expand Up @@ -236,10 +237,11 @@ private List<Map.Entry<String, String>> queryForMappedIds(PermissionMappingOptio
// files
if (options.isWithFiles()) {
// for every file in the project (compound children and grouped children)
// If the entry type is null, the object is a individual cdm object
// If the entry type is doc_pdf or null, the object is an individual cdm object
String fileQuery = "select " + CdmFieldInfo.CDM_ID +
" from " + CdmIndexService.TB_NAME
+ " where " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_COMPOUND_CHILD + "'"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF + "'"
+ " or " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
+ " and " + CdmIndexService.PARENT_ID_FIELD + " is not null";
mappedIds = getIds(fileQuery);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ private List<String> listCdmIdsByArchivalCollectionId(String id) {
+ " from " + CdmIndexService.TB_NAME
+ " where " + " ("+ CdmIndexService.ENTRY_TYPE_FIELD + " != '"
+ CdmIndexService.ENTRY_TYPE_COMPOUND_CHILD + "'" +
" OR " + CdmIndexService.ENTRY_TYPE_FIELD + " = '" + CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF + "'" +
" OR " + CdmIndexService.ENTRY_TYPE_FIELD + " is null)" +
" AND " + idField + " = '" + idValue + "'");
while (rs.next()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import java.util.stream.Stream;

import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_COMPOUND_CHILD;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_FIELD;
import static org.slf4j.LoggerFactory.getLogger;

Expand Down Expand Up @@ -174,6 +175,7 @@ protected String buildQuery(GenerateSourceFileMappingOptions options) {
return selectStatement
+ " from " + CdmIndexService.TB_NAME
+ " where " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_COMPOUND_CHILD + "'"
+ " or " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF + "'"
+ " or " + ENTRY_TYPE_FIELD + " is null";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ public void update() {
outputLogger.info("Retrieving compound object files for collection...");
}
}
if (ProgressState.DOWNLOADING_PDF.equals(currentProgress)) {
// Transitioning into download pdfs
if (!ProgressState.DOWNLOADING_PDF.equals(lastUpdateState)) {
outputLogger.info("Retrieving pdf object files for collection...");
}
}
if (ProgressState.EXPORT_COMPLETED.equals(currentProgress)) {
// Transitioning into completed state
if (!ProgressState.EXPORT_COMPLETED.equals(lastUpdateState)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public enum ProgressState {
STARTING,
DOWNLOADING_DESC,
DOWNLOADING_CPD,
DOWNLOADING_PDF,
EXPORT_COMPLETED;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,22 @@ public void transitionToDownloadingCpd() throws IOException {
writeState();
}

/**
 * Advance the export progress to the PDF download step.
 * The current state is first checked against DOWNLOADING_CPD via assertState, then the
 * progress state is set to DOWNLOADING_PDF and writeState() is called.
 * @throws IOException declared by this method; presumably raised while writing the state
 */
public void transitionToDownloadingPdf() throws IOException {
    final ProgressState requiredState = ProgressState.DOWNLOADING_CPD;
    final ProgressState nextState = ProgressState.DOWNLOADING_PDF;
    // Check the expected prior step before moving forward
    assertState(requiredState);
    state.setProgressState(nextState);
    writeState();
}

/**
 * Indicate that the export step has completed.
 * The state checked beforehand is DOWNLOADING_PDF: PDF download is the last step run
 * before completion, so the earlier DOWNLOADING_CPD check no longer applies here.
 * After the check, the progress state is set to EXPORT_COMPLETED and writeState() is called.
 * @throws IOException
 */
public void exportingCompleted() throws IOException {
    // Only the final download step may transition to completed
    assertState(ProgressState.DOWNLOADING_PDF);
    state.setProgressState(ProgressState.EXPORT_COMPLETED);
    writeState();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import edu.unc.lib.boxc.migration.cdm.services.CdmIndexService;

import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_COMPOUND_CHILD;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF;
import static edu.unc.lib.boxc.migration.cdm.services.CdmIndexService.ENTRY_TYPE_FIELD;

/**
Expand Down Expand Up @@ -61,9 +62,10 @@ protected int countIndexedFileObjects() {
indexService.setProject(project);
try (Connection conn = indexService.openDbConnection()) {
Statement stmt = conn.createStatement();
// Query for all file objects. If the entry type is null, the object is a individual cdm object
// Query for all file objects. If the entry type is null or pdf, the object is an individual cdm object
ResultSet rs = stmt.executeQuery("select count(*) from " + CdmIndexService.TB_NAME
+ " where " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_COMPOUND_CHILD + "'"
+ " or " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF + "'"
+ " or " + ENTRY_TYPE_FIELD + " is null");
indexedFileObjectsCountCache = rs.getInt(1);
return indexedFileObjectsCountCache;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,21 @@ public void exportValidProjectWithMonographCompoundsTest() throws Exception {
assertCpdFilePresent(project, "196.cpd", "/descriptions/monograph/image/196.cpd");
}

@Test
public void exportValidProjectWithPdfCompoundsTest() throws Exception {
    // Run a full export against the "pdf" test project
    final Path projectDir = createProject("pdf");
    executeExpectSuccess(exportArgs(projectDir));

    // Reload the project from disk and check what the export produced
    final MigrationProject exported = MigrationProjectFactory.loadMigrationProject(projectDir);
    assertTrue(Files.exists(exported.getExportPath()), "Export folder not created");
    assertDescAllFilePresent(exported, "/descriptions/pdf/index/description/desc.all");
    assertCpdFilePresent(exported, "17941.cpd", "/descriptions/pdf/image/17941.cpd");
}

private void assertDescAllFilePresent(MigrationProject project, String expectedContentPath) throws Exception {
assertEquals(IOUtils.toString(getClass().getResourceAsStream(expectedContentPath), StandardCharsets.UTF_8),
FileUtils.readFileToString(CdmFileRetrievalService.getDescAllPath(project).toFile(), StandardCharsets.UTF_8));
Expand Down
Loading

0 comments on commit ca98dfe

Please sign in to comment.