Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

350 save xdd extractions when importing publications as documents #2153

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -185,13 +185,13 @@ const docLink = computed(() =>
);

const figures = computed(
() => doc.value?.assets?.filter((asset) => asset.assetType === 'figure') || []
() => doc.value?.assets?.filter((asset) => asset.assetType === 'Figure') || []
);
const tables = computed(
() => doc.value?.assets?.filter((asset) => asset.assetType === 'table') || []
() => doc.value?.assets?.filter((asset) => asset.assetType === 'Table') || []
);
const equations = computed(
() => doc.value?.assets?.filter((asset) => asset.assetType === 'equation') || []
() => doc.value?.assets?.filter((asset) => asset.assetType === 'Equation') || []
);

const emit = defineEmits(['close-preview', 'asset-loaded']);
Expand Down
2 changes: 1 addition & 1 deletion packages/client/hmi-client/src/types/Types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ export interface DocumentAsset {
username?: string;
fileNames?: string[];
documentUrl?: string;
metadata?: any;
metadata?: { [index: string]: any };
source?: string;
text?: string;
grounding?: Grounding;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


import com.fasterxml.jackson.databind.JsonNode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
Expand All @@ -13,7 +14,7 @@
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
Expand All @@ -22,46 +23,55 @@
import software.uncharted.terarium.hmiserver.controller.services.DocumentAssetService;
import software.uncharted.terarium.hmiserver.controller.services.DownloadService;
import software.uncharted.terarium.hmiserver.models.dataservice.AssetType;
import software.uncharted.terarium.hmiserver.models.dataservice.document.DocumentExtraction;
import software.uncharted.terarium.hmiserver.models.dataservice.PresignedURL;
import software.uncharted.terarium.hmiserver.models.dataservice.document.AddDocumentAssetFromXDDRequest;
import software.uncharted.terarium.hmiserver.models.dataservice.document.AddDocumentAssetFromXDDResponse;
import software.uncharted.terarium.hmiserver.models.dataservice.document.DocumentAsset;
import software.uncharted.terarium.hmiserver.models.documentservice.Document;
import software.uncharted.terarium.hmiserver.models.documentservice.Extraction;
import software.uncharted.terarium.hmiserver.models.documentservice.responses.XDDExtractionsResponseOK;
import software.uncharted.terarium.hmiserver.models.documentservice.responses.XDDResponse;
import software.uncharted.terarium.hmiserver.proxies.dataservice.DocumentProxy;
import software.uncharted.terarium.hmiserver.proxies.dataservice.ProjectProxy;
import software.uncharted.terarium.hmiserver.proxies.documentservice.ExtractionProxy;
import software.uncharted.terarium.hmiserver.proxies.jsdelivr.JsDelivrProxy;
import software.uncharted.terarium.hmiserver.proxies.knowledge.KnowledgeMiddlewareProxy;
import software.uncharted.terarium.hmiserver.proxies.skema.SkemaUnifiedProxy;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;

@RequestMapping("/document-asset")
@RestController
@Slf4j
@RequiredArgsConstructor
public class DocumentController implements SnakeCaseController {

@Autowired
DocumentProxy proxy;
final DocumentProxy proxy;

@Autowired
SkemaUnifiedProxy skemaUnifiedProxy;
final ExtractionProxy extractionProxy;

@Autowired
JsDelivrProxy gitHubProxy;
final SkemaUnifiedProxy skemaUnifiedProxy;

@Autowired
KnowledgeMiddlewareProxy knowledgeMiddlewareProxy;
final JsDelivrProxy gitHubProxy;

@Autowired
ProjectProxy projectProxy;
final DownloadService downloadService;

@Autowired
DocumentAssetService documentAssetService;
final KnowledgeMiddlewareProxy knowledgeMiddlewareProxy;

final ProjectProxy projectProxy;

final DocumentAssetService documentAssetService;

@Value("${xdd.api-key}")
String apikey;

@GetMapping
public ResponseEntity<List<DocumentAsset>> getDocuments(
Expand Down Expand Up @@ -203,78 +213,57 @@ public ResponseEntity<Integer> uploadDocumentFromGithub(
public ResponseEntity<AddDocumentAssetFromXDDResponse> createDocumentFromXDD(
@RequestBody final AddDocumentAssetFromXDDRequest body
) {
try (final CloseableHttpClient httpclient = HttpClients.custom()
.disableRedirectHandling()
.build()) {

//build initial response
final AddDocumentAssetFromXDDResponse response = new AddDocumentAssetFromXDDResponse();
response.setExtractionJobId(null);
response.setPdfUploadError(false);
response.setDocumentAssetId(null);


// get preliminary info to build document asset
final Document document = body.getDocument();
final String projectId = body.getProjectId();
final String name = document.getTitle();
final String username = projectProxy.getProject(projectId).getBody().getUsername();
final String doi = documentAssetService.getDocumentDoi(document);
final String fileUrl = DownloadService.getPDFURL("https://unpaywall.org/" + doi);
final String filename = DownloadService.pdfNameFromUrl(fileUrl);
final List<String> filenames = new ArrayList<String>();
if(filename != null){
filenames.add(filename);
}

//create document asset
final DocumentAsset documentAsset = new DocumentAsset();
documentAsset.setName(name);
documentAsset.setDescription(name);
documentAsset.setUsername(username);
documentAsset.setFileNames(filenames);
//build initial response
AddDocumentAssetFromXDDResponse response = new AddDocumentAssetFromXDDResponse();
response.setExtractionJobId(null);
response.setPdfUploadError(false);
response.setDocumentAssetId(null);

final String newDocumentAssetId = proxy.createAsset(convertObjectToSnakeCaseJsonNode(documentAsset)).getBody().get("id").asText();
response.setDocumentAssetId(newDocumentAssetId);

//add asset to project
projectProxy.createAsset(projectId, AssetType.documents, newDocumentAssetId);
// get preliminary info to build document asset
Document document = body.getDocument();
String projectId = body.getProjectId();
String doi = documentAssetService.getDocumentDoi(document);
String username = projectProxy.getProject(projectId).getBody().getUsername();

//if there is no filename that means we cannot get the pdf return ok with errors.
if(filename == null || filename.isEmpty()){
response.setPdfUploadError(true);
return ResponseEntity.ok(response);
}
// get pdf url and filename
String fileUrl = null;
String filename = null;
try {
fileUrl = downloadService.getPDFURL("https://unpaywall.org/" + doi);
filename = downloadService.pdfNameFromUrl(fileUrl);
} catch (IOException | URISyntaxException e) {
throw new RuntimeException(e);
}

final byte[] fileAsBytes = DownloadService.getPDF("https://unpaywall.org/" + doi);
XDDResponse<XDDExtractionsResponseOK> extractionResponse = extractionProxy.getExtractions(doi, null, null, null, null, apikey);

//if this service fails, return ok with errors
if(fileAsBytes == null || fileAsBytes.length == 0){
response.setPdfUploadError(true);
return ResponseEntity.ok(response);
}
// create a new document asset from the metadata in the xdd document
DocumentAsset documentAsset = createDocumentAssetFromXDDDocument(document, username, extractionResponse.getSuccess().getData());
if(filename != null)
documentAsset.getFileNames().add(filename);

// upload pdf to document asset
final HttpEntity fileEntity = new ByteArrayEntity(fileAsBytes, ContentType.APPLICATION_OCTET_STREAM);
final PresignedURL presignedURL = proxy.getUploadUrl(newDocumentAssetId, filename).getBody();
final HttpPut put = new HttpPut(presignedURL.getUrl());
put.setEntity(fileEntity);
final HttpResponse pdfUploadResponse = httpclient.execute(put);
// Upload the document to TDS in order to get a new ID to pair our files we want to upload with.
String newDocumentAssetId = proxy.createAsset(convertObjectToSnakeCaseJsonNode(documentAsset)).getBody().get("id").asText();
response.setDocumentAssetId(newDocumentAssetId);

if(pdfUploadResponse.getStatusLine().getStatusCode() >= 400) {
response.setPdfUploadError(true);
return ResponseEntity.ok(response);
}
// Upload the PDF from unpaywall
String extractionJobId = uploadPDFFileToDocumentThenExtract(doi, filename, newDocumentAssetId);
if(extractionJobId == null)
response.setPdfUploadError(true);
else
response.setExtractionJobId(extractionJobId);

// fire and forget pdf extractions
final String jobId = knowledgeMiddlewareProxy.postPDFToCosmos(newDocumentAssetId).getBody().get("id").asText();
response.setExtractionJobId(jobId);
// Now upload additional extraction files
uploadXDDExtractions(newDocumentAssetId, extractionResponse.getSuccess().getData());

//add asset to project
projectProxy.createAsset(projectId, AssetType.documents, newDocumentAssetId);

return ResponseEntity.ok(response);

return ResponseEntity.ok(response);
} catch (final Exception e) {
log.error("Unable to GET document data", e);
return ResponseEntity.internalServerError().build();
}
}


Expand Down Expand Up @@ -320,5 +309,125 @@ public ResponseEntity<String> getDocumentFileAsText(@PathVariable("id") final St
log.error("Unable to GET document data", e);
return ResponseEntity.internalServerError().build();
}

}

/**
 * Builds a new, not-yet-persisted {@link DocumentAsset} from an xDD document.
 *
 * <p>The document title is used as both the name and the description of the asset. Every
 * extraction whose ASKEM class matches (ignoring case) the equation, figure or table asset
 * type is recorded as a {@link DocumentExtraction} named {@code extraction_<i>.png}, where
 * {@code <i>} is the extraction's index in {@code extractions}. The same file name is added
 * to the asset's file names, so it must stay in sync with the names used by
 * {@code uploadXDDExtractions}. Extractions without an ASKEM class are skipped.
 *
 * @param document xdd document supplying the title and, optionally, GitHub URLs
 * @param username current user name, stored on the asset
 * @param extractions list of extractions associated with the document; when {@code null}
 *                    the asset's extraction list is left unset
 * @return the populated document asset
 */
private DocumentAsset createDocumentAssetFromXDDDocument(Document document, String username, List<Extraction> extractions) {
    final String name = document.getTitle();

    //create document asset
    final DocumentAsset documentAsset = new DocumentAsset();
    documentAsset.setName(name);
    documentAsset.setDescription(name);
    documentAsset.setUsername(username);
    documentAsset.setFileNames(new ArrayList<>());

    if (extractions != null) {
        documentAsset.setAssets(new ArrayList<>());
        for (int i = 0; i < extractions.size(); i++) {
            final Extraction extraction = extractions.get(i);
            final String askemClass = extraction.getAskemClass();

            // Constant-first comparison: equalsIgnoreCase(null) is false, so an extraction
            // without an ASKEM class is skipped instead of raising a NullPointerException.
            final boolean supported = DocumentExtraction.EQUATION_ASSETTYPE.equalsIgnoreCase(askemClass)
                || DocumentExtraction.FIGURE_ASSETTYPE.equalsIgnoreCase(askemClass)
                || DocumentExtraction.TABLE_ASSETTYPE.equalsIgnoreCase(askemClass);
            if (!supported) {
                continue;
            }

            final DocumentExtraction documentExtraction = new DocumentExtraction().setMetadata(new HashMap<>());
            documentExtraction.setAssetType(askemClass);
            // The index is the position in the full extraction list, not among the kept ones.
            documentExtraction.setFileName("extraction_" + i + ".png");
            documentExtraction.getMetadata().put("title", extraction.getProperties().getTitle());
            documentExtraction.getMetadata().put("description", extraction.getProperties().getCaption());
            documentAsset.getAssets().add(documentExtraction);
            documentAsset.getFileNames().add(documentExtraction.getFileName());
        }
    }

    // Metadata is only created when there is something to store in it.
    if (document.getGithubUrls() != null && !document.getGithubUrls().isEmpty()) {
        documentAsset.setMetadata(new HashMap<>());
        documentAsset.getMetadata().put("github_urls", document.getGithubUrls());
    }

    return documentAsset;
}

/**
 * Uploads the images of the equation, figure and table extractions of an xDD document to
 * the given document asset.
 *
 * <p>Each matching extraction's base64 image is decoded and PUT to a presigned upload URL
 * under the file name {@code extraction_<i>.png}, where {@code <i>} is the extraction's index
 * in {@code extractions}; this must stay in sync with the file names produced by
 * {@code createDocumentAssetFromXDDDocument}. Extractions without an ASKEM class are skipped.
 * An upload answered with an HTTP status of 400 or above is logged and the remaining
 * extractions are still attempted.
 *
 * @param docId id of the document asset the files are attached to
 * @param extractions extractions returned for the document; may be {@code null} or empty,
 *                    in which case nothing is uploaded
 * @throws RuntimeException if an image cannot be decoded, no upload URL can be obtained or
 *                          an upload request cannot be executed
 */
private void uploadXDDExtractions(String docId, List<Extraction> extractions){
    if (extractions == null || extractions.isEmpty()) {
        return;
    }

    // One client serves every upload instead of building and closing one per extraction.
    try (CloseableHttpClient httpclient = HttpClients.custom()
            .disableRedirectHandling()
            .build()) {

        for (int i = 0; i < extractions.size(); i++) {
            final Extraction extraction = extractions.get(i);
            final String askemClass = extraction.getAskemClass();

            // Constant-first comparison: equalsIgnoreCase(null) is false, so an extraction
            // without an ASKEM class is skipped instead of raising a NullPointerException.
            final boolean supported = DocumentExtraction.EQUATION_ASSETTYPE.equalsIgnoreCase(askemClass)
                || DocumentExtraction.FIGURE_ASSETTYPE.equalsIgnoreCase(askemClass)
                || DocumentExtraction.TABLE_ASSETTYPE.equalsIgnoreCase(askemClass);
            if (!supported) {
                continue;
            }

            final String filename = "extraction_" + i + ".png";
            final byte[] imageAsBytes = Base64.getDecoder().decode(extraction.getProperties().getImage());
            final PresignedURL presignedURL = proxy.getUploadUrl(docId, filename).getBody();

            final HttpPut put = new HttpPut(presignedURL.getUrl());
            put.setEntity(new ByteArrayEntity(imageAsBytes, ContentType.APPLICATION_OCTET_STREAM));

            // The response-handler overload releases the connection once the handler returns,
            // which is required now that the client is shared across uploads.
            final int status = httpclient.execute(put, uploadResponse -> uploadResponse.getStatusLine().getStatusCode());
            if (status >= 400) {
                log.error("Unable to upload extraction {} to document {}: HTTP status {}", filename, docId, status);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException("Unable to upload xDD extractions for document " + docId, e);
    }
}

/**
 * Downloads the PDF for a DOI, uploads it to a document asset and then starts the PDF
 * extraction without waiting for it to finish.
 *
 * @param doi DOI of the publication; the PDF is requested from
 *            {@code https://unpaywall.org/<doi>}
 * @param filename name the PDF is stored under on the document asset; when {@code null} or
 *                 empty nothing is downloaded or uploaded
 * @param docId id of the document asset that receives the PDF
 * @return the id of the extraction job, or {@code null} when there is no file name, the PDF
 *         could not be downloaded, the upload was answered with an HTTP status of 400 or
 *         above, or any exception occurred (the exception is logged)
 */
private String uploadPDFFileToDocumentThenExtract(String doi, String filename, String docId){
    // Without a file name there is nothing to register the upload under, so report the
    // failure to the caller instead of requesting an upload URL for a null/empty name.
    if (filename == null || filename.isEmpty()) {
        log.warn("No PDF file name available for document {}, skipping PDF upload", docId);
        return null;
    }

    try (CloseableHttpClient httpclient = HttpClients.custom()
            .disableRedirectHandling()
            .build()) {

        final byte[] fileAsBytes = downloadService.getPDF("https://unpaywall.org/" + doi);

        // if the download service fails there is nothing to upload
        if (fileAsBytes == null || fileAsBytes.length == 0) {
            return null;
        }

        // upload pdf to document asset
        final HttpEntity fileEntity = new ByteArrayEntity(fileAsBytes, ContentType.APPLICATION_OCTET_STREAM);
        final PresignedURL presignedURL = proxy.getUploadUrl(docId, filename).getBody();
        final HttpPut put = new HttpPut(presignedURL.getUrl());
        put.setEntity(fileEntity);
        final HttpResponse pdfUploadResponse = httpclient.execute(put);

        if (pdfUploadResponse.getStatusLine().getStatusCode() >= 400) {
            return null;
        }

        // fire and forget pdf extractions
        return knowledgeMiddlewareProxy.postPDFToCosmos(docId).getBody().get("id").asText();

    } catch (Exception e) {
        log.error("Unable to upload PDF document then extract", e);
        return null;
    }
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
import software.uncharted.terarium.hmiserver.annotations.TSModel;
import software.uncharted.terarium.hmiserver.annotations.TSOptional;
import software.uncharted.terarium.hmiserver.models.dataservice.Concept;
import software.uncharted.terarium.hmiserver.models.dataservice.DocumentExtraction;
import software.uncharted.terarium.hmiserver.models.dataservice.Grounding;

import java.time.Instant;
import java.util.HashMap;
import java.util.List;

@Data
Expand Down Expand Up @@ -41,7 +40,7 @@ public class DocumentAsset {
private String documentUrl;

@TSOptional
private Object metadata;
private HashMap<String, Object> metadata;

@TSOptional
private String source;
Expand Down
Loading