Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: create doc using unpaywall #2020

Merged
merged 15 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,19 @@

<script setup lang="ts">
import { computed, PropType } from 'vue';
import { isDataset, isModel, isDocument } from '@/utils/data-util';
import { isDataset, isModel, isDocument, getDocumentDoi, pdfNameFromUrl } from '@/utils/data-util';
import { ResultType } from '@/types/common';
import { AssetType, Document, ExternalPublication } from '@/types/Types';
import { AssetType, Document, DocumentAsset } from '@/types/Types';
import dropdown from 'primevue/dropdown';
import Button from 'primevue/button';
import { addDocuments } from '@/services/external';
import { useRouter } from 'vue-router';
import { useProjects } from '@/composables/project';
import { addDocumentFromDOI, createNewDocumentAsset } from '@/services/document-assets';
import { get as getProject } from '@/services/project';
import { getPDFURL } from '@/services/generate-download-link';
import { useToastService } from '@/services/toast';

const toast = useToastService();

const router = useRouter();

Expand All @@ -52,22 +57,42 @@ const addResourcesToProject = async (projectId: string) => {
// send selected items to the store
props.selectedSearchItems.forEach(async (selectedItem) => {
if (isDocument(selectedItem)) {
const body: ExternalPublication = {
xdd_uri: (selectedItem as Document).gddId,
title: (selectedItem as Document).title
const document = selectedItem as Document;
const name = document.title;

// get the project username
const project = await getProject(projectId);
const username = project?.username ?? '';

// get document doi
const doi = getDocumentDoi(document);
if (!doi) return;

// get filename
const fileUrl = await getPDFURL(doi);
const filename = pdfNameFromUrl(fileUrl);

const filenames: string[] = [];
if (filename) filenames.push(filename);
// create document asset
const documentAsset: DocumentAsset = {
name,
description: name,
fileNames: filenames,
username
};

// FIXME: handle cases where assets is already added to the project

// first, insert into the proper table/collection
const res = await addDocuments(body);
if (res) {
const documentId = res.id;

// then, link and store in the project assets
const assetsType = AssetType.Publications;
await useProjects().addAsset(assetsType, documentId, projectId);
const newDocument: DocumentAsset | null = await createNewDocumentAsset(documentAsset);
if (!newDocument || !newDocument.id) return;
// if there is no filename, we will not be able to upload the pdf, show error.
if (filename) {
await addDocumentFromDOI(newDocument.id, doi, filename);
} else {
toast.error('', `Unable to upload PDF for ${name}`);
}

// finally add asset to project
await useProjects().addAsset(AssetType.Documents, newDocument.id, projectId);
}
if (isModel(selectedItem)) {
// FIXME: handle cases where assets is already added to the project
Expand Down
21 changes: 20 additions & 1 deletion packages/client/hmi-client/src/services/document-assets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,30 @@ async function getDocumentFileAsText(documentId: string, fileName: string): Prom
return response.data;
}

/**
 * Ask the server to fetch the PDF behind a DOI and attach it to an existing document asset.
 *
 * @param documentId id of the document asset that receives the file
 * @param doi DOI used by the server to locate the PDF
 * @param filename name under which the uploaded file is stored
 * @returns the response body, or `null` when `documentId` is empty or the request fails
 */
async function addDocumentFromDOI(
  documentId: string,
  doi: string,
  filename: string
): Promise<string | null> {
  if (!documentId) return null;

  // DOIs and file names may contain characters that are reserved in a URL
  // (`&`, `#`, `+`, `?`, spaces, ...); encode them so they reach the server intact.
  const query = `doi=${encodeURIComponent(doi)}&filename=${encodeURIComponent(filename)}`;
  const response = await API.put(
    `/document-asset/${encodeURIComponent(documentId)}/uploadDocumentFromDOI?${query}`
  );

  if (!response || response.status >= 400) {
    logger.error('Error upload file from doi');
    return null;
  }

  return response.data;
}
export {
getAll,
getDocumentAsset,
uploadDocumentAssetToProject,
downloadDocumentAsset,
createNewDocumentFromGithubFile,
getDocumentFileAsText
getDocumentFileAsText,
addDocumentFromDOI,
createNewDocumentAsset
};
2 changes: 1 addition & 1 deletion packages/client/hmi-client/src/services/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ export const profileDataset = async (datasetId: string, documentId: string | nul
return response.data.id;
};

const extractTextFromPDFDocument = async (documentId: string): Promise<string | null> => {
export const extractTextFromPDFDocument = async (documentId: string): Promise<string | null> => {
try {
const response = await API.post(`/knowledge/pdf-to-cosmos?document_id=${documentId}`);
if (response?.status === 200 && response?.data?.id) return response.data.id;
Expand Down
11 changes: 11 additions & 0 deletions packages/client/hmi-client/src/utils/data-util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,14 @@ export function getDocumentDoi(doc: Document | null) {
}
return docIdentifier;
}

/**
 * Extract the PDF file name from the last path segment of a URL.
 *
 * The `.pdf` extension is matched case-insensitively. When the URL does not end in
 * `.pdf` as given, a trailing query string or fragment (`?download=1`, `#page=2`)
 * is ignored and the match is retried.
 *
 * @param url URL expected to point at a PDF file
 * @returns the file name including its extension, or `null` when the URL does not name a PDF
 */
export function pdfNameFromUrl(url: string): string | null {
  // Guard against empty (or, at runtime, missing) input instead of throwing on `.match`.
  if (!url) return null;

  const pdfSegment = /\/([^/]+\.pdf)$/i;

  // Try the URL exactly as given first so previously accepted URLs yield the same name;
  // only on failure retry with the query string / fragment removed.
  const match = url.match(pdfSegment) ?? url.replace(/[?#].*$/, '').match(pdfSegment);

  return match?.[1] ?? null;
}
blanchco marked this conversation as resolved.
Show resolved Hide resolved
18 changes: 18 additions & 0 deletions packages/client/hmi-client/tests/unit/utils/data-util.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import { pdfNameFromUrl } from '@/utils/data-util';
import { describe, expect, it } from 'vitest';

const pdfUrl = 'https://example.com/documents/document.pdf';
const nonPdfUrl = 'https://example.com/documents/document.notapdf';
YohannParis marked this conversation as resolved.
Show resolved Hide resolved
// pdfNameFromUrl: a URL ending in ".pdf" yields its file name, any other URL yields null.
describe('data util tests', () => {
  it('gets pdf url name', () => {
    expect(pdfNameFromUrl(pdfUrl)).toBe('document.pdf');
  });

  it('returns null on bad url', () => {
    expect(pdfNameFromUrl(nonPdfUrl)).toBeNull();
  });
});
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package software.uncharted.terarium.hmiserver.controller;

import lombok.extern.slf4j.Slf4j;
import software.uncharted.terarium.hmiserver.controller.services.DownloadService;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
Expand All @@ -9,6 +11,7 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders;
Expand All @@ -30,130 +33,12 @@
@Slf4j
public class DownloadController {

/**
 * Normalizes a relative url fragment and a base url to a fully qualified Url.
 *
 * The fragment is resolved against the base url, so the base's scheme, user-info,
 * host and port are all preserved. Fragments without a leading "/" are resolved
 * relative to the base url's path.
 *
 * @param relativeUrl the fragment
 * @param baseUrl     the base url
 * @return a fully qualified url
 * @throws URISyntaxException if either argument is not a valid URI
 */
private String normalizeRelativeUrl(final String relativeUrl, final String baseUrl) throws URISyntaxException {
    URI base = new URI(baseUrl);

    // URI.resolve joins a path-relative fragment directly onto the authority when the
    // base has an empty path ("https://host" + "a.pdf"); give the base a root path first.
    if (base.getPath() == null || base.getPath().isEmpty()) {
        base = new URI(base.getScheme(), base.getAuthority(), "/", null, null);
    }

    return base.resolve(new URI(relativeUrl)).toString();
}

/**
 * Gets a PDF file from a given url.
 *
 * Redirects are followed manually. When the final document is an HTML page, the first
 * anchor whose href ends in ".pdf" (case-insensitive) is followed instead.
 *
 * @param url the url location (that may contain redirects)
 * @return the bytes of the document that was finally reached, or null when a redirect
 *         carries no Location header, the response has no body, or the HTML page
 *         contains no link to a pdf
 * @throws IOException        if a request fails or a body cannot be read
 * @throws URISyntaxException if a relative redirect or link cannot be resolved
 */
private byte[] getPDF(final String url) throws IOException, URISyntaxException {
    // try-with-resources closes the client (and its connections) on every exit path
    try (CloseableHttpClient httpclient = HttpClients.custom()
            .disableRedirectHandling()
            .build()) {

        final HttpResponse response = httpclient.execute(new HttpGet(url));
        final int status = response.getStatusLine().getStatusCode();

        // Follow redirects until we actually get a document
        if (status >= 300 && status <= 310) {
            if (!response.containsHeader("Location")) {
                // a 3xx without a target (e.g. 304) cannot be followed
                return null;
            }
            final String redirect = response.getFirstHeader("Location").getValue();
            return getPDF(redirect.startsWith("http") ? redirect : normalizeRelativeUrl(redirect, url));
        }

        if (response.getEntity() == null) {
            return null;
        }

        // We actually have a document, if it's an HTML page with the content, look for
        // a link to the pdf itself and follow it
        if (response.getEntity().getContentType() != null
                && response.getEntity().getContentType().getValue().contains("html")) {
            final String html = IOUtils.toString(response.getEntity().getContent(), StandardCharsets.UTF_8);
            final Document document = Jsoup.parse(html);
            final Elements links = document.select("a");

            // compare the extension case-insensitively but keep the href untouched:
            // url paths are case-sensitive, so a lower-cased copy may not exist
            final String pdfUrl = links.stream()
                .map(element -> element.attributes().get("href"))
                .filter(href -> href.toLowerCase().endsWith(".pdf"))
                .findFirst().orElse(null);

            if (pdfUrl == null) {
                return null;
            }

            return getPDF(pdfUrl.startsWith("http") ? pdfUrl : normalizeRelativeUrl(pdfUrl, url));
        }

        // read the body before the client is closed
        return IOUtils.toByteArray(response.getEntity().getContent());
    }
}

/**
 * Gets the final url of a PDF file, starting from a given url.
 *
 * Redirects are followed manually. When the final document is an HTML page, the first
 * anchor whose href ends in ".pdf" (case-insensitive) is followed instead.
 *
 * @param url the url location (that may contain redirects)
 * @return the url of the document that was finally reached, or null when a redirect
 *         carries no Location header or the HTML page contains no link to a pdf
 * @throws IOException        if a request fails or a body cannot be read
 * @throws URISyntaxException if a relative redirect or link cannot be resolved
 */
private String getPDFURL(final String url) throws IOException, URISyntaxException {
    // try-with-resources closes the client (and its connections) on every exit path
    try (CloseableHttpClient httpclient = HttpClients.custom()
            .disableRedirectHandling()
            .build()) {

        final HttpResponse response = httpclient.execute(new HttpGet(url));
        final int status = response.getStatusLine().getStatusCode();

        // Follow redirects until we actually get a document
        if (status >= 300 && status <= 310) {
            if (!response.containsHeader("Location")) {
                // a 3xx without a target (e.g. 304) cannot be followed
                return null;
            }
            final String redirect = response.getFirstHeader("Location").getValue();
            return getPDFURL(redirect.startsWith("http") ? redirect : normalizeRelativeUrl(redirect, url));
        }

        // We actually have a document, if it's an HTML page with the content, look for
        // a link to the pdf itself and follow it
        if (response.getEntity() != null
                && response.getEntity().getContentType() != null
                && response.getEntity().getContentType().getValue().contains("html")) {
            final String html = IOUtils.toString(response.getEntity().getContent(), StandardCharsets.UTF_8);
            final Document document = Jsoup.parse(html);
            final Elements links = document.select("a");

            // compare the extension case-insensitively but keep the href untouched:
            // url paths are case-sensitive, so a lower-cased copy may not exist
            final String pdfUrl = links.stream()
                .map(element -> element.attributes().get("href"))
                .filter(href -> href.toLowerCase().endsWith(".pdf"))
                .findFirst().orElse(null);

            if (pdfUrl == null) {
                return null;
            }

            return getPDFURL(pdfUrl.startsWith("http") ? pdfUrl : normalizeRelativeUrl(pdfUrl, url));
        }

        // not a redirect and not an HTML page: this url is the document itself
        return url;
    }
}
@Autowired
blanchco marked this conversation as resolved.
Show resolved Hide resolved
DownloadService downloadService;

@GetMapping
public ResponseEntity<Resource> get(@RequestParam("doi") final String doi) throws IOException, URISyntaxException {
final byte[] pdfBytes = getPDF("https://unpaywall.org/" + doi);
final byte[] pdfBytes = downloadService.getPDF("https://unpaywall.org/" + doi);
if (pdfBytes != null) {

return ResponseEntity.ok()
Expand All @@ -168,7 +53,7 @@ public ResponseEntity<Resource> get(@RequestParam("doi") final String doi) throw

@GetMapping("/url")
public ResponseEntity<String> getURL(@QueryParam("url") final String url) throws IOException, URISyntaxException {
final String pdfLink = getPDFURL("https://unpaywall.org/" + url);
final String pdfLink = downloadService.getPDFURL("https://unpaywall.org/" + url);
if (pdfLink != null) {
return ResponseEntity.ok(pdfLink);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,21 @@
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import software.uncharted.terarium.hmiserver.controller.SnakeCaseController;
import software.uncharted.terarium.hmiserver.controller.services.DownloadService;
import software.uncharted.terarium.hmiserver.models.dataservice.AssetType;
import software.uncharted.terarium.hmiserver.models.dataservice.PresignedURL;
import software.uncharted.terarium.hmiserver.models.dataservice.document.DocumentAsset;
import software.uncharted.terarium.hmiserver.proxies.dataservice.DocumentProxy;
import software.uncharted.terarium.hmiserver.proxies.dataservice.ProjectProxy;
import software.uncharted.terarium.hmiserver.proxies.jsdelivr.JsDelivrProxy;
import software.uncharted.terarium.hmiserver.proxies.knowledge.KnowledgeMiddlewareProxy;

import org.apache.http.entity.StringEntity;
import org.apache.commons.io.IOUtils;
import java.nio.charset.StandardCharsets;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.List;

@RequestMapping("/document-asset")
Expand All @@ -39,6 +45,15 @@ public class DocumentController implements SnakeCaseController {
@Autowired
JsDelivrProxy gitHubProxy;

@Autowired
DownloadService downloadService;

@Autowired
KnowledgeMiddlewareProxy knowledgeMiddlewareProxy;

@Autowired
ProjectProxy projectProxy;

@GetMapping
public ResponseEntity<List<DocumentAsset>> getDocuments(
@RequestParam(name = "page_size", defaultValue = "100", required = false) final Integer pageSize,
Expand Down Expand Up @@ -128,6 +143,31 @@ public ResponseEntity<Integer> uploadDocumentFromGithub(

}

@PutMapping(value = "/{id}/uploadDocumentFromDOI")
public ResponseEntity<Integer> uploadDocumentFromDOI(
@PathVariable("id") String id,
@RequestParam(name = "doi", required = true) String doi,
@RequestParam(name = "filename", required = true) String filename
) throws IOException, URISyntaxException {
CloseableHttpClient httpclient = HttpClients.custom()
.disableRedirectHandling()
.build();

byte[] fileAsBytes = downloadService.getPDF("https://unpaywall.org/" + doi);
YohannParis marked this conversation as resolved.
Show resolved Hide resolved
HttpEntity fileEntity = new ByteArrayEntity(fileAsBytes, ContentType.APPLICATION_OCTET_STREAM);
final PresignedURL presignedURL = proxy.getUploadUrl(id, filename).getBody();
final HttpPut put = new HttpPut(presignedURL.getUrl());
put.setEntity(fileEntity);
final HttpResponse response = httpclient.execute(put);

if(response.getStatusLine().getStatusCode() < 400) {
// fire and forgot pdf extractions
knowledgeMiddlewareProxy.postPDFToCosmos(id);
blanchco marked this conversation as resolved.
Show resolved Hide resolved
}

return ResponseEntity.ok(response.getStatusLine().getStatusCode());
}


@GetMapping(value = "/{id}/downloadDocument", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE)
public ResponseEntity<byte[]> downloadDocument(
Expand Down
Loading