Skip to content

Commit

Permalink
feat: create doc using unpaywall (#2020)
Browse files Browse the repository at this point in the history
Co-authored-by: Cole Blanchard <cblanchard@Coles-MacBook-Pro.local>
Co-authored-by: Yohann Paris <github@yohannparis.com>
  • Loading branch information
3 people authored Oct 17, 2023
1 parent 6a954cc commit 4cd06b9
Show file tree
Hide file tree
Showing 12 changed files with 400 additions and 143 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
import { computed, PropType } from 'vue';
import { isDataset, isModel, isDocument } from '@/utils/data-util';
import { ResultType } from '@/types/common';
import { AssetType, Document, ExternalPublication } from '@/types/Types';
import { AssetType, Document } from '@/types/Types';
import dropdown from 'primevue/dropdown';
import Button from 'primevue/button';
import { addDocuments } from '@/services/external';
import { useRouter } from 'vue-router';
import { useProjects } from '@/composables/project';
import { createDocumentFromXDD } from '@/services/document-assets';
const router = useRouter();
Expand All @@ -52,22 +52,10 @@ const addResourcesToProject = async (projectId: string) => {
// send selected items to the store
props.selectedSearchItems.forEach(async (selectedItem) => {
if (isDocument(selectedItem)) {
const body: ExternalPublication = {
xdd_uri: (selectedItem as Document).gddId,
title: (selectedItem as Document).title
};
// FIXME: handle cases where assets is already added to the project
// first, insert into the proper table/collection
const res = await addDocuments(body);
if (res) {
const documentId = res.id;
// then, link and store in the project assets
const assetsType = AssetType.Publications;
await useProjects().addAsset(assetsType, documentId, projectId);
}
const document = selectedItem as Document;
await createDocumentFromXDD(document, projectId);
// finally add asset to project
await useProjects().get(projectId);
}
if (isModel(selectedItem)) {
// FIXME: handle cases where assets is already added to the project
Expand Down
23 changes: 21 additions & 2 deletions packages/client/hmi-client/src/services/document-assets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*/

import API from '@/api/api';
import { DocumentAsset } from '@/types/Types';
import { AddDocumentAssetFromXDDResponse, Document, DocumentAsset } from '@/types/Types';
import { logger } from '@/utils/logger';
import { Ref } from 'vue';
/**
Expand Down Expand Up @@ -171,11 +171,30 @@ async function getDocumentFileAsText(documentId: string, fileName: string): Prom
return response.data;
}

/**
 * Posts an xDD document together with a project id to the
 * `/document-asset/createDocumentFromXDD` endpoint.
 *
 * @param document - the xDD document to send in the request body
 * @param projectId - id of the project sent alongside the document
 * @returns the response payload, or `null` when either argument is falsy,
 *   when no response object comes back, or when the status is 400 or above
 */
async function createDocumentFromXDD(
  document: Document,
  projectId: string
): Promise<AddDocumentAssetFromXDDResponse | null> {
  // Nothing to send without both pieces of information.
  const hasInputs = Boolean(document) && Boolean(projectId);
  if (!hasInputs) {
    return null;
  }

  const payload = { document, projectId };
  const response = await API.post(`/document-asset/createDocumentFromXDD`, payload);

  // Success means: a response object exists and its status is not in the error range.
  if (response && !(response.status >= 400)) {
    return response.data;
  }

  logger.error('Error upload file from doi');
  return null;
}
export {
getAll,
getDocumentAsset,
uploadDocumentAssetToProject,
downloadDocumentAsset,
createNewDocumentFromGithubFile,
getDocumentFileAsText
getDocumentFileAsText,
createDocumentFromXDD,
createNewDocumentAsset
};
2 changes: 1 addition & 1 deletion packages/client/hmi-client/src/services/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ export const profileDataset = async (datasetId: string, documentId: string | nul
return response.data.id;
};

const extractTextFromPDFDocument = async (documentId: string): Promise<string | null> => {
export const extractTextFromPDFDocument = async (documentId: string): Promise<string | null> => {
try {
const response = await API.post(`/knowledge/pdf-to-cosmos?document_id=${documentId}`);
if (response?.status === 200 && response?.data?.id) return response.data.id;
Expand Down
11 changes: 11 additions & 0 deletions packages/client/hmi-client/src/types/Types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,17 @@ export interface DatasetColumn {
description?: string;
}

/**
 * Request shape pairing an xDD document with a project id.
 * NOTE(review): presumably the body of POST
 * `/document-asset/createDocumentFromXDD` — confirm against the server DTO.
 */
export interface AddDocumentAssetFromXDDRequest {
// The xDD document to create a document asset from.
document: Document;
// Id of the target project.
projectId: string;
}

/**
 * Response shape for the "create document asset from xDD" call.
 * NOTE(review): field meanings below are inferred from their names only —
 * confirm against the server implementation.
 */
export interface AddDocumentAssetFromXDDResponse {
// Presumably the id of the newly created document asset.
documentAssetId: string;
// Presumably true when the PDF could not be fetched/uploaded — TODO confirm.
pdfUploadError: boolean;
// Presumably the id of an extraction job started for the document — TODO confirm.
extractionJobId: string;
}

export interface DocumentAsset {
id?: string;
name?: string;
Expand Down
11 changes: 11 additions & 0 deletions packages/client/hmi-client/src/utils/data-util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,14 @@ export function getDocumentDoi(doc: Document | null) {
}
return docIdentifier;
}

/**
 * Extracts the PDF file name (last path segment ending in `.pdf`,
 * case-insensitive) from a URL.
 *
 * Both the query string (`?…`) and the fragment (`#…`) are ignored, so
 * `https://host/a/doc.pdf?x=1` and `https://host/a/doc.pdf#page=2` both
 * yield `doc.pdf`.
 *
 * @param url - the URL to inspect
 * @returns the file name including its extension, or `null` when the path
 *   does not end in a `/`-preceded `.pdf` segment
 */
export function pdfNameFromUrl(url: string): string | null {
  // Cut at the first '?' or '#', whichever comes first; neither belongs to the path.
  const cutAt = url.search(/[?#]/);
  const path = cutAt === -1 ? url : url.slice(0, cutAt);

  // The name must be a final path segment, hence the required leading '/'.
  const match = /\/([^/]+\.pdf)$/i.exec(path);
  return match?.[1] ?? null;
}
25 changes: 25 additions & 0 deletions packages/client/hmi-client/tests/unit/utils/data-util.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { pdfNameFromUrl } from '@/utils/data-util';
import { describe, expect, it } from 'vitest';

// Fixture URLs: a plain PDF link, a non-PDF link, and a PDF link carrying a query string.
const pdfUrl = 'https://example.com/documents/document.pdf';
const nonPdfUrl = 'https://example.com/documents/document.notapdf';
const pdfUrlWithParams = 'https://example.com/documents/document.pdf?q=12344e';

describe('data util tests', () => {
  // A URL whose path ends in ".pdf" yields that final segment.
  it('gets pdf url name', () => {
    expect(pdfNameFromUrl(pdfUrl)).toBe('document.pdf');
  });

  // Query parameters must not leak into the extracted name.
  it('gets pdf url name with params', () => {
    expect(pdfNameFromUrl(pdfUrlWithParams)).toBe('document.pdf');
  });

  // Anything not ending in ".pdf" is rejected with null.
  it('returns null on bad url', () => {
    expect(pdfNameFromUrl(nonPdfUrl)).toBeNull();
  });
});
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package software.uncharted.terarium.hmiserver.controller;

import lombok.extern.slf4j.Slf4j;
import software.uncharted.terarium.hmiserver.controller.services.DownloadService;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
Expand All @@ -9,6 +11,7 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders;
Expand All @@ -30,130 +33,12 @@
@Slf4j
public class DownloadController {

/**
 * Resolves a URL reference against a base url to produce a fully qualified url.
 *
 * Resolution is delegated to {@link URI#resolve(URI)} so that the base url's
 * port is preserved and references that are not root-relative (for example
 * {@code paper.pdf} or {@code //cdn.host/paper.pdf}) are handled, instead of
 * being blindly appended to {@code scheme://host}.
 *
 * @param relativeUrl the reference to resolve (typically a redirect target or href)
 * @param baseUrl     the url the reference was found at
 * @return a fully qualified url
 * @throws URISyntaxException if either argument is not a syntactically valid URI
 */
private String normalizeRelativeUrl(final String relativeUrl, final String baseUrl) throws URISyntaxException {
    final URI base = new URI(baseUrl);
    // Parse the reference with the URI constructor (not URI.create) so malformed
    // input surfaces as the declared checked exception.
    return base.resolve(new URI(relativeUrl)).toString();
}

/**
 * Downloads the PDF reachable from a given url.
 *
 * Automatic redirect handling is disabled on the client; each hop is followed
 * manually by recursing on the next url. A hop is either the {@code Location}
 * of a 300-310 response, or the first {@code <a href>} ending in ".pdf" found
 * in an HTML response. Any non-HTML response body is returned as-is.
 *
 * NOTE(review): there is no limit on the number of hops, so a redirect loop
 * recurses until the stack is exhausted.
 *
 * @param url the url location (that may contain redirects)
 * @return the bytes of the final response body, or {@code null} when a redirect
 *         has no Location header, the response has no body, or an HTML page
 *         contains no link ending in ".pdf"
 * @throws IOException        if a request or reading a response body fails
 * @throws URISyntaxException if a relative redirect/link cannot be resolved
 */
private byte[] getPDF(final String url) throws IOException, URISyntaxException {
    final String next;

    // try-with-resources: the client (and the connections it holds) is released
    // on every exit path. Bodies are fully read before leaving this block.
    try (CloseableHttpClient httpclient = HttpClients.custom()
            .disableRedirectHandling()
            .build()) {

        final HttpGet get = new HttpGet(url);
        final HttpResponse response = httpclient.execute(get);
        final int status = response.getStatusLine().getStatusCode();

        if (status >= 300 && status <= 310) {
            // Follow redirects until we actually get a document
            if (response.getFirstHeader("Location") == null) {
                return null;
            }
            final String redirect = response.getFirstHeader("Location").getValue();
            next = redirect.regionMatches(true, 0, "http", 0, 4)
                ? redirect
                : normalizeRelativeUrl(redirect, url);
        } else {
            if (response.getEntity() == null) {
                return null;
            }

            // A missing Content-Type header is treated as "not HTML".
            final boolean isHtml = response.getEntity().getContentType() != null
                && response.getEntity().getContentType().getValue().contains("html");
            if (!isHtml) {
                return IOUtils.toByteArray(response.getEntity().getContent());
            }

            // HTML landing page: look for a link to the pdf itself and follow it.
            final String html = IOUtils.toString(response.getEntity().getContent(), StandardCharsets.UTF_8);
            final Document document = Jsoup.parse(html);
            final Elements links = document.select("a");
            final String pdfUrl = links.stream()
                .map(element -> element.attributes().get("href"))
                // Match the extension case-insensitively but keep the href untouched:
                // url paths are case-sensitive, so lowercasing them can break the link.
                .filter(href -> href.regionMatches(true, href.length() - 4, ".pdf", 0, 4))
                .findFirst().orElse(null);

            if (pdfUrl == null) {
                return null;
            }

            next = pdfUrl.regionMatches(true, 0, "http", 0, 4)
                ? pdfUrl
                : normalizeRelativeUrl(pdfUrl, url);
        }
    }

    // Recurse only after the current client has been closed.
    return getPDF(next);
}

/**
 * Finds the final url of the PDF reachable from a given url, without
 * downloading the PDF body.
 *
 * Automatic redirect handling is disabled on the client; each hop is followed
 * manually by recursing on the next url. A hop is either the {@code Location}
 * of a 300-310 response, or the first {@code <a href>} ending in ".pdf" found
 * in an HTML response. The first url that answers with a non-HTML response is
 * returned.
 *
 * NOTE(review): there is no limit on the number of hops, so a redirect loop
 * recurses until the stack is exhausted.
 *
 * @param url the url location (that may contain redirects)
 * @return the url that served a non-HTML response, or {@code null} when a
 *         redirect has no Location header, the response has no body, or an
 *         HTML page contains no link ending in ".pdf"
 * @throws IOException        if a request or reading an HTML body fails
 * @throws URISyntaxException if a relative redirect/link cannot be resolved
 */
private String getPDFURL(final String url) throws IOException, URISyntaxException {
    final String next;

    // try-with-resources: the client (and the connections it holds) is released
    // on every exit path.
    try (CloseableHttpClient httpclient = HttpClients.custom()
            .disableRedirectHandling()
            .build()) {

        final HttpGet get = new HttpGet(url);
        final HttpResponse response = httpclient.execute(get);
        final int status = response.getStatusLine().getStatusCode();

        if (status >= 300 && status <= 310) {
            // Follow redirects until we actually get a document
            if (response.getFirstHeader("Location") == null) {
                return null;
            }
            final String redirect = response.getFirstHeader("Location").getValue();
            next = redirect.regionMatches(true, 0, "http", 0, 4)
                ? redirect
                : normalizeRelativeUrl(redirect, url);
        } else {
            if (response.getEntity() == null) {
                return null;
            }

            // A missing Content-Type header is treated as "not HTML".
            final boolean isHtml = response.getEntity().getContentType() != null
                && response.getEntity().getContentType().getValue().contains("html");
            if (!isHtml) {
                // This url serves the document itself.
                return url;
            }

            // HTML landing page: look for a link to the pdf itself and follow it.
            final String html = IOUtils.toString(response.getEntity().getContent(), StandardCharsets.UTF_8);
            final Document document = Jsoup.parse(html);
            final Elements links = document.select("a");
            final String pdfUrl = links.stream()
                .map(element -> element.attributes().get("href"))
                // Match the extension case-insensitively but keep the href untouched:
                // url paths are case-sensitive, so lowercasing them can break the link.
                .filter(href -> href.regionMatches(true, href.length() - 4, ".pdf", 0, 4))
                .findFirst().orElse(null);

            if (pdfUrl == null) {
                return null;
            }

            next = pdfUrl.regionMatches(true, 0, "http", 0, 4)
                ? pdfUrl
                : normalizeRelativeUrl(pdfUrl, url);
        }
    }

    // Recurse only after the current client has been closed.
    return getPDFURL(next);
}
@Autowired
DownloadService downloadService;

@GetMapping
public ResponseEntity<Resource> get(@RequestParam("doi") final String doi) throws IOException, URISyntaxException {
final byte[] pdfBytes = getPDF("https://unpaywall.org/" + doi);
final byte[] pdfBytes = downloadService.getPDF("https://unpaywall.org/" + doi);
if (pdfBytes != null) {

return ResponseEntity.ok()
Expand All @@ -168,7 +53,7 @@ public ResponseEntity<Resource> get(@RequestParam("doi") final String doi) throw

@GetMapping("/url")
public ResponseEntity<String> getURL(@QueryParam("url") final String url) throws IOException, URISyntaxException {
final String pdfLink = getPDFURL("https://unpaywall.org/" + url);
final String pdfLink = downloadService.getPDFURL("https://unpaywall.org/" + url);
if (pdfLink != null) {
return ResponseEntity.ok(pdfLink);
}
Expand Down
Loading

0 comments on commit 4cd06b9

Please sign in to comment.