Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

4.6 tweaks for harvesting from partners #3521

Merged
merged 6 commits into from
Dec 9, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public class ImportDDIServiceBean {
public static final String NAMING_PROTOCOL_DOI = "doi";
public static final String AGENCY_HANDLE = "handle";
public static final String AGENCY_DOI = "DOI";
public static final String AGENCY_DARA = "dara"; // da|ra - http://www.da-ra.de/en/home/
public static final String REPLICATION_FOR_TYPE = "replicationFor";
public static final String VAR_WEIGHTED = "wgtd";
public static final String VAR_INTERVAL_CONTIN = "contin";
Expand Down Expand Up @@ -91,6 +92,7 @@ public class ImportDDIServiceBean {
public static final String NOTE_SUBJECT_LOCKSS_PERM = "LOCKSS Permission";

public static final String NOTE_TYPE_REPLICATION_FOR = "DVN:REPLICATION_FOR";
private static final String HARVESTED_FILE_STORAGE_PREFIX = "http://";
private XMLInputFactory xmlInputFactory = null;

@EJB CustomFieldServiceBean customFieldService;
Expand Down Expand Up @@ -241,18 +243,28 @@ private void processCodeBook(ImportType importType, XMLStreamReader xmlr, Datase
if (event == XMLStreamConstants.START_ELEMENT) {
if (xmlr.getLocalName().equals("docDscr")) {
processDocDscr(xmlr, datasetDTO);
}
else if (xmlr.getLocalName().equals("stdyDscr")) {
} else if (xmlr.getLocalName().equals("stdyDscr")) {
processStdyDscr(importType, xmlr, datasetDTO);
}
else if (xmlr.getLocalName().equals("fileDscr") && !isMigrationImport(importType)) {
} else if (xmlr.getLocalName().equals("otherMat") && (isNewImport(importType) || isHarvestWithFilesImport(importType)) ) {
processOtherMat(xmlr, datasetDTO, filesMap);
} else if (xmlr.getLocalName().equals("fileDscr") && isHarvestWithFilesImport(importType)) {
// If this is a harvesting import, we'll attempt to extract some minimal
// file-level metadata information from the fileDscr sections as well.
// TODO: add more info here... -- 4.6
processFileDscrMinimal(xmlr, datasetDTO, filesMap);
} else if (xmlr.getLocalName().equals("fileDscr") && isNewImport(importType)) {
// this is a "full" fileDscr section - Dataverses use it
// to encode *tabular* files only. It will contain the information
// about variables, observations, etc. It will be complemented
// by a number of <var> entries in the dataDscr section.
// Dataverses do not use this section for harvesting exports, since
// we don't harvest tabular metadata. And all the "regular"
// file-level metadata is encoded in otherMat sections.
// The goal is to one day be able to import such tabular
// metadata using the direct (non-harvesting) import API.
// EMK TODO: add this back in for ImportType.NEW
//processFileDscr(xmlr, datasetDTO, filesMap);

}
else if (xmlr.getLocalName().equals("otherMat") && (isNewImport(importType) || isHarvestWithFilesImport(importType)) ) {
processOtherMat(xmlr, datasetDTO, filesMap);
}
}

} else if (event == XMLStreamConstants.END_ELEMENT) {
if (xmlr.getLocalName().equals("codeBook")) return;
Expand Down Expand Up @@ -432,12 +444,23 @@ else if (xmlr.getLocalName().equals("relStdy")) {
private void processCitation(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
DatasetVersionDTO dvDTO = datasetDTO.getDatasetVersion();
MetadataBlockDTO citation=datasetDTO.getDatasetVersion().getMetadataBlocks().get("citation");
boolean distStatementProcessed = false;
for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
if (event == XMLStreamConstants.START_ELEMENT) {
if (xmlr.getLocalName().equals("titlStmt")) processTitlStmt(xmlr, datasetDTO);
else if (xmlr.getLocalName().equals("rspStmt")) processRspStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("prodStmt")) processProdStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("distStmt")) processDistStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("distStmt")) {
if (distStatementProcessed) {
// We've already encountered one Distribution Statement in
// this citation; we'll just skip any subsequent ones.
// This is a defensive check against duplicate distStmt
// elements in some DDIs (notably, those from ICPSR).
} else {
processDistStmt(xmlr,citation);
distStatementProcessed = true;
}
}
else if (xmlr.getLocalName().equals("serStmt")) processSerStmt(xmlr,citation);
else if (xmlr.getLocalName().equals("verStmt")) processVerStmt(importType, xmlr,dvDTO);
else if (xmlr.getLocalName().equals("notes")) {
Expand Down Expand Up @@ -882,11 +905,23 @@ private void processAnlyInfo(XMLStreamReader xmlr, MetadataBlockDTO socialScienc

private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
MetadataBlockDTO socialScience =getSocialScience(dvDTO);

String collMode = "";
String timeMeth = "";
String weight = "";

for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
if (event == XMLStreamConstants.START_ELEMENT) {
//timeMethod
if (xmlr.getLocalName().equals("timeMeth")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", parseText( xmlr, "timeMeth" )));
String thisValue = parseText( xmlr, "timeMeth" );
if (!StringUtil.isEmpty(thisValue)) {
if (!"".equals(timeMeth)) {
timeMeth = timeMeth.concat(", ");
}
timeMeth = timeMeth.concat(thisValue);
}
//socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", parseText( xmlr, "timeMeth" )));
} else if (xmlr.getLocalName().equals("dataCollector")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("dataCollector", parseText( xmlr, "dataCollector" )));
// frequencyOfDataCollection
Expand All @@ -903,7 +938,14 @@ private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) thro
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("deviationsFromSampleDesign", parseText( xmlr, "deviat" )));
// collectionMode
} else if (xmlr.getLocalName().equals("collMode")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", parseText( xmlr, "collMode" )));
String thisValue = parseText( xmlr, "collMode" );
if (!StringUtil.isEmpty(thisValue)) {
if (!"".equals(collMode)) {
collMode = collMode.concat(", ");
}
collMode = collMode.concat(thisValue);
}
//socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", parseText( xmlr, "collMode" )));
//researchInstrument
} else if (xmlr.getLocalName().equals("resInstru")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("researchInstrument", parseText( xmlr, "resInstru" )));
Expand All @@ -916,12 +958,30 @@ private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) thro
} else if (xmlr.getLocalName().equals("ConOps")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("controlOperations", parseText( xmlr, "ConOps" )));
} else if (xmlr.getLocalName().equals("weight")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", parseText( xmlr, "weight" )));
String thisValue = parseText( xmlr, "weight" );
if (!StringUtil.isEmpty(thisValue)) {
if (!"".equals(weight)) {
weight = weight.concat(", ");
}
weight = weight.concat(thisValue);
}
//socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", parseText( xmlr, "weight" )));
} else if (xmlr.getLocalName().equals("cleanOps")) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("cleaningOperations", parseText( xmlr, "cleanOps" )));
}
}
} else if (event == XMLStreamConstants.END_ELEMENT) {
if (xmlr.getLocalName().equals("dataColl")) return;
if (xmlr.getLocalName().equals("dataColl")) {
if (!StringUtil.isEmpty(timeMeth)) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", timeMeth));
}
if (!StringUtil.isEmpty(collMode)) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", collMode));
}
if (!StringUtil.isEmpty(weight)) {
socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", weight));
}
return;
}
}
}
}
Expand Down Expand Up @@ -1242,6 +1302,16 @@ private void processTitlStmt(XMLStreamReader xmlr, DatasetDTO datasetDTO) throws
parseStudyIdHandle( parseText(xmlr), datasetDTO );
} else if ( AGENCY_DOI.equals( xmlr.getAttributeValue(null, "agency") ) ) {
parseStudyIdDOI( parseText(xmlr), datasetDTO );
} else if ( AGENCY_DARA.equals( xmlr.getAttributeValue(null, "agency"))) {
/*
da|ra - "Registration agency for social and economic data"
(http://www.da-ra.de/en/home/)
ICPSR uses da|ra to register their DOIs; so they have agency="dara"
in their IDNo entries.
Also, their DOIs are formatted differently, without a
protocol prefix (such as hdl: or doi:).
*/
parseStudyIdDoiICPSRdara( parseText(xmlr), datasetDTO );
} else {
HashSet<FieldDTO> set = new HashSet<>();
addToSet(set,"otherIdAgency", xmlr.getAttributeValue(null, "agency"));
Expand Down Expand Up @@ -1325,16 +1395,23 @@ private Object parseTextNew(XMLStreamReader xmlr, String endTag) throws XMLStrea
if (event == XMLStreamConstants.CHARACTERS) {
returnString += xmlr.getText().trim().replace('\n',' ');
} else if (event == XMLStreamConstants.START_ELEMENT) {
if (xmlr.getLocalName().equals("p")) {
returnString += "<p>" + parseText(xmlr, "p") + "</p>";
} else if (xmlr.getLocalName().equals("emph")) {
returnString += "<em>" + parseText(xmlr, "emph") + "</em>";
} else if (xmlr.getLocalName().equals("hi")) {
returnString += "<strong>" + parseText(xmlr, "hi") + "</strong>";
if (xmlr.getLocalName().equals("p") || xmlr.getLocalName().equals("br") || xmlr.getLocalName().equals("head")) {
returnString += "<p>" + parseText(xmlr, xmlr.getLocalName()) + "</p>";
} else if (xmlr.getLocalName().equals("emph") || xmlr.getLocalName().equals("em") || xmlr.getLocalName().equals("i")) {
returnString += "<em>" + parseText(xmlr, xmlr.getLocalName()) + "</em>";
} else if (xmlr.getLocalName().equals("hi") || xmlr.getLocalName().equals("b")) {
returnString += "<strong>" + parseText(xmlr, xmlr.getLocalName()) + "</strong>";
} else if (xmlr.getLocalName().equals("ExtLink")) {
String uri = xmlr.getAttributeValue(null, "URI");
String text = parseText(xmlr, "ExtLink").trim();
returnString += "<a href=\"" + uri + "\">" + ( StringUtil.isEmpty(text) ? uri : text) + "</a>";
} else if (xmlr.getLocalName().equals("a") || xmlr.getLocalName().equals("A")) {
String uri = xmlr.getAttributeValue(null, "URI");
if (StringUtil.isEmpty(uri)) {
uri = xmlr.getAttributeValue(null, "HREF");
}
String text = parseText(xmlr, xmlr.getLocalName()).trim();
returnString += "<a href=\"" + uri + "\">" + ( StringUtil.isEmpty(text) ? uri : text) + "</a>";
} else if (xmlr.getLocalName().equals("list")) {
returnString += parseText_list(xmlr);
} else if (xmlr.getLocalName().equals("citation")) {
Expand All @@ -1343,6 +1420,8 @@ private Object parseTextNew(XMLStreamReader xmlr, String endTag) throws XMLStrea
} else {
returnString += parseText_citation(xmlr);
}
} else if (xmlr.getLocalName().equals("txt")) {
returnString += parseText(xmlr);
} else {
throw new EJBException("ERROR occurred in mapDDI (parseText): tag not yet supported: <" + xmlr.getLocalName() + ">" );
}
Expand Down Expand Up @@ -1373,7 +1452,7 @@ private String parseText_list (XMLStreamReader xmlr) throws XMLStreamException {

// check type
String listType = xmlr.getAttributeValue(null, "type");
if ("bulleted".equals(listType) ){
if ("bulleted".equals(listType) || listType == null){
listString = "<ul>\n";
listCloseTag = "</ul>";
} else if ("ordered".equals(listType) ) {
Expand Down Expand Up @@ -1524,6 +1603,31 @@ private void parseStudyIdDOI(String _id, DatasetDTO datasetDTO) throws ImportExc

datasetDTO.setIdentifier(_id.substring(index2+1));
}

/**
 * Splits a da|ra-registered (ICPSR) DOI into authority and identifier and
 * stores both, together with the "doi" protocol and "/" separator, on the DTO.
 *
 * dara/ICPSR DOIs carry no protocol prefix; for example -
 *     10.3886/ICPSR06635.v1
 * so everything before the last "/" is taken as the authority, and
 * everything past it as the identifier.
 *
 * @param _id        raw IDNo text to split
 * @param datasetDTO DTO that receives authority, protocol, separator and identifier
 * @throws ImportException if _id is null, contains no '/', or has nothing
 *                         on either side of the last '/'
 */
private void parseStudyIdDoiICPSRdara(String _id, DatasetDTO datasetDTO) throws ImportException {
    // Report a missing value as an import error rather than letting a
    // NullPointerException escape from lastIndexOf() below.
    if (_id == null) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: value is missing");
    }

    int index = _id.lastIndexOf('/');

    if (index == -1) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+". '/' not found in string");
    }

    if (index == _id.length() - 1) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+" ends with '/'");
    }

    // Checked after the "ends with" case so that inputs which were already
    // rejected keep their original message; this only catches ids such as
    // "/ICPSR06635" that would otherwise produce an empty authority.
    if (index == 0) {
        throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+" starts with '/'; no authority found");
    }

    datasetDTO.setAuthority(_id.substring(0, index));
    datasetDTO.setProtocol("doi");
    datasetDTO.setDoiSeparator("/");

    datasetDTO.setIdentifier(_id.substring(index+1));
}
// Helper methods
private MetadataBlockDTO getCitation(DatasetVersionDTO dvDTO) {
return dvDTO.getMetadataBlocks().get("citation");
Expand Down Expand Up @@ -1609,6 +1713,58 @@ private void processOtherMat(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map fi
}
}

// This method attempts to extract the minimal amount of file-level
// metadata from an ICPSR-supplied DDI. (They use "fileDscr" instead of
// "otherMat" for general file metadata; the only field they populate is
// "fileName".) -- 4.6

/**
 * Reads one fileDscr section and records a bare-bones file entry for it.
 *
 * A new FileMetadataDTO (with a DataFileDTO of content type
 * "data/various-formats") is appended to the dataset version's file list
 * before any parsing happens. The only element inspected is fileName, whose
 * text becomes the cleaned-up label. When the closing fileDscr tag is
 * reached, a missing or blank label is replaced with "harvested file" and an
 * empty storage identifier is replaced with HARVESTED_FILE_STORAGE_PREFIX.
 *
 * @param xmlr       reader positioned just inside the fileDscr start tag
 * @param datasetDTO dataset whose version receives the new file entry
 * @param filesMap   not used by this method
 * @throws XMLStreamException if the underlying reader fails
 */
private void processFileDscrMinimal(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
    FileMetadataDTO fileEntry = new FileMetadataDTO();

    // Create the version's file list on first use, then register the entry.
    if (null == datasetDTO.getDatasetVersion().getFileMetadatas()) {
        datasetDTO.getDatasetVersion().setFileMetadatas(new ArrayList<>());
    }
    datasetDTO.getDatasetVersion().getFileMetadatas().add(fileEntry);

    DataFileDTO dataFile = new DataFileDTO();
    dataFile.setContentType("data/various-formats"); // reserved ICPSR content type identifier
    fileEntry.setDataFile(dataFile);

    int event;
    while ((event = xmlr.next()) != XMLStreamConstants.END_DOCUMENT) {
        switch (event) {
            case XMLStreamConstants.START_ELEMENT:
                if (xmlr.getLocalName().equals("fileName")) {
                    String name = parseText(xmlr);

                    // Keep only what follows the last ':'; if the ':' is the
                    // final character, drop every ':' instead.
                    int lastColon = name.lastIndexOf(':');
                    if (lastColon > -1) {
                        boolean textAfterColon = lastColon < name.length() - 1;
                        name = textAfterColon
                                ? name.substring(lastColon + 1)
                                : name.replaceAll(":", "");
                    }

                    // Remove disallowed characters, turn '/' into '-',
                    // and strip leading blanks.
                    name = name.replaceAll("[#;<>\\?\\|\\*\"]", "")
                            .replaceAll("/", "-")
                            .replaceFirst("^[ \t]*", "");

                    fileEntry.setLabel(name);
                }
                break;

            case XMLStreamConstants.END_ELEMENT:
                if (xmlr.getLocalName().equals("fileDscr")) {
                    // Fall back to placeholder values for anything not supplied.
                    String currentLabel = fileEntry.getLabel();
                    if (currentLabel == null || currentLabel.trim().equals("")) {
                        fileEntry.setLabel("harvested file");
                    }
                    if (StringUtil.isEmpty(fileEntry.getDataFile().getStorageIdentifier())) {
                        fileEntry.getDataFile().setStorageIdentifier(HARVESTED_FILE_STORAGE_PREFIX);
                    }
                    return;
                }
                break;

            default:
                break;
        }
    }
}

private void processFileDscr(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
FileMetadataDTO fmdDTO = new FileMetadataDTO();

Expand Down
Loading