Skip to content

Commit

Permalink
BXC-4603 group by multiple fields (#95)
Browse files Browse the repository at this point in the history
* group by more than one field

* fix group by more than one field

* remove unused code

* rename groupField to groupFields, remove generateOneGroupMapping, join group values in multiMemberGroupSet, other small fixes

* add helper method to build key, simplify number of cases in if statement

* check all entries in list are empty

* split groupFields, fix test, add sync multiple group test

* add another sync multiple group test
  • Loading branch information
krwong authored Jun 18, 2024
1 parent 6dc2fad commit e8607fe
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.nio.file.Path;

import edu.unc.lib.boxc.migration.cdm.options.GroupMappingSyncOptions;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;

import edu.unc.lib.boxc.migration.cdm.exceptions.MigrationException;
Expand Down Expand Up @@ -120,7 +119,7 @@ private void initialize() throws IOException {
}

private void validateOptions(GroupMappingOptions options) {
if (StringUtils.isBlank(options.getGroupField())) {
if (options.getGroupFields().isEmpty()) {
throw new IllegalArgumentException("Must provide an group field name");
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import picocli.CommandLine.Option;

import java.util.List;

/**
* Options for generating object grouping mappings
*
Expand All @@ -10,10 +12,11 @@
public class GroupMappingOptions {

@Option(names = {"-n", "--field-name"},
split = ",",
description = {
"Name of the CDM export field to perform grouping on."},
"Name(s) of the CDM export field to perform grouping on."},
defaultValue = "file")
private String groupField;
private List<String> groupFields;

@Option(names = {"-u", "--update"},
description = {
Expand All @@ -31,12 +34,12 @@ public class GroupMappingOptions {
description = "Overwrite mapping file if one already exists")
private boolean force;

public String getGroupField() {
return groupField;
public List<String> getGroupFields() {
return groupFields;
}

public void setGroupField(String groupField) {
this.groupField = groupField;
public void setGroupFields(List<String> groupFields) {
this.groupFields = groupFields;
}

public boolean getUpdate() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public class GroupMappingService {
private CdmFieldService fieldService;
private List<String> exportFields;

public void generateMapping(GroupMappingOptions options) throws IOException {
public void generateMapping(GroupMappingOptions options) throws Exception {
assertProjectStateValid();
ensureMappingState(options);

Expand Down Expand Up @@ -88,37 +88,7 @@ public void generateMapping(GroupMappingOptions options) throws IOException {

// Return set of all group keys that have at least 2 records in them
var multiMemberGroupSet = new HashSet<String>();
ResultSet groupRs = stmt.executeQuery("select " + options.getGroupField()
+ " from " + CdmIndexService.TB_NAME
+ " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
+ " group by " + options.getGroupField()
+ " having count(*) > 1");
while (groupRs.next()) {
var groupValue = groupRs.getString(1);
if (StringUtils.isBlank(groupValue)) {
continue;
}
multiMemberGroupSet.add(groupValue);
}

ResultSet rs = stmt.executeQuery("select " + CdmFieldInfo.CDM_ID + ", " + options.getGroupField()
+ " from " + CdmIndexService.TB_NAME
+ " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
+ " order by " + CdmFieldInfo.CDM_ID + " ASC");
while (rs.next()) {
String cdmId = rs.getString(1);
String matchedValue = rs.getString(2);

// Add empty mapping for records either not in groups or in groups with fewer than 2 members
if (StringUtils.isBlank(matchedValue) || !multiMemberGroupSet.contains(matchedValue)) {
log.debug("No matching field for object {}", cdmId);
csvPrinter.printRecord(cdmId, null);
continue;
}

String groupKey = GroupMappingInfo.GROUPED_WORK_PREFIX + options.getGroupField() + ":" + matchedValue;
csvPrinter.printRecord(cdmId, groupKey);
}
generateMultipleGroupMapping(options, stmt, multiMemberGroupSet, csvPrinter);
} catch (SQLException e) {
throw new MigrationException("Error interacting with export index", e);
} finally {
Expand All @@ -135,6 +105,55 @@ public void generateMapping(GroupMappingOptions options) throws IOException {
}
}

/**
 * Writes one mapping row per queried record, assigning a group key to every record whose
 * combined group field values are shared by more than one record.
 *
 * @param options options supplying the list of field names to group on
 * @param stmt statement used to query the export index table
 * @param multiMemberGroupSet set this method fills with the joined values of every group that
 *        has more than one member
 * @param csvPrinter printer that receives one (record id, group key) row per record; the group
 *        key is null for records that are not part of a multiple member group
 * @throws SQLException if querying the export index fails
 * @throws IOException if a mapping row cannot be written
 */
private void generateMultipleGroupMapping(GroupMappingOptions options, Statement stmt,
        Set<String> multiMemberGroupSet, CSVPrinter csvPrinter) throws SQLException, IOException {
    List<String> groupFields = options.getGroupFields();
    int numberGroups = groupFields.size();
    // Column names cannot be bound as statement parameters, so they are concatenated into the SQL.
    // NOTE(review): assumes the field names were validated against the export fields - confirm.
    String multipleGroups = String.join(", ", groupFields);

    // First pass: collect the value combinations that occur on more than one record
    try (ResultSet groupRs = stmt.executeQuery("select " + multipleGroups
            + " from " + CdmIndexService.TB_NAME
            + " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
            + " group by " + multipleGroups
            + " having count(*) > 1")) {
        while (groupRs.next()) {
            String multipleGroupValues = buildKey(readGroupValues(groupRs, 1, numberGroups));
            // A null key means every group field was empty, which never forms a group
            if (multipleGroupValues != null) {
                multiMemberGroupSet.add(multipleGroupValues);
            }
        }
    }

    // Second pass: emit a row for every record, in id order
    try (ResultSet rs = stmt.executeQuery("select " + CdmFieldInfo.CDM_ID + ", " + multipleGroups
            + " from " + CdmIndexService.TB_NAME
            + " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null"
            + " order by " + CdmFieldInfo.CDM_ID + " ASC")) {
        while (rs.next()) {
            String cdmId = rs.getString(1);
            // Group columns start at index 2, after the id column
            List<String> matchedValues = readGroupValues(rs, 2, numberGroups);
            String multipleMatchedValues = buildKey(matchedValues);

            // Add empty mapping for records either not in groups or in groups with fewer than 2 members
            if (multipleMatchedValues == null || !multiMemberGroupSet.contains(multipleMatchedValues)) {
                log.debug("No matching field for object {}", cdmId);
                csvPrinter.printRecord(cdmId, null);
                continue;
            }

            List<String> listGroups = new ArrayList<>(numberGroups);
            for (int i = 0; i < numberGroups; i++) {
                listGroups.add(groupFields.get(i) + ":" + matchedValues.get(i));
            }
            String groupKey = GroupMappingInfo.GROUPED_WORK_PREFIX + String.join(",", listGroups);
            csvPrinter.printRecord(cdmId, groupKey);
        }
    }
}

// Reads count consecutive string columns starting at firstColumn, substituting "" for SQL NULL
// so that later key building never sees a null entry.
private List<String> readGroupValues(ResultSet rs, int firstColumn, int count) throws SQLException {
    List<String> values = new ArrayList<>(count);
    for (int i = 0; i < count; i++) {
        String value = rs.getString(firstColumn + i);
        values.add(value == null ? "" : value);
    }
    return values;
}

private void assertProjectStateValid() {
if (project.getProjectProperties().getIndexedDate() == null) {
throw new InvalidProjectStateException("Project must be indexed prior to generating source mappings");
Expand Down Expand Up @@ -166,6 +185,14 @@ private void ensureMappingState(GroupMappingOptions options) {
}
}

/**
 * Builds the lookup key for one record by joining its group field values with commas.
 *
 * @param values group field values for a single record, in field order; entries may be null
 * @return the comma joined values with null entries rendered as empty strings, or null when
 *         the list is empty or every entry is null or empty
 */
private String buildKey(List<String> values) {
    StringBuilder joinedValues = new StringBuilder();
    boolean hasValue = false;
    boolean first = true;
    for (String value : values) {
        if (!first) {
            joinedValues.append(',');
        }
        first = false;
        // A null entry counts as empty: it must neither throw nor be rendered as the text "null"
        if (value != null && !value.isEmpty()) {
            hasValue = true;
            joinedValues.append(value);
        }
    }
    return hasValue ? joinedValues.toString() : null;
}

/**
* Merge existing mappings with updated mappings, writing to temporary files as intermediates
* @param options
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import edu.unc.lib.boxc.deposit.impl.model.DepositModelHelpers;
import edu.unc.lib.boxc.migration.cdm.exceptions.InvalidProjectStateException;
import edu.unc.lib.boxc.migration.cdm.model.DestinationSipEntry;
import edu.unc.lib.boxc.migration.cdm.model.GroupMappingInfo;
import edu.unc.lib.boxc.migration.cdm.model.PermissionsInfo;
import edu.unc.lib.boxc.migration.cdm.model.SourceFilesInfo;
import edu.unc.lib.boxc.migration.cdm.options.SipGenerationOptions;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ public void generateBasicMatchSucceedsTest() throws Exception {
assertTrue(Files.exists(project.getGroupMappingPath()));
}

/**
 * Runs "group_mapping generate" with two comma separated field names and checks that the
 * group mapping file exists afterwards.
 */
@Test
public void generateMultipleMatchSucceedsTest() throws Exception {
    indexExportSamples();

    // Group on the combination of the groupa and dcmi fields
    String projectPath = project.getProjectPath().toString();
    String[] generateArgs = {"-w", projectPath, "group_mapping", "generate", "-n", "groupa,dcmi"};
    executeExpectSuccess(generateArgs);

    var mappingPath = project.getGroupMappingPath();
    assertTrue(Files.exists(mappingPath));
}

@Test
public void generateAndSyncTest() throws Exception {
indexExportSamples();
Expand Down Expand Up @@ -74,6 +86,33 @@ public void generateAndSyncTest() throws Exception {
}
}

/**
 * Generates a group mapping on two fields, runs "group_mapping sync", then checks through
 * assertFilesGrouped that ids 25 and 26 are grouped in the index.
 */
@Test
public void generateAndSyncMultipleGroupsTest() throws Exception {
    indexExportSamples();

    // Step 1: generate the mapping, grouping on the combination of groupa and dcmi
    String generatePath = project.getProjectPath().toString();
    String[] generateArgs = {"-w", generatePath, "group_mapping", "generate", "-n", "groupa,dcmi"};
    executeExpectSuccess(generateArgs);

    assertTrue(Files.exists(project.getGroupMappingPath()));

    // Step 2: sync the generated mapping
    String syncPath = project.getProjectPath().toString();
    String[] syncArgs = {"-w", syncPath, "group_mapping", "sync"};
    executeExpectSuccess(syncArgs);

    // Step 3: inspect the index directly
    var cdmIndexService = new CdmIndexService();
    cdmIndexService.setProject(project);
    Connection dbConn = null;
    try {
        dbConn = cdmIndexService.openDbConnection();
        assertFilesGrouped(dbConn, "25", "26");
    } finally {
        // Runs on every path, including when opening the connection failed and dbConn is still null
        CdmIndexService.closeDbConnection(dbConn);
    }
}

@Test
public void statusNotGenerated() throws Exception {
String[] args = new String[] {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import static org.junit.jupiter.api.Assertions.assertEquals;
Expand Down Expand Up @@ -404,7 +405,7 @@ private void assertMappingCount(int count) throws IOException {

private void setupGroupedIndex() throws Exception {
var options = new GroupMappingOptions();
options.setGroupField("groupa");
options.setGroupFields(Arrays.asList("groupa"));
testHelper.getGroupMappingService().generateMapping(options);
var syncOptions = new GroupMappingSyncOptions();
syncOptions.setSortField("file");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ public void generateWithMatchAddToBottomTest() throws Exception {

private void setupGroupedIndex() throws Exception {
var options = new GroupMappingOptions();
options.setGroupField("groupa");
options.setGroupFields(Arrays.asList("groupa"));
testHelper.getGroupMappingService().generateMapping(options);
var syncOptions = new GroupMappingSyncOptions();
syncOptions.setSortField("file");
Expand Down
Loading

0 comments on commit e8607fe

Please sign in to comment.