diff --git a/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java b/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java index 822a7a68b30..1749a7fcfae 100644 --- a/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java +++ b/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java @@ -20,6 +20,7 @@ import org.cbioportal.persistence.enums.DataSource; import org.cbioportal.persistence.helper.AlterationFilterHelper; import org.cbioportal.persistence.helper.StudyViewFilterHelper; +import org.cbioportal.service.util.StudyViewColumnarServiceUtil; import org.cbioportal.web.parameter.ClinicalDataType; import org.cbioportal.web.parameter.GenericAssayDataBinFilter; import org.cbioportal.web.parameter.GenericAssayDataFilter; @@ -28,7 +29,6 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Repository; -import java.util.ArrayList; import java.util.Collections; import java.util.EnumMap; import java.util.HashMap; @@ -90,27 +90,7 @@ public List getClinicalDataCounts(StudyViewFilterContext @Override public List getMolecularProfileSampleCounts(StudyViewFilterContext studyViewFilterContext) { var sampleCounts = mapper.getMolecularProfileSampleCounts(createStudyViewFilterHelper(studyViewFilterContext)); - Map> countsPerType = sampleCounts.stream() - .collect((Collectors.groupingBy(GenomicDataCount::getValue))); - - // different cancer studies combined into one cohort will have separate molecular profiles - // of a given type (e.g. mutation). We need to merge the counts for these - // different profiles based on the type and choose a label - // this code just picks the first label, which assumes that the labels will match - // across studies. - List mergedCounts = new ArrayList<>(); - for (Map.Entry> entry : countsPerType.entrySet()) { - var dc = new GenomicDataCount(); - dc.setValue(entry.getKey()); - // here just snatch the label of the first profile - dc.setLabel(entry.getValue().get(0).getLabel()); - Integer sum = entry.getValue().stream() - .map(x -> x.getCount()) - .collect(Collectors.summingInt(Integer::intValue)); - dc.setCount(sum); - mergedCounts.add(dc); - } - return mergedCounts; + return StudyViewColumnarServiceUtil.mergeGenomicDataCounts(sampleCounts); } diff --git a/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java b/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java index f9cd1facb16..7063f64f560 100644 --- a/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java +++ b/src/main/java/org/cbioportal/service/impl/StudyViewColumnarServiceImpl.java @@ -214,7 +214,7 @@ public List getCaseListDataCounts(StudyViewFilter studyViewFi // the study view merges case lists by type across studies // type is determined by the suffix of case list name (after study name) var caseListDataCountsPerStudy = studyViewRepository.getCaseListDataCountsPerStudy(createContext(studyViewFilter)); - return mergeCaseListCounts(caseListDataCountsPerStudy); + return StudyViewColumnarServiceUtil.mergeCaseListCounts(caseListDataCountsPerStudy); } @Cacheable( @@ -294,85 +294,11 @@ private List generateDataCountItemsFromDataCounts(List { ClinicalDataCountItem item = new ClinicalDataCountItem(); item.setAttributeId(e.getKey()); - item.setCounts(normalizeDataCounts(e.getValue())); + item.setCounts(StudyViewColumnarServiceUtil.normalizeDataCounts(e.getValue())); return item; }).toList(); } - /** - * Normalizes data counts by merging attribute values in a case-insensitive way. - * For example attribute values "TRUE", "True", and 'true' will be merged into a single aggregated count. - * This method assumes that all the counts in the given dataCounts list has the same attributeId. - * - * @param dataCounts list of data counts for a single attribute - * - * @return normalized list of data counts - */ - private List normalizeDataCounts(List dataCounts) { - Collection normalizedDataCounts = dataCounts - .stream() - .collect( - Collectors.groupingBy( - c -> c.getValue().toLowerCase(), - Collectors.reducing(new ClinicalDataCount(), (count1, count2) -> { - // assuming attribute ids are the same for all data counts, just pick the first one - String attributeId = - count1.getAttributeId() != null - ? count1.getAttributeId() - : count2.getAttributeId(); - - // pick the value in a deterministic way by prioritizing lower case over upper case. - // for example, 'True' will be picked in case of 2 different values like 'TRUE', and 'True', - // and 'true' will be picked in case of 3 different values like 'TRUE', 'True', and 'true' - String value = count1.getValue() != null - ? count1.getValue() - : count2.getValue(); - if (count1.getValue() != null && count2.getValue() != null) { - value = count1.getValue().compareTo(count2.getValue()) > 0 - ? count1.getValue() - : count2.getValue(); - } - - // aggregate counts for the merged values - Integer count = (count1.getCount() != null ? count1.getCount(): 0) + - (count2.getCount() != null ? count2.getCount(): 0); - - ClinicalDataCount aggregated = new ClinicalDataCount(); - aggregated.setAttributeId(attributeId); - aggregated.setValue(value); - aggregated.setCount(count); - return aggregated; - }) - ) - ) - .values(); - - return new ArrayList<>(normalizedDataCounts); - } - - public static List mergeCaseListCounts(List counts) { - Map> countsPerListType = counts.stream() - .collect((Collectors.groupingBy(CaseListDataCount::getValue))); - - // different cancer studies combined into one cohort will have separate case lists - // of a given type (e.g. rppa). We need to merge the counts for these - // different lists based on the type and choose a label - // this code just picks the first label, which assumes that the labels will match for a give type - List mergedCounts = new ArrayList<>(); - for (Map.Entry> entry : countsPerListType.entrySet()) { - var dc = new CaseListDataCount(); - dc.setValue(entry.getKey()); - // here just snatch the label of the first profile - dc.setLabel(entry.getValue().get(0).getLabel()); - Integer sum = entry.getValue().stream() - .map(x -> x.getCount()) - .collect(Collectors.summingInt(Integer::intValue)); - dc.setCount(sum); - mergedCounts.add(dc); - } - return mergedCounts; - } - } diff --git a/src/main/java/org/cbioportal/service/util/StudyViewColumnarServiceUtil.java b/src/main/java/org/cbioportal/service/util/StudyViewColumnarServiceUtil.java index 4975cb9730b..3a4bcc7c34a 100644 --- a/src/main/java/org/cbioportal/service/util/StudyViewColumnarServiceUtil.java +++ b/src/main/java/org/cbioportal/service/util/StudyViewColumnarServiceUtil.java @@ -1,9 +1,13 @@ package org.cbioportal.service.util; +import org.cbioportal.model.CaseListDataCount; import org.cbioportal.model.ClinicalAttribute; import org.cbioportal.model.ClinicalDataCount; import org.cbioportal.model.ClinicalDataCountItem; +import org.cbioportal.model.GenomicDataCount; + import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -58,6 +62,98 @@ public static List addClinicalDataCountsForMissingAttribu return result; } + + public static List mergeGenomicDataCounts(List sampleCounts) { + Map> countsPerType = sampleCounts.stream() + .collect(Collectors.groupingBy(GenomicDataCount::getValue)); + + List mergedCounts = new ArrayList<>(); + for (Map.Entry> entry : countsPerType.entrySet()) { + var dc = new GenomicDataCount(); + dc.setValue(entry.getKey()); + dc.setLabel(entry.getValue().get(0).getLabel()); + Integer sum = entry.getValue().stream() + .map(GenomicDataCount::getCount) + .collect(Collectors.summingInt(Integer::intValue)); + dc.setCount(sum); + mergedCounts.add(dc); + } + return mergedCounts; + } + + public static List mergeCaseListCounts(List counts) { + Map> countsPerListType = counts.stream() + .collect((Collectors.groupingBy(CaseListDataCount::getValue))); + + // different cancer studies combined into one cohort will have separate case lists + // of a given type (e.g. rppa). We need to merge the counts for these + // different lists based on the type and choose a label + // this code just picks the first label, which assumes that the labels will match for a give type + List mergedCounts = new ArrayList<>(); + for (Map.Entry> entry : countsPerListType.entrySet()) { + var dc = new CaseListDataCount(); + dc.setValue(entry.getKey()); + // here just snatch the label of the first profile + dc.setLabel(entry.getValue().get(0).getLabel()); + Integer sum = entry.getValue().stream() + .map(x -> x.getCount()) + .collect(Collectors.summingInt(Integer::intValue)); + dc.setCount(sum); + mergedCounts.add(dc); + } + return mergedCounts; + } + + /** + * Normalizes data counts by merging attribute values in a case-insensitive way. + * For example attribute values "TRUE", "True", and 'true' will be merged into a single aggregated count. + * This method assumes that all the counts in the given dataCounts list has the same attributeId. + * + * @param dataCounts list of data counts for a single attribute + * + * @return normalized list of data counts + */ + public static List normalizeDataCounts(List dataCounts) { + Collection normalizedDataCounts = dataCounts + .stream() + .collect( + Collectors.groupingBy( + c -> c.getValue().toLowerCase(), + Collectors.reducing(new ClinicalDataCount(), (count1, count2) -> { + // assuming attribute ids are the same for all data counts, just pick the first one + String attributeId = + count1.getAttributeId() != null + ? count1.getAttributeId() + : count2.getAttributeId(); + + // pick the value in a deterministic way by prioritizing lower case over upper case. + // for example, 'True' will be picked in case of 2 different values like 'TRUE', and 'True', + // and 'true' will be picked in case of 3 different values like 'TRUE', 'True', and 'true' + String value = count1.getValue() != null + ? count1.getValue() + : count2.getValue(); + if (count1.getValue() != null && count2.getValue() != null) { + value = count1.getValue().compareTo(count2.getValue()) > 0 + ? count1.getValue() + : count2.getValue(); + } + + // aggregate counts for the merged values + Integer count = (count1.getCount() != null ? count1.getCount(): 0) + + (count2.getCount() != null ? count2.getCount(): 0); + + ClinicalDataCount aggregated = new ClinicalDataCount(); + aggregated.setAttributeId(attributeId); + aggregated.setValue(value); + aggregated.setCount(count); + return aggregated; + }) + ) + ) + .values(); + + return new ArrayList<>(normalizedDataCounts); + } } \ No newline at end of file diff --git a/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewCaseListSamplesCountsTest.java b/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewCaseListSamplesCountsTest.java index c37c38c4d61..551eaa30b19 100644 --- a/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewCaseListSamplesCountsTest.java +++ b/src/test/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewCaseListSamplesCountsTest.java @@ -3,7 +3,7 @@ import org.cbioportal.persistence.helper.StudyViewFilterHelper; import org.cbioportal.persistence.mybatisclickhouse.config.MyBatisConfig; -import org.cbioportal.service.impl.StudyViewColumnarServiceImpl; +import org.cbioportal.service.util.StudyViewColumnarServiceUtil; import org.cbioportal.web.parameter.StudyViewFilter; import org.junit.Test; import org.junit.runner.RunWith; @@ -102,7 +102,7 @@ public void getMolecularProfileCountsAcrossStudies() { var unMergedCounts = studyViewMapper.getCaseListDataCountsPerStudy(StudyViewFilterHelper.build(studyViewFilter, null, null) ); - var caseListCountsMerged = StudyViewColumnarServiceImpl.mergeCaseListCounts( + var caseListCountsMerged = StudyViewColumnarServiceUtil.mergeCaseListCounts( unMergedCounts ); diff --git a/src/test/java/org/cbioportal/service/util/StudyViewColumnarServiceUtilTest.java b/src/test/java/org/cbioportal/service/util/StudyViewColumnarServiceUtilTest.java index 9e4b0aabebf..58308717c01 100644 --- a/src/test/java/org/cbioportal/service/util/StudyViewColumnarServiceUtilTest.java +++ b/src/test/java/org/cbioportal/service/util/StudyViewColumnarServiceUtilTest.java @@ -1,8 +1,10 @@ package org.cbioportal.service.util; +import org.cbioportal.model.CaseListDataCount; import org.cbioportal.model.ClinicalAttribute; import org.cbioportal.model.ClinicalDataCount; import org.cbioportal.model.ClinicalDataCountItem; +import org.cbioportal.model.GenomicDataCount; import org.junit.Assert; import org.junit.Test; @@ -122,10 +124,145 @@ public void testAddClinicalDataCountsForMissingAttributes() { } - - - - + + + @Test + public void testMergeGenomicDataCounts() { + GenomicDataCount count1 = new GenomicDataCount(); + count1.setValue("value1"); + count1.setLabel("label1"); + count1.setCount(1); + + GenomicDataCount count2 = new GenomicDataCount(); + count2.setValue("value1"); + count2.setLabel("label1"); + count2.setCount(2); + + GenomicDataCount count3 = new GenomicDataCount(); + count3.setValue("value2"); + count3.setLabel("label2"); + count3.setCount(3); + + List counts = Arrays.asList(count1, count2, count3); + + List mergedCounts = StudyViewColumnarServiceUtil.mergeGenomicDataCounts(counts); + + assertEquals(2, mergedCounts.size()); + + GenomicDataCount mergedCount1 = mergedCounts.stream() + .filter(count -> count.getValue().equals("value1")) + .findFirst() + .orElse(null); + assertEquals(3, mergedCount1.getCount().intValue()); + assertEquals("label1", mergedCount1.getLabel()); + + GenomicDataCount mergedCount2 = mergedCounts.stream() + .filter(count -> count.getValue().equals("value2")) + .findFirst() + .orElse(null); + assertEquals(3, mergedCount2.getCount().intValue()); + assertEquals("label2", mergedCount2.getLabel()); + } + + + @Test + public void testMergeCaseListCounts() { + CaseListDataCount count1 = new CaseListDataCount(); + count1.setValue("value1"); + count1.setLabel("label1"); + count1.setCount(1); + + CaseListDataCount count2 = new CaseListDataCount(); + count2.setValue("value1"); + count2.setLabel("label1"); + count2.setCount(2); + + CaseListDataCount count3 = new CaseListDataCount(); + count3.setValue("value2"); + count3.setLabel("label2"); + count3.setCount(3); + + List counts = Arrays.asList(count1, count2, count3); + + List mergedCounts = StudyViewColumnarServiceUtil.mergeCaseListCounts(counts); + + assertEquals(2, mergedCounts.size()); + + CaseListDataCount mergedCount1 = mergedCounts.stream() + .filter(count -> count.getValue().equals("value1")) + .findFirst() + .orElse(null); + assertEquals(3, mergedCount1.getCount().intValue()); + assertEquals("label1", mergedCount1.getLabel()); + + CaseListDataCount mergedCount2 = mergedCounts.stream() + .filter(count -> count.getValue().equals("value2")) + .findFirst() + .orElse(null); + assertEquals(3, mergedCount2.getCount().intValue()); + assertEquals("label2", mergedCount2.getLabel()); + } + + + @Test + public void testNormalizeDataCounts() { + ClinicalDataCount count1 = new ClinicalDataCount(); + count1.setAttributeId("attr1"); + count1.setValue("TRUE"); + count1.setCount(1); + + ClinicalDataCount count2 = new ClinicalDataCount(); + count2.setAttributeId("attr1"); + count2.setValue("True"); + count2.setCount(2); + + ClinicalDataCount count3 = new ClinicalDataCount(); + count3.setAttributeId("attr1"); + count3.setValue("true"); + count3.setCount(3); + + ClinicalDataCount count4 = new ClinicalDataCount(); + count4.setAttributeId("attr1"); + count4.setValue("FALSE"); + count4.setCount(4); + + ClinicalDataCount count5 = new ClinicalDataCount(); + count5.setAttributeId("attr1"); + count5.setValue("False"); + count5.setCount(5); + + List dataCounts = Arrays.asList(count1, count2, count3, count4, count5); + + List normalizedDataCounts = StudyViewColumnarServiceUtil.normalizeDataCounts(dataCounts); + + assertEquals(2, normalizedDataCounts.size()); + + // should be null because it prioritizes lower case over upper case + ClinicalDataCount trueCountNullCheck = normalizedDataCounts.stream() + .filter(count -> count.getValue().equals("True")) + .findFirst() + .orElse(null); + assertEquals(null, trueCountNullCheck); + + ClinicalDataCount trueCount = normalizedDataCounts.stream() + .filter(count -> count.getValue().equals("true")) + .findFirst() + .orElse(null); + assertEquals(6, trueCount.getCount().intValue()); + + // should be null because it prioritizes lower case over upper case + ClinicalDataCount falseCountNullCheck = normalizedDataCounts.stream() + .filter(count -> count.getValue().equals("FALSE")) + .findFirst() + .orElse(null); + assertEquals(null, falseCountNullCheck); + + ClinicalDataCount falseCount = normalizedDataCounts.stream() + .filter(count -> count.getValue().equals("False")) + .findFirst() + .orElse(null); + assertEquals(9, falseCount.getCount().intValue()); + } } \ No newline at end of file