From 421d5cea5c64e79f8df68d91b9eb2d84908fbccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 16 Jan 2025 16:52:42 +0100 Subject: [PATCH] datastore: improve MongoDB facets for arrays by using unwind, #TASK-7151, #TASK-7134 --- .../commons/datastore/core/FacetField.java | 7 + ...MongoDBDocumentToFacetFieldsConverter.java | 6 +- .../datastore/mongodb/MongoDBQueryUtils.java | 40 +++- .../mongodb/MongoDBCollectionTest.java | 211 +++++++++++++++++- 4 files changed, 242 insertions(+), 22 deletions(-) diff --git a/commons-datastore/commons-datastore-core/src/main/java/org/opencb/commons/datastore/core/FacetField.java b/commons-datastore/commons-datastore-core/src/main/java/org/opencb/commons/datastore/core/FacetField.java index 8beecd0d..8f125136 100644 --- a/commons-datastore/commons-datastore-core/src/main/java/org/opencb/commons/datastore/core/FacetField.java +++ b/commons-datastore/commons-datastore-core/src/main/java/org/opencb/commons/datastore/core/FacetField.java @@ -37,6 +37,13 @@ public FacetField(String name, long count, List buckets) { this.buckets = buckets; } + public FacetField(String name, long count, String aggregationName, List aggregationValues) { + this.name = name; + this.count = count; + this.aggregationName = aggregationName; + this.aggregationValues = aggregationValues; + } + public FacetField(String name, String aggregationName, List aggregationValues) { this.name = name; this.aggregationName = aggregationName; diff --git a/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBDocumentToFacetFieldsConverter.java b/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBDocumentToFacetFieldsConverter.java index 3da563f8..986ee990 100644 --- a/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBDocumentToFacetFieldsConverter.java +++ b/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBDocumentToFacetFieldsConverter.java @@ -152,7 +152,11 @@ public List convertToDataModelType(Document document) { } else { fieldValues.add(documentValue.getDouble(accumulator.name())); } - facets.add(new FacetField(documentValue.getString(INTERNAL_ID), accumulator.name(), fieldValues)); + long count = 0; + if (documentValue.containsKey("count")) { + count = Long.valueOf(documentValue.getInteger("count")); + } + facets.add(new FacetField(documentValue.getString(INTERNAL_ID), count, accumulator.name(), fieldValues)); break; } default: { diff --git a/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBQueryUtils.java b/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBQueryUtils.java index b8a0003c..1b166f4c 100644 --- a/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBQueryUtils.java +++ b/commons-datastore/commons-datastore-mongodb/src/main/java/org/opencb/commons/datastore/mongodb/MongoDBQueryUtils.java @@ -692,6 +692,7 @@ public static List createFacet(Bson query, String facetField) { private static List createFacet(Bson query, List facetFields) { List facetList = new ArrayList<>(); Set includeFields = new HashSet<>(); + List unwindList = new ArrayList<>(); // For each facet field passed we will create a MongoDB facet, thre are 4 types of facets: // 1. Facet combining fields with commas. In this case, only 'count' is supported as accumulator. @@ -739,8 +740,11 @@ private static List createFacet(Bson query, List facetFields) { double start = Double.parseDouble(matcher1.group(2)); double end = Double.parseDouble(matcher2.group(1)); double step = Double.parseDouble(matcher2.group(2)); - for (double i = start; i <= end; i += step) { - boundaries.add(i); + int numSections = (int) Math.ceil((end - start + 1) / step); + double boundary = start; + for (int i = 0; i < numSections + 1; i++) { + boundaries.add(boundary); + boundary += step; } } else { throw new IllegalArgumentException(INVALID_FORMAT_MSG + facetField + RANGE_FORMAT_MSG); @@ -769,6 +773,17 @@ private static List createFacet(Bson query, List facetFields) { // Get MongoDB facet facet = getMongoDBFacet(groupField, accumulator, accumulatorField, boundaries); + + // Unwind in any case + String[] split = groupField.split("\\."); + String acc = ""; + for (String s : split) { + if (!StringUtils.isEmpty(acc)) { + acc += "."; + } + acc += s; + unwindList.add(Aggregates.unwind("$" + acc)); + } } // Add facet to the list of facets to be executed @@ -777,14 +792,19 @@ private static List createFacet(Bson query, List facetFields) { } } - // Build MongoDB pipeline for facets - Bson match = Aggregates.match(query); - Bson project = Aggregates.project(Projections.include(new ArrayList<>(includeFields))); - // Dot notation management for facets - Document aggregates = GenericDocumentComplexConverter - .replaceDots(Document.parse(Aggregates.facet(facetList).toBsonDocument().toJson())); - - return Arrays.asList(match, project, aggregates); + // Build and return the MongoDB pipeline for facets: match, project, [unwind,] aggregates + List result = new ArrayList<>(); + // 1 - Match + result.add(Aggregates.match(query)); + // 2 - Project + result.add(Aggregates.project(Projections.include(new ArrayList<>(includeFields)))); + // 3 - Unwind + if (!unwindList.isEmpty()) { + result.addAll(unwindList); + } + // 4 - Aggregates (dot notation management for facets) + result.add(GenericDocumentComplexConverter.replaceDots(Document.parse(Aggregates.facet(facetList).toBsonDocument().toJson()))); + return result; } private static Facet getMongoDBFacet(String groupField, Accumulator accumulator, String accumulatorField, List boundaries) { diff --git a/commons-datastore/commons-datastore-mongodb/src/test/java/org/opencb/commons/datastore/mongodb/MongoDBCollectionTest.java b/commons-datastore/commons-datastore-mongodb/src/test/java/org/opencb/commons/datastore/mongodb/MongoDBCollectionTest.java index 1a5af4be..d0e70a60 100644 --- a/commons-datastore/commons-datastore-mongodb/src/test/java/org/opencb/commons/datastore/mongodb/MongoDBCollectionTest.java +++ b/commons-datastore/commons-datastore-mongodb/src/test/java/org/opencb/commons/datastore/mongodb/MongoDBCollectionTest.java @@ -156,7 +156,7 @@ private static MongoDBCollection createTestCollection(String test, int size) { house.put("numRooms", (int) (i % 7) + 1); house.put("m2", (int) i * 23); document.put("house", house); - int numDogs = random.nextInt(3); + int numDogs = random.nextInt(5); List dogs = new ArrayList<>(); for (int j = 0 ; j < numDogs; j++) { Document dog = new Document(); @@ -166,6 +166,7 @@ private static MongoDBCollection createTestCollection(String test, int size) { } document.put("dogs", dogs); mongoDBCollection.nativeQuery().insert(document, null); + System.out.println("document.toJson() = " + document.toJson()); } return mongoDBCollection; } @@ -625,6 +626,182 @@ public void testFacetBucketsDotNotation() { } } + @Test + public void testFacetCountBucketsArray() { + Document match = new Document("age", new BasicDBObject("$gt", 2)); + DataResult matchedResults = mongoDBCollection.find(match, null); + + String fieldName = "dogs.color"; + List facets = MongoDBQueryUtils.createFacet(match, fieldName); + System.out.println("facets = " + facets); + MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); + DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + for (List facetFieldList : aggregate.getResults()) { + System.out.println("facetFieldList = " + facetFieldList); + } + + String value; + long totalCount = 0; + Map map = new HashMap<>(); + for (Document result : matchedResults.getResults()) { + List dogs = (List) result.get("dogs"); + for (Document dog : dogs) { + totalCount++; + String color = dog.getString("color"); + if (StringUtils.isEmpty(color)) { + color = EMPTY; + map.put(color, 0); + } else if (!map.containsKey(color)) { + map.put(color, 0); + } + map.put(color, 1 + map.get(color)); + } + } + + for (List result : aggregate.getResults()) { + for (FacetField facetField : result) { + Assert.assertEquals(totalCount, facetField.getCount().longValue()); + Assert.assertEquals(map.size(), facetField.getBuckets().size()); + for (FacetField.Bucket bucket : facetField.getBuckets()) { + value = bucket.getValue(); + if (StringUtils.isEmpty(value)) { + value = EMPTY; + } + Assert.assertEquals(map.get(value).longValue(), bucket.getCount()); + } + } + } + } + + @Test + public void testFacetAvgBucketsArray() { + Document match = new Document("age", new BasicDBObject("$gt", 2)); + DataResult matchedResults = mongoDBCollection.find(match, null); + + String fieldName = "avg(dogs.age)"; + List facets = MongoDBQueryUtils.createFacet(match, fieldName); + System.out.println("facets = " + facets); + MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); + DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + for (List facetFieldList : aggregate.getResults()) { + System.out.println("facetFieldList = " + facetFieldList); + } + + int counter = 0; + int acc = 0; + for (Document doc : matchedResults.getResults()) { + List dogs = (List) doc.get("dogs"); + for (Document dog : dogs) { + counter++; + acc += (int) dog.get("age"); + } + } + System.out.println("counter = " + counter); + System.out.println("(acc/counter) = " + (1.0d * acc / counter)); + Assert.assertEquals(aggregate.getResults().get(0).get(0).getAggregationValues().get(0), 1.0d * acc / counter, 0.0001); + } + + @Test + public void testFacetFilterAccumulatorBucketsArray() { + Document match = new Document("age", new BasicDBObject("$gt", 2)); + DataResult matchedResults = mongoDBCollection.find(match, null); + + String fieldName = "dogs.color:avg(dogs.age)"; + List facets = MongoDBQueryUtils.createFacet(match, fieldName); + System.out.println("facets = " + facets); + MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); + DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + for (List facetFieldList : aggregate.getResults()) { + System.out.println("facetFieldList = " + facetFieldList); + } + + String value; + long totalCount = 0; + Map counterMap = new HashMap<>(); + Map accMap = new HashMap<>(); + for (Document result : matchedResults.getResults()) { + List dogs = (List) result.get("dogs"); + for (Document dog : dogs) { + totalCount++; + String color = dog.getString("color"); + int age = (int) dog.get("age"); + if (StringUtils.isEmpty(color)) { + color = EMPTY; + counterMap.put(color, 0); + accMap.put(color, 0); + } else if (!counterMap.containsKey(color)) { + counterMap.put(color, 0); + accMap.put(color, 0); + } + counterMap.put(color, 1 + counterMap.get(color)); + accMap.put(color, age + accMap.get(color)); + } + } + + for (List result : aggregate.getResults()) { + for (FacetField facetField : result) { + Assert.assertEquals(totalCount, facetField.getCount().longValue()); + Assert.assertEquals(counterMap.size(), facetField.getBuckets().size()); + for (FacetField.Bucket bucket : facetField.getBuckets()) { + value = bucket.getValue(); + if (StringUtils.isEmpty(value)) { + value = EMPTY; + } + Assert.assertEquals(counterMap.get(value).longValue(), bucket.getCount()); + Assert.assertEquals(counterMap.get(value).longValue(), bucket.getFacetFields().get(0).getCount().longValue()); + Assert.assertEquals("avg", bucket.getFacetFields().get(0).getAggregationName()); + Assert.assertEquals(1.0 * accMap.get(value) / counterMap.get(value), bucket.getFacetFields().get(0).getAggregationValues().get(0), 0.0001); + } + } + } + } + + @Test + public void testFacetRangeArray() { + Document match = new Document("age", new BasicDBObject("$gt", 2)); + DataResult matchedResults = mongoDBCollection.find(match, null); + + int start = 1; + int end = 20; + int step = 5; + String fieldName = "dogs.age" + RANGE_MARK1 + start + RANGE_MARK + end + RANGE_MARK2 + ":" + step; + List facets = MongoDBQueryUtils.createFacet(match, fieldName); + System.out.println("facets = " + facets); + MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); + DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + for (List facetFieldList : aggregate.getResults()) { + System.out.println("facetFieldList = " + facetFieldList); + } + + long outOfRange = 0; + List rangeValues = new ArrayList<>(Arrays.asList(0d, 0d, 0d, 0d)); + + Map map = new HashMap<>(); + for (Document result : matchedResults.getResults()) { + int bucketNum; + List dogs = (List) result.get("dogs"); + for (Document dog : dogs) { + int value = (int) dog.get("age"); + if (value < start || value > end) { + outOfRange++; + } else { + bucketNum = (int) (value - start) / step; + rangeValues.set(bucketNum, 1 + rangeValues.get(bucketNum)); + } + } + } + for (List result : aggregate.getResults()) { + Assert.assertEquals(1, result.size()); + for (FacetField facetField : result) { + Assert.assertTrue(facetField.getCount() == null); + Assert.assertTrue(facetField.getName().contains("" + (1.0d * outOfRange))); + for (int i = 0; i < facetField.getAggregationValues().size(); i++) { + Assert.assertEquals(rangeValues.get(i), facetField.getAggregationValues().get(i)); + } + } + } + } + @Test public void testFacetMax() { Document match = new Document("age", new BasicDBObject("$gt", 2)); @@ -635,10 +812,12 @@ public void testFacetMax() { MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + long totalCount = 0; double maxValue = 0; Map map = new HashMap<>(); for (Document result : matchedResults.getResults()) { Long value = result.getLong(fieldName); + totalCount++; if (value != null) { if (value > maxValue) { maxValue = value; @@ -648,7 +827,7 @@ public void testFacetMax() { for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) { - Assert.assertTrue(facetField.getCount() == null); + Assert.assertEquals(totalCount, facetField.getCount().longValue()); Assert.assertEquals(max.name(), facetField.getAggregationName()); Assert.assertEquals(maxValue, facetField.getAggregationValues().get(0), 0.0001); } @@ -665,10 +844,12 @@ public void testFacetMin() { MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + long count = 0; double minValue = Double.MAX_VALUE; Map map = new HashMap<>(); for (Document result : matchedResults.getResults()) { Long value = result.getLong(fieldName); + count++; if (value != null) { if (value < minValue) { minValue = value; @@ -678,7 +859,7 @@ public void testFacetMin() { for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) { - Assert.assertTrue(facetField.getCount() == null); + Assert.assertEquals(count, facetField.getCount().longValue()); Assert.assertEquals(min.name(), facetField.getAggregationName()); Assert.assertEquals(minValue, facetField.getAggregationValues().get(0), 0.0001); } @@ -708,7 +889,7 @@ public void testFacetAvg() { for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) { - Assert.assertTrue(facetField.getCount() == null); + Assert.assertEquals(totalCount, facetField.getCount().longValue()); Assert.assertEquals(avg.name(), facetField.getAggregationName()); Assert.assertEquals(totalSum / totalCount, facetField.getAggregationValues().get(0), 0.0001); } @@ -727,6 +908,7 @@ public void testFacetMaxDotNotationAndList() { DataResult aggregate2 = mongoDBCollection.aggregate(facets, null); + int count = 0; List maxValues = new ArrayList<>(Arrays.asList(0D,0D,0D,0D,0D,0D,0D,0D,0D,0D,0D,0D)); for (Document result : matchedResults.getResults()) { List dogs = (List) result.get("dogs"); @@ -734,6 +916,7 @@ public void testFacetMaxDotNotationAndList() { System.out.println(); for (int i = 0; i < dogs.size(); i++) { Number value = (Number) dogs.get(i).get("age"); + count++; System.out.print("age = " + result.getInteger("age") + "; i = " + i + "; value = " + value + "; "); if (value.doubleValue() > maxValues.get(i)) { maxValues.set(i, value.doubleValue()); @@ -744,7 +927,7 @@ public void testFacetMaxDotNotationAndList() { for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) { - Assert.assertTrue(facetField.getCount() == null); + Assert.assertEquals(count, facetField.getCount().longValue()); Assert.assertEquals(max.name(), facetField.getAggregationName()); // for (int i = 0; i < facetField.getAggregationValues().size() ; i++) { // Assert.assertEquals(maxValues.get(i), facetField.getAggregationValues().get(i), 0.0001); @@ -758,9 +941,11 @@ public void testFacetSumAccumulator() { Document match = new Document("age", new BasicDBObject("$gt", 2)); DataResult matchedResults = mongoDBCollection.find(match, null); int total = 0; + int count = 0; String fieldName = "number"; for (Document result : matchedResults.getResults()) { System.out.println("result = " + result); + count++; total += result.getLong(fieldName); } double avg = total / matchedResults.getNumResults(); @@ -771,7 +956,7 @@ public void testFacetSumAccumulator() { for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) { - Assert.assertTrue(facetField.getCount() == null); + Assert.assertEquals(count, facetField.getCount().longValue()); Assert.assertEquals(Accumulator.avg.name(), facetField.getAggregationName()); Assert.assertEquals(avg, facetField.getAggregationValues().get(0), 0.5); // for (int i = 0; i < facetField.getAggregationValues().size() ; i++) { @@ -786,7 +971,7 @@ public void testFacetSumAccumulator() { for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) { - Assert.assertTrue(facetField.getCount() == null); + Assert.assertEquals(count, facetField.getCount().longValue()); Assert.assertEquals(Accumulator.sum.name(), facetField.getAggregationName()); Assert.assertEquals(total, facetField.getAggregationValues().get(0), 0.0001); } @@ -912,25 +1097,29 @@ public void testFacetRange() { int step = 1000; String fieldName = "number" + RANGE_MARK1 + start + RANGE_MARK + end + RANGE_MARK2 + ":" + step; List facets = MongoDBQueryUtils.createFacet(match, fieldName); + System.out.println("facets = " + facets); MongoDBDocumentToFacetFieldsConverter converter = new MongoDBDocumentToFacetFieldsConverter(); DataResult> aggregate = mongoDBCollection.aggregate(facets, converter, null); + System.out.println("aggregate.first() = " + aggregate.first()); long outOfRange = 0; - List rangeValues = new ArrayList<>(Arrays.asList(0d, 0d, 0d, 0d)); + List rangeValues = new ArrayList<>(Arrays.asList(0d, 0d, 0d, 0d, 0d)); Map map = new HashMap<>(); for (Document result : matchedResults.getResults()) { int bucketNum; Long value = result.getLong("number"); if (value != null) { - if (value < start || value > end) { + bucketNum = (int) (value - start) / step; + int numSections = (int) Math.ceil((end - start + 1) / step); + if (value < start || bucketNum > numSections) { outOfRange++; } else { - bucketNum = (int) (value - start) / step; - rangeValues.set(bucketNum, 1 + rangeValues.get(bucketNum)); + rangeValues.set(bucketNum, 1 + rangeValues.get(bucketNum)); } } } + System.out.println("rangeValues.toString() = " + rangeValues.toString()); for (List result : aggregate.getResults()) { Assert.assertEquals(1, result.size()); for (FacetField facetField : result) {