diff --git a/dqops/src/main/java/com/dqops/core/similarity/DataSimilarityCalculator.java b/dqops/src/main/java/com/dqops/core/similarity/DataSimilarityCalculator.java
new file mode 100644
index 0000000000..4daf69c290
--- /dev/null
+++ b/dqops/src/main/java/com/dqops/core/similarity/DataSimilarityCalculator.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2021 DQOps (support@dqops.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.dqops.core.similarity;
+
+/**
+ * Helper class to calculate a similarity score of sample values.
+ * Every appended hash casts a weighted vote on each bit of the score and the score keeps the bits
+ * whose vote total is not negative.
+ */
+public class DataSimilarityCalculator {
+    /**
+     * Hash seeds. Each seed yields one 64-bit word of the score.
+     */
+    private static final long[] SEEDS = new long[] { 0x80aee9b1521cff73L, 0xaef13661c3891612L, 0xd9b062cfb56a1592L, 0xe82be4fa6de9f1dcL };
+
+    /**
+     * Similarity score length in words.
+     */
+    public static final int WORD_COUNT = SEEDS.length;
+
+    /**
+     * Vote totals of all score bits; the total of bit b in word w is stored at index (w << 6) + b.
+     */
+    private final long[] r = new long[SEEDS.length * 64];
+
+    /**
+     * Appends a hash. The hash is mixed with every seed and each bit of the mixed value adds the count
+     * to the vote total of that bit when the bit is set, or subtracts the count when the bit is clear.
+     * @param h Hash.
+     * @param c Count, used as the weight of the hash.
+     */
+    public void append(long h, long c) {
+        for (int w = 0; w < SEEDS.length; w++) {
+            // 64-bit avalanche mix (the shifts and multipliers of the SplitMix64 finalizer), so that
+            // every seed derives a different word from the same hash
+            long x = h ^ SEEDS[w];
+            x = (x ^ (x >>> 30)) * 0xbf58476d1ce4e5b9L;
+            x = (x ^ (x >>> 27)) * 0x94d049bb133111ebL;
+            x = x ^ (x >>> 31);
+
+            for (int b = 0; b < 64; b++) {
+                // the masked value must be compared with zero: comparing it with x is true only when x has no other bit set
+                r[(w << 6) + b] += (x & (1L << b)) != 0L ? c : -c;
+            }
+        }
+    }
+
+    /**
+     * Returns a similarity score.
+     * @return Similarity score: WORD_COUNT words, a bit is set when its vote total is zero or positive.
+     */
+    public long[] getScore() {
+        long[] s = new long[WORD_COUNT];
+
+        for (int w = 0; w < s.length; w++) {
+            for (int b = 0; b < 64; b++) {
+                if (r[(w << 6) + b] >= 0L) {
+                    s[w] |= 1L << b;
+                }
+            }
+        }
+
+        return s;
+    }
+}
diff --git a/dqops/src/main/java/com/dqops/core/similarity/DataSimilarityMatch.java b/dqops/src/main/java/com/dqops/core/similarity/DataSimilarityMatch.java
new file mode 100644
index 0000000000..eaedf09355
--- /dev/null
+++ b/dqops/src/main/java/com/dqops/core/similarity/DataSimilarityMatch.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2021 DQOps (support@dqops.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.dqops.core.similarity;
+
+/**
+ * Helper class that calculates a match between two similarity scores.
+ */
+public class DataSimilarityMatch {
+    /**
+     * Calculates the number of bits that differ between two similarity codes.
+     * @param xa First similarity code.
+     * @param ya Second similarity code.
+     * @return Number of differing bits, 0 when the compared words are identical.
+     */
+    public static int calculateMatch(long[] xa, long[] ya) {
+        // The result is the total number of bit positions at which the two codes differ,
+        // so a smaller value means that the codes are closer to each other.
+        // Only the first xa.length words are compared; ya must be at least as long as xa.
+        int m = 0;
+        for (int i = 0; i < xa.length; i++) {
+            // XOR leaves a 1 only at the positions where the two words differ.
+            long x = xa[i] ^ ya[i];
+
+            // Long.bitCount returns the same population count as the hand-written
+            // shift-and-mask sequence that it replaces.
+            m += Long.bitCount(x);
+        }
+
+        return m;
+    }
+}
diff --git a/dqops/src/main/java/com/dqops/core/similarity/TableSimilarityScoreFactory.java b/dqops/src/main/java/com/dqops/core/similarity/TableSimilarityScoreFactory.java
new file mode 100644
index 0000000000..4fc5558094
--- /dev/null
+++ b/dqops/src/main/java/com/dqops/core/similarity/TableSimilarityScoreFactory.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2021 DQOps (support@dqops.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.dqops.core.similarity;
+
+import com.dqops.core.principal.UserDomainIdentity;
+import com.dqops.metadata.similarity.TableSimilarityStore;
+import com.dqops.metadata.sources.PhysicalTableName;
+
+/**
+ * Table similarity score calculation service.
+ */
+public interface TableSimilarityScoreFactory {
+    /**
+     * Calculates a table similarity score from statistics.
+     *
+     * @param connectionName Connection name.
+     * @param physicalTableName Physical table name.
+     * @param userDomainIdentity User identity and the data domain.
+     * @return Table similarity score or null when the table has no statistics.
+     */
+    TableSimilarityStore calculateSimilarityScore(String connectionName, PhysicalTableName physicalTableName, UserDomainIdentity userDomainIdentity);
+}
diff --git a/dqops/src/main/java/com/dqops/core/similarity/TableSimilarityScoreFactoryImpl.java b/dqops/src/main/java/com/dqops/core/similarity/TableSimilarityScoreFactoryImpl.java
new file mode 100644
index 0000000000..37403e100b
--- /dev/null
+++ b/dqops/src/main/java/com/dqops/core/similarity/TableSimilarityScoreFactoryImpl.java
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2021 DQOps (support@dqops.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.dqops.core.similarity;
+
+import com.dqops.core.principal.UserDomainIdentity;
+import com.dqops.data.normalization.CommonTableNormalizationService;
+import com.dqops.data.statistics.models.StatisticsMetricModel;
+import com.dqops.data.statistics.models.StatisticsResultsForColumnModel;
+import com.dqops.data.statistics.models.StatisticsResultsForTableModel;
+import com.dqops.data.statistics.services.StatisticsDataService;
+import com.dqops.metadata.similarity.TableSimilarityStore;
+import com.dqops.metadata.sources.PhysicalTableName;
+import com.dqops.services.timezone.DefaultTimeZoneProvider;
+import com.dqops.statistics.column.sampling.ColumnSamplingColumnSamplesStatisticsCollectorSpec;
+import com.dqops.utils.conversion.DateTypesConverter;
+import com.google.common.hash.HashCode;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.time.ZoneId;
+import java.util.Objects;
+
+/**
+ * Table similarity score factory that scores a table and its columns by hashing the collected column samples.
+ */
+@Component
+public class TableSimilarityScoreFactoryImpl implements TableSimilarityScoreFactory {
+    private final StatisticsDataService statisticsDataService;
+    private final DefaultTimeZoneProvider defaultTimeZoneProvider;
+
+    /**
+     * Constructor used for dependency injection.
+     * @param statisticsDataService Statistics data service.
+     * @param defaultTimeZoneProvider Default timezone provider.
+     */
+    @Autowired
+    public TableSimilarityScoreFactoryImpl(
+            StatisticsDataService statisticsDataService,
+            DefaultTimeZoneProvider defaultTimeZoneProvider) {
+        this.statisticsDataService = statisticsDataService;
+        this.defaultTimeZoneProvider = defaultTimeZoneProvider;
+    }
+
+    /**
+     * Builds the similarity scores of a table and of each of its columns from the most recent column sample values.
+     * @param connectionName Connection name.
+     * @param physicalTableName Physical table name.
+     * @param userDomainIdentity User identity and the data domain.
+     * @return Table similarity score or null when the table has no statistics.
+     */
+    @Override
+    public TableSimilarityStore calculateSimilarityScore(String connectionName, PhysicalTableName physicalTableName, UserDomainIdentity userDomainIdentity) {
+        final ZoneId zoneId = this.defaultTimeZoneProvider.getDefaultTimeZoneId();
+        final StatisticsResultsForTableModel tableStatistics = this.statisticsDataService.getMostRecentStatisticsForTable(
+                connectionName, physicalTableName,
+                CommonTableNormalizationService.NO_GROUPING_DATA_GROUP_NAME, true, userDomainIdentity);
+        if (tableStatistics == null) {
+            return null;
+        }
+
+        final HashFunction fingerprint = Hashing.farmHashFingerprint64();
+        final TableSimilarityStore result = new TableSimilarityStore();
+        final DataSimilarityCalculator wholeTable = new DataSimilarityCalculator();
+
+        for (StatisticsResultsForColumnModel column : tableStatistics.getColumns().values()) {
+            final DataSimilarityCalculator singleColumn = new DataSimilarityCalculator();
+            for (StatisticsMetricModel metric : column.getMetrics()) {
+                if (!Objects.equals(metric.getSensorName(), ColumnSamplingColumnSamplesStatisticsCollectorSpec.SENSOR_NAME)) {
+                    continue; // only the results of the column sampling sensor are scored
+                }
+                Object sample = metric.getResult();
+                long weight = 1L;
+                if (metric.getSampleCount() != null) {
+                    weight = metric.getSampleCount();
+                }
+                Instant asInstant = DateTypesConverter.toInstant(sample, zoneId);
+                String text = sample == null ? "" : sample.toString();
+                if (asInstant != null) {
+                    // samples that convert to an instant are hashed as their local date in the default time zone
+                    text = asInstant.atZone(zoneId).toLocalDate().toString();
+                }
+
+                long sampleHash = fingerprint.hashString(text, StandardCharsets.UTF_8).asLong();
+                singleColumn.append(sampleHash, weight);
+                wholeTable.append(sampleHash, weight);
+            }
+
+            result.getCs().put(column.getColumnName(), singleColumn.getScore());
+        }
+
+        result.setTs(wholeTable.getScore());
+        return result;
+    }
+}
diff --git a/dqops/src/main/java/com/dqops/metadata/similarity/ConnectionSimilarityIndexSpec.java b/dqops/src/main/java/com/dqops/metadata/similarity/ConnectionSimilarityIndexSpec.java
index 09cce6782b..d1e66eaa4f 100644
--- a/dqops/src/main/java/com/dqops/metadata/similarity/ConnectionSimilarityIndexSpec.java
+++ b/dqops/src/main/java/com/dqops/metadata/similarity/ConnectionSimilarityIndexSpec.java
@@ -20,12 +20,17 @@
 import com.dqops.metadata.id.ChildHierarchyNodeFieldMap;
 import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
 import com.dqops.metadata.id.HierarchyNodeResultVisitor;
+import com.dqops.metadata.sources.PhysicalTableName;
 import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonPropertyDescription;
 import com.fasterxml.jackson.databind.PropertyNamingStrategies;
 import com.fasterxml.jackson.databind.annotation.JsonNaming;
 import lombok.EqualsAndHashCode;
 import lombok.ToString;
 
+import java.util.LinkedHashMap;
+import java.util.Map;
+
 /**
  * Table similarity score holder at a connection level that stores a score used to find the most similar tables.
*/ @@ -39,6 +44,9 @@ public class ConnectionSimilarityIndexSpec extends AbstractSpec implements Clone } }; + @JsonPropertyDescription("Dictionary of scores for each table, identified by a schema and table.") + private Map> tables = new LinkedHashMap<>(); + /** * Returns the child map on the spec class with all fields. * @@ -67,6 +75,17 @@ public R visit(HierarchyNodeResultVisitor visitor, P parameter) { */ public ConnectionSimilarityIndexSpec deepClone() { ConnectionSimilarityIndexSpec cloned = (ConnectionSimilarityIndexSpec) super.deepClone(); + cloned.tables = new LinkedHashMap<>(); + + for (Map.Entry> schemaEntry : this.tables.entrySet()) { + LinkedHashMap clonedTables = new LinkedHashMap<>(); + for (Map.Entry tableEntry : schemaEntry.getValue().entrySet()) { + clonedTables.put(tableEntry.getKey(), tableEntry.getValue().clone()); + } + + this.tables.put(schemaEntry.getKey(), clonedTables); + } + return cloned; } } diff --git a/dqops/src/main/java/com/dqops/metadata/similarity/TableSimilarityStore.java b/dqops/src/main/java/com/dqops/metadata/similarity/TableSimilarityStore.java new file mode 100644 index 0000000000..b9cfd93038 --- /dev/null +++ b/dqops/src/main/java/com/dqops/metadata/similarity/TableSimilarityStore.java @@ -0,0 +1,94 @@ +/* + * Copyright © 2021 DQOps (support@dqops.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.dqops.metadata.similarity;
+
+import com.dqops.utils.exceptions.DqoRuntimeException;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Table similarity score: one score of the whole table and one score for each column.
+ */
+@Data
+@EqualsAndHashCode
+public class TableSimilarityStore implements Cloneable {
+    private long[] ts; // similarity score of the whole table
+    private Map<String, long[]> cs = new LinkedHashMap<>(); // similarity scores of the columns, keyed by the column name
+
+    /**
+     * Returns table similarity scores.
+     * @return Table similarity scores, the internal array is returned without copying.
+     */
+    public long[] getTs() {
+        return ts;
+    }
+
+    /**
+     * Sets table similarity scores.
+     * @param ts Table similarity scores, the array is stored without copying.
+     */
+    public void setTs(long[] ts) {
+        this.ts = ts;
+    }
+
+    /**
+     * Returns column similarity scores.
+     * @return Column similarity scores, the internal map is returned without copying.
+     */
+    public Map<String, long[]> getCs() {
+        return cs;
+    }
+
+    /**
+     * Sets column similarity scores.
+     * @param cs Column similarity scores, the map is stored without copying.
+     */
+    public void setCs(Map<String, long[]> cs) {
+        this.cs = cs;
+    }
+
+    /**
+     * Creates and returns a deep copy of this object. Score arrays are copied, column entries with a null score are skipped.
+     */
+    @Override
+    public TableSimilarityStore clone() {
+        try {
+            TableSimilarityStore clone = (TableSimilarityStore) super.clone();
+            if (this.ts != null) {
+                clone.ts = Arrays.copyOf(this.ts, this.ts.length);
+            }
+            if (this.cs != null) {
+                // super.clone() shares the map instance, so it is replaced to detach the copy; a null map stays null
+                clone.cs = new LinkedHashMap<>();
+                for (Map.Entry<String, long[]> csEntry : this.cs.entrySet()) {
+                    long[] columnScore = csEntry.getValue();
+                    if (columnScore != null) {
+                        clone.cs.put(csEntry.getKey(), Arrays.copyOf(columnScore, columnScore.length));
+                    }
+                }
+            }
+            return clone;
+        }
+        catch (CloneNotSupportedException cex) {
+            throw new DqoRuntimeException("Clone not supported", cex);
+        }
+    }
+}