Skip to content

Commit

Permalink
Similarity ranging calculation.
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrczarnas committed Oct 2, 2024
1 parent cb7cd94 commit f210056
Show file tree
Hide file tree
Showing 6 changed files with 375 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright © 2021 DQOps ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dqops.core.similarity;

/**
 * Helper class to calculate a similarity score of sample values.
 * <p>
 * Every appended hash is re-mixed once per seed into a 64-bit word. Each bit of that word casts
 * a weighted vote (plus the count when the bit is set, minus the count when it is clear) into a
 * signed counter. The final score keeps only the sign of each counter, which makes it
 * a SimHash-style fingerprint of {@link #WORD_COUNT} 64-bit words.
 * <p>
 * Instances are mutable accumulators and perform no synchronization.
 */
public class DataSimilarityCalculator {
    /**
     * Hash seeds. Each seed produces one independent 64-bit word of the similarity score.
     */
    private static final long[] SEEDS = new long[] { 0x80aee9b1521cff73L, 0xaef13661c3891612L, 0xd9b062cfb56a1592L, 0xe82be4fa6de9f1dcL };

    /**
     * Similarity score length in words.
     */
    public static final int WORD_COUNT = SEEDS.length;

    /**
     * Signed vote counters, 64 per score word. The counter of bit {@code b} in word {@code w}
     * is stored at index {@code (w << 6) + b}.
     */
    private final long[] r = new long[SEEDS.length * 64];

    /**
     * Appends a hash of a sample value, weighted by the number of occurrences of that value.
     * @param h Hash.
     * @param c Count. Used as the weight of the vote cast for every bit.
     */
    public void append(long h, long c) {
        for (int w = 0; w < SEEDS.length; w++) {
            // Derive a per-word hash: xor with the seed, then a xor-shift/multiply bit mixer
            // (the shifts and multipliers match the SplitMix64 finalizer).
            long x = h ^ SEEDS[w];
            x = (x ^ (x >>> 30)) * 0xbf58476d1ce4e5b9L;
            x = (x ^ (x >>> 27)) * 0x94d049bb133111ebL;
            x = x ^ (x >>> 31);

            for (int b = 0; b < 64; b++) {
                // Vote +c when bit b is set and -c when it is clear. The test must be "!= 0":
                // comparing the masked value with x itself is true only when x has no other
                // bits set, which made practically every vote negative.
                boolean bitSet = (x & (1L << b)) != 0L;
                r[(w << 6) + b] += bitSet ? c : -c;
            }
        }
    }

    /**
     * Returns a similarity score.
     * <p>
     * A bit of the score is set when its counter is zero or positive, so ties and counters that
     * never received a vote (for example when nothing was appended) produce set bits.
     * @return Similarity score, an array of {@link #WORD_COUNT} words.
     */
    public long[] getScore() {
        long[] s = new long[WORD_COUNT];

        for (int w = 0; w < s.length; w++) {
            for (int b = 0; b < 64; b++) {
                if (r[(w << 6) + b] >= 0L) {
                    s[w] |= 1L << b;
                }
            }
        }

        return s;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright © 2021 DQOps ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dqops.core.similarity;

/**
 * Helper class that calculates a match between two similarity scores.
 */
public class DataSimilarityMatch {
    /**
     * Calculates a match of two similarity codes by counting the bits that differ between them
     * (the words are xor-ed pairwise and the set bits of the results are summed up).
     * The result is 0 when the compared words are identical and grows by one for every
     * differing bit, so a lower value means that the codes are closer to each other.
     * <p>
     * Only the first {@code xa.length} words are compared; {@code ya} must contain at least
     * as many words as {@code xa}.
     * @param xa First similarity code.
     * @param ya Second similarity code.
     * @return Number of bits that differ between the two similarity codes.
     */
    public static int calculateMatch(long[] xa, long[] ya) {
        int m = 0;
        for (int i = 0; i < xa.length; i++) {
            // Long.bitCount replaces the hand-written population count of the xor-ed words.
            m += Long.bitCount(xa[i] ^ ya[i]);
        }

        return m;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Copyright © 2021 DQOps ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dqops.core.similarity;

import com.dqops.core.principal.UserDomainIdentity;
import com.dqops.metadata.similarity.TableSimilarityStore;
import com.dqops.metadata.sources.PhysicalTableName;

/**
 * Service that derives a similarity score of a table from its collected statistics.
 */
public interface TableSimilarityScoreFactory {
    /**
     * Computes the similarity score of one table, using the statistics collected for it.
     *
     * @param connectionName     Name of the connection that contains the table.
     * @param physicalTableName  Physical name of the table to score.
     * @param userDomainIdentity Identity of the calling user, together with the data domain.
     * @return The similarity score of the table, or null when no statistics exist for the table.
     */
    TableSimilarityStore calculateSimilarityScore(
            String connectionName,
            PhysicalTableName physicalTableName,
            UserDomainIdentity userDomainIdentity);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright © 2021 DQOps ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.dqops.core.similarity;

import com.dqops.core.principal.UserDomainIdentity;
import com.dqops.data.normalization.CommonTableNormalizationService;
import com.dqops.data.statistics.models.StatisticsMetricModel;
import com.dqops.data.statistics.models.StatisticsResultsForColumnModel;
import com.dqops.data.statistics.models.StatisticsResultsForTableModel;
import com.dqops.data.statistics.services.StatisticsDataService;
import com.dqops.metadata.similarity.TableSimilarityStore;
import com.dqops.metadata.sources.PhysicalTableName;
import com.dqops.services.timezone.DefaultTimeZoneProvider;
import com.dqops.statistics.column.sampling.ColumnSamplingColumnSamplesStatisticsCollectorSpec;
import com.dqops.utils.conversion.DateTypesConverter;
import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.time.ZoneId;
import java.util.Objects;

/**
 * Service that builds the similarity scores of a table and of its columns from the most recent
 * column sample statistics.
 */
@Component
public class TableSimilarityScoreFactoryImpl implements TableSimilarityScoreFactory {
    private final StatisticsDataService statisticsDataService;
    private final DefaultTimeZoneProvider defaultTimeZoneProvider;

    /**
     * Creates the service, receiving its collaborators from the dependency injection container.
     * @param statisticsDataService Service used to read the most recent statistics of a table.
     * @param defaultTimeZoneProvider Provider of the default time zone.
     */
    @Autowired
    public TableSimilarityScoreFactoryImpl(
            StatisticsDataService statisticsDataService,
            DefaultTimeZoneProvider defaultTimeZoneProvider) {
        this.statisticsDataService = statisticsDataService;
        this.defaultTimeZoneProvider = defaultTimeZoneProvider;
    }

    /**
     * Computes the similarity score of one table, using the statistics collected for it.
     * Every column sample is hashed and fed both into the calculator of its column and into
     * the calculator shared by the whole table.
     * @param connectionName Connection name.
     * @param physicalTableName Physical table name.
     * @param userDomainIdentity User identity and the data domain.
     * @return Table similarity score or null when the table has no statistics.
     */
    @Override
    public TableSimilarityStore calculateSimilarityScore(String connectionName, PhysicalTableName physicalTableName, UserDomainIdentity userDomainIdentity) {
        ZoneId zoneId = this.defaultTimeZoneProvider.getDefaultTimeZoneId();

        StatisticsResultsForTableModel tableStatistics = this.statisticsDataService.getMostRecentStatisticsForTable(
                connectionName, physicalTableName,
                CommonTableNormalizationService.NO_GROUPING_DATA_GROUP_NAME, true, userDomainIdentity);
        if (tableStatistics == null) {
            return null;
        }

        HashFunction sampleHasher = Hashing.farmHashFingerprint64();
        DataSimilarityCalculator wholeTableCalculator = new DataSimilarityCalculator();
        TableSimilarityStore result = new TableSimilarityStore();

        for (StatisticsResultsForColumnModel column : tableStatistics.getColumns().values()) {
            DataSimilarityCalculator singleColumnCalculator = new DataSimilarityCalculator();

            for (StatisticsMetricModel metric : column.getMetrics()) {
                // only the results of the column sampling sensor take part in the score
                if (!Objects.equals(metric.getSensorName(), ColumnSamplingColumnSamplesStatisticsCollectorSpec.SENSOR_NAME)) {
                    continue;
                }

                Object sampleValue = metric.getResult();
                long sampleCount = metric.getSampleCount() != null ? metric.getSampleCount() : 1L;
                String sampleText = toSampleText(sampleValue, zoneId);

                long sampleHash = sampleHasher.hashString(sampleText, StandardCharsets.UTF_8).asLong();
                singleColumnCalculator.append(sampleHash, sampleCount);
                wholeTableCalculator.append(sampleHash, sampleCount);
            }

            result.getCs().put(column.getColumnName(), singleColumnCalculator.getScore());
        }

        result.setTs(wholeTableCalculator.getScore());
        return result;
    }

    /**
     * Converts a sample value to the text that is hashed. Values that can be converted to
     * an instant are reduced to their local date in the given time zone, other values use
     * their string form, and a null value becomes an empty string.
     * @param sampleValue Sample value, may be null.
     * @param zoneId Time zone used to convert instants to local dates.
     * @return Text representation of the sample value.
     */
    private static String toSampleText(Object sampleValue, ZoneId zoneId) {
        Instant sampleInstant = DateTypesConverter.toInstant(sampleValue, zoneId);
        if (sampleInstant != null) {
            return sampleInstant.atZone(zoneId).toLocalDate().toString();
        }

        if (sampleValue == null) {
            return "";
        }

        return sampleValue.toString();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,17 @@
import com.dqops.metadata.id.ChildHierarchyNodeFieldMap;
import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
import com.dqops.metadata.id.HierarchyNodeResultVisitor;
import com.dqops.metadata.sources.PhysicalTableName;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonPropertyDescription;
import com.fasterxml.jackson.databind.PropertyNamingStrategies;
import com.fasterxml.jackson.databind.annotation.JsonNaming;
import lombok.EqualsAndHashCode;
import lombok.ToString;

import java.util.LinkedHashMap;
import java.util.Map;

/**
* Table similarity score holder at a connection level that stores a score used to find the most similar tables.
*/
Expand All @@ -39,6 +44,9 @@ public class ConnectionSimilarityIndexSpec extends AbstractSpec implements Clone
}
};

@JsonPropertyDescription("Dictionary of scores for each table, identified by a schema and table.")
private Map<String, Map<String, TableSimilarityStore>> tables = new LinkedHashMap<>();

/**
* Returns the child map on the spec class with all fields.
*
Expand Down Expand Up @@ -67,6 +75,17 @@ public <P, R> R visit(HierarchyNodeResultVisitor<P, R> visitor, P parameter) {
*/
public ConnectionSimilarityIndexSpec deepClone() {
    ConnectionSimilarityIndexSpec cloned = (ConnectionSimilarityIndexSpec) super.deepClone();
    // Give the clone its own dictionary, so the two instances do not share the nested maps.
    // NOTE(review): assumes super.deepClone() does not already copy "tables" deeply - confirm.
    cloned.tables = new LinkedHashMap<>();

    for (Map.Entry<String, Map<String, TableSimilarityStore>> schemaEntry : this.tables.entrySet()) {
        Map<String, TableSimilarityStore> clonedTables = new LinkedHashMap<>();
        for (Map.Entry<String, TableSimilarityStore> tableEntry : schemaEntry.getValue().entrySet()) {
            clonedTables.put(tableEntry.getKey(), tableEntry.getValue().clone());
        }

        // The copies must be stored in the clone. Writing them to this.tables left the clone
        // empty and replaced the per-schema maps of the object that is being cloned.
        cloned.tables.put(schemaEntry.getKey(), clonedTables);
    }

    return cloned;
}
}
Loading

0 comments on commit f210056

Please sign in to comment.