Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Euclidean distance and Cosine similarity functions on dense vectors #23982

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
12 changes: 12 additions & 0 deletions presto-docs/src/main/sphinx/functions/math.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ Mathematical Functions

SELECT cosine_similarity(MAP(ARRAY['a'], ARRAY[1.0]), MAP(ARRAY['a'], ARRAY[2.0])); -- 1.0

.. function:: cosine_similarity_dense(x, y) -> double

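Returns the cosine similarity between the dense vectors ``x`` and ``y``. If the arrays differ in length, the shorter one is treated as if padded with zeros. Returns ``NULL`` if either array contains a ``NULL`` element::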

SELECT cosine_similarity_dense(ARRAY[1.0], ARRAY[2.0]); -- 1.0

.. function:: euclidean_distance_dense(x, y) -> double

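Returns the Euclidean distance between the dense vectors ``x`` and ``y``. If the arrays differ in length, the shorter one is treated as if padded with zeros. Returns ``NULL`` if either array contains a ``NULL`` element::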

SELECT euclidean_distance_dense(ARRAY[1.0], ARRAY[2.0]); -- 1.0

.. function:: degrees(x) -> double

Converts angle ``x`` in radians to degrees.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1649,4 +1649,110 @@ private static Double mapL2Norm(Block map)

return Math.sqrt(norm);
}

/**
 * SQL scalar function {@code cosine_similarity_dense}: the dot product of the two
 * arrays divided by the product of their L2 norms.
 *
 * <p>Each norm is taken over the whole array, while the dot product only covers the
 * positions both arrays have in common. A zero norm is not special-cased, so an
 * all-zero vector yields {@code NaN} from the floating-point division.
 *
 * @param leftArray first vector, an {@code array(double)} block
 * @param rightArray second vector, an {@code array(double)} block
 * @return the cosine similarity, or {@code null} if either array contains a null element
 */
@Description("cosine similarity between two vectors given as arrays")
@ScalarFunction("cosine_similarity_dense")
@SqlNullable
@SqlType(StandardTypes.DOUBLE)
public static Double cosineSimilarityDense(@SqlType("array(double)") Block leftArray, @SqlType("array(double)") Block rightArray)
{
    // A null norm signals that the corresponding array holds a null element.
    Double leftNorm = arrayL2Norm(leftArray);
    Double rightNorm = arrayL2Norm(rightArray);

    if (leftNorm != null && rightNorm != null) {
        double numerator = arrayDotProduct(leftArray, rightArray);
        return numerator / (leftNorm * rightNorm);
    }

    return null;
}

/**
 * Sums the element-wise products of two double arrays over the positions present in both.
 *
 * <p>Positions beyond the shorter array's length are ignored. Null positions are not
 * checked here.
 *
 * @param leftArray first array block
 * @param rightArray second array block
 * @return the dot product over the shared prefix; {@code 0.0} when either array is empty
 */
private static double arrayDotProduct(Block leftArray, Block rightArray)
{
    int sharedLength = Math.min(leftArray.getPositionCount(), rightArray.getPositionCount());

    double sum = 0.0;
    for (int position = 0; position < sharedLength; position++) {
        double leftValue = DOUBLE.getDouble(leftArray, position);
        double rightValue = DOUBLE.getDouble(rightArray, position);
        sum += leftValue * rightValue;
    }

    return sum;
}

/**
 * Computes the L2 norm (square root of the sum of squared elements) of a double array.
 *
 * @param array array block to measure
 * @return the norm, or {@code null} as soon as a null element is encountered;
 *         {@code 0.0} for an empty array
 */
private static Double arrayL2Norm(Block array)
{
    int length = array.getPositionCount();
    double sumOfSquares = 0.0;

    for (int position = 0; position < length; position++) {
        if (array.isNull(position)) {
            return null;
        }
        double value = DOUBLE.getDouble(array, position);
        sumOfSquares += value * value;
    }

    return Math.sqrt(sumOfSquares);
}

/**
 * SQL scalar function {@code euclidean_distance_dense}: the square root of the sum of
 * squared element-wise differences, as computed by {@code squaredSumDifferences}.
 *
 * @param leftArray first vector, an {@code array(double)} block
 * @param rightArray second vector, an {@code array(double)} block
 * @return the distance, or {@code null} when {@code squaredSumDifferences} returns null
 */
@Description("euclidean distance between two dense vectors given as arrays")
@ScalarFunction("euclidean_distance_dense")
@SqlNullable
@SqlType(StandardTypes.DOUBLE)
public static Double euclideanDistanceDense(@SqlType("array(double)") Block leftArray, @SqlType("array(double)") Block rightArray)
{
    Double squaredDistance = squaredSumDifferences(leftArray, rightArray);

    if (squaredDistance != null) {
        return Math.sqrt(squaredDistance);
    }

    return null;
}

/**
 * Computes the sum of squared element-wise differences between two double arrays.
 *
 * <p>When the arrays differ in length, the extra elements of the longer array are
 * squared and added as-is, which is equivalent to padding the shorter array with zeros.
 *
 * @param leftArray first array block
 * @param rightArray second array block
 * @return the sum of squared differences, or {@code null} if any element of either
 *         array is null
 */
private static Double squaredSumDifferences(Block leftArray, Block rightArray)
{
    int leftLength = leftArray.getPositionCount();
    int rightLength = rightArray.getPositionCount();
    int sharedLength = Math.min(leftLength, rightLength);

    // Only the longer array has elements past the shared prefix; on equal lengths the
    // tail is empty and either array would do (left is used, as before).
    Block longerArray = leftLength < rightLength ? rightArray : leftArray;
    Double tailSum = squaredSumArray(longerArray, sharedLength);
    if (tailSum == null) {
        return null;
    }

    // Accumulate in a primitive to avoid boxing a new Double on every iteration.
    // The tail contribution stays first so the floating-point summation order is unchanged.
    double squaredSum = tailSum;
    for (int i = 0; i < sharedLength; i++) {
        if (leftArray.isNull(i) || rightArray.isNull(i)) {
            return null;
        }
        squaredSum += Math.pow(DOUBLE.getDouble(leftArray, i) - DOUBLE.getDouble(rightArray, i), 2);
    }

    return squaredSum;
}

/**
 * Sums the squares of the elements of a double array from a given position to the end.
 *
 * @param array array block to read
 * @param maxIndex first position to include (despite the name, this is the start of
 *        the range; positions before it are skipped)
 * @return the sum of squares over {@code [maxIndex, positionCount)}, {@code 0.0} if that
 *         range is empty, or {@code null} if any element in the range is null
 */
private static Double squaredSumArray(Block array, int maxIndex)
{
    // Primitive accumulator: a boxed Double here would be unboxed and re-boxed on every iteration.
    double squaredSum = 0.0;

    for (int i = maxIndex; i < array.getPositionCount(); i++) {
        if (array.isNull(i)) {
            return null;
        }
        squaredSum += Math.pow(DOUBLE.getDouble(array, i), 2);
    }

    return squaredSum;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1350,6 +1350,54 @@ public void testCosineSimilarity()
null);
}

/**
 * Exercises {@code cosine_similarity_dense} with equal-length arrays, unequal-length
 * arrays, an all-zero vector, a null argument, and an array containing a null element.
 */
@Test
public void testCosineSimilarityDense()
{
    // Equal-length vectors.
    double expectedEqualLength = ((1 * 1) + (2 * 3)) / (Math.sqrt(5) * Math.sqrt(10));
    assertFunction(
            "cosine_similarity_dense(array [1.0E0, 2.0E0], array [1.0E0, 3.0E0])",
            DOUBLE,
            expectedEqualLength);

    // The extra element of the longer vector affects its norm but not the dot product.
    double expectedUnequalLength = (1 * 1 + 2 * 3) / (Math.sqrt(1 + 4 + 1) * Math.sqrt(1 + 9));
    assertFunction(
            "cosine_similarity_dense(array [1.0E0, 2.0E0, -1.0E0], array [1.0E0, 3.0E0])",
            DOUBLE,
            expectedUnequalLength);

    // An all-zero vector has zero norm, so the division produces NaN.
    assertFunction(
            "cosine_similarity_dense(array [0, 0, 0], array [1.0E0, 3.0E0])",
            DOUBLE,
            Double.NaN);

    // A null argument gives a null result.
    assertFunction(
            "cosine_similarity_dense(null, array [1.0E0, 3.0E0])",
            DOUBLE,
            null);

    // A null element inside an array gives a null result.
    assertFunction(
            "cosine_similarity_dense(array [1.0E0, null], array [1.0E0, 3.0E0])",
            DOUBLE,
            null);
}

/**
 * Exercises {@code euclidean_distance_dense} with equal-length arrays, unequal-length
 * arrays, all-zero vectors, a null argument, and an array containing a null element.
 */
@Test
public void testEuclideanDistanceDense()
{
    // Equal-length vectors.
    double expectedEqualLength = Math.sqrt(Math.pow((1 - 1), 2) + Math.pow((2 - 3), 2));
    assertFunction(
            "euclidean_distance_dense(array [1.0E0, 2.0E0], array [1.0E0, 3.0E0])",
            DOUBLE,
            expectedEqualLength);

    // The missing third element of the shorter vector is treated as zero.
    double expectedUnequalLength = Math.sqrt(Math.pow((1 - 1), 2) + Math.pow((2 - 3), 2) + Math.pow((-1 - 0), 2));
    assertFunction(
            "euclidean_distance_dense(array [1.0E0, 2.0E0, -1.0E0], array [1.0E0, 3.0E0])",
            DOUBLE,
            expectedUnequalLength);

    // All-zero vectors of different lengths are at distance zero.
    assertFunction(
            "euclidean_distance_dense(array [0, 0, 0], array [0, 0])",
            DOUBLE,
            0.0);

    // A null argument gives a null result.
    assertFunction(
            "euclidean_distance_dense(null, array [1.0E0, 3.0E0])",
            DOUBLE,
            null);

    // A null element inside an array gives a null result.
    assertFunction(
            "euclidean_distance_dense(array [1.0E0, null], array [1.0E0, 3.0E0])",
            DOUBLE,
            null);
}

@Test
public void testInverseNormalCdf()
{
Expand Down
Loading