Skip to content

Commit

Permalink
Add ConfidenceIntervalStrategy
Browse files Browse the repository at this point in the history
  • Loading branch information
zeotuan committed May 6, 2024
1 parent db9b764 commit 91b1728
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,17 @@ import com.amazon.deequ.metrics.DistributionValue
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestionWithValue
import com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule.defaultIntervalStrategy
import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WilsonScoreIntervalStrategy}
import org.apache.commons.lang3.StringEscapeUtils

import scala.math.BigDecimal.RoundingMode

/** If we see a categorical range for most values in a column, we suggest an IS IN (...)
* constraint that should hold for most values */
case class FractionalCategoricalRangeRule(
targetDataCoverageFraction: Double = 0.9,
categorySorter: Array[(String, DistributionValue)] => Array[(String, DistributionValue)] =
categories => categories.sortBy({ case (_, value) => value.absolute }).reverse
categories => categories.sortBy({ case (_, value) => value.absolute }).reverse,
intervalStrategy: ConfidenceIntervalStrategy = defaultIntervalStrategy
) extends ConstraintRule[ColumnProfile] {

override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
Expand Down Expand Up @@ -79,11 +80,8 @@ case class FractionalCategoricalRangeRule(

val p = ratioSums
val n = numRecords
val z = 1.96

// TODO this needs to be more robust for p's close to 0 or 1
val targetCompliance = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
.setScale(2, RoundingMode.DOWN).toDouble
val targetCompliance = intervalStrategy.calculateTargetConfidenceInterval(p, n).lowerBound

val description = s"'${profile.column}' has value range $categoriesSql for at least " +
s"${targetCompliance * 100}% of values"
Expand Down Expand Up @@ -128,3 +126,7 @@ case class FractionalCategoricalRangeRule(
override val ruleDescription: String = "If we see a categorical range for most values " +
"in a column, we suggest an IS IN (...) constraint that should hold for most values"
}

/** Companion holding shared defaults for the [[FractionalCategoricalRangeRule]] case class. */
object FractionalCategoricalRangeRule {
// Default interval method; `private` here is still accessible from the companion case class,
// which imports it for its `intervalStrategy` parameter default.
private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WilsonScoreIntervalStrategy()
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._

import scala.math.BigDecimal.RoundingMode
import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WilsonScoreIntervalStrategy}

/**
* If a column is incomplete in the sample, we model its completeness as a binomial variable,
Expand All @@ -33,21 +32,15 @@ import scala.math.BigDecimal.RoundingMode
*/
case class RetainCompletenessRule(
minCompleteness: Double = defaultMinCompleteness,
maxCompleteness: Double = defaultMaxCompleteness
maxCompleteness: Double = defaultMaxCompleteness,
intervalStrategy: ConfidenceIntervalStrategy = defaultIntervalStrategy
) extends ConstraintRule[ColumnProfile] {
override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
profile.completeness > minCompleteness && profile.completeness < maxCompleteness
}

override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {

val p = profile.completeness
val n = numRecords
val z = 1.96

// TODO this needs to be more robust for p's close to 0 or 1
val targetCompleteness = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
.setScale(2, RoundingMode.DOWN).toDouble
val targetCompleteness = intervalStrategy.calculateTargetConfidenceInterval(profile.completeness, numRecords).lowerBound

val constraint = completenessConstraint(profile.column, _ >= targetCompleteness)

Expand Down Expand Up @@ -75,4 +68,5 @@ case class RetainCompletenessRule(
/** Companion holding default thresholds for the [[RetainCompletenessRule]] case class. */
object RetainCompletenessRule {
// The rule fires only for columns whose observed completeness lies strictly
// between these two bounds (see shouldBeApplied).
private val defaultMinCompleteness: Double = 0.2
private val defaultMaxCompleteness: Double = 1.0
// Default interval method used to derive the suggested completeness lower bound.
private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WilsonScoreIntervalStrategy()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package com.amazon.deequ.suggestions.rules.interval

import breeze.stats.distributions.{Gaussian, Rand}
import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence}

/**
 * Strategy for calculating a binomial proportion confidence interval.
 *
 * Implementations (e.g. Wald, Wilson score) estimate a lower and upper bound
 * around an observed sample proportion.
 */
trait ConfidenceIntervalStrategy {

/**
 * Generates a confidence interval around the observed proportion.
 *
 * @param pHat observed fraction of the population that shares a trait; must lie in [0, 1]
 * @param numRecords overall number of records the proportion was observed over
 * @param confidence confidence level of the method used to estimate the interval; must lie in [0, 1]
 * @return lower and upper bound of the estimated interval
 */
def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval

// Fails fast (IllegalArgumentException via require) when pHat or confidence is outside [0, 1].
def validateInput(pHat: Double, confidence: Double): Unit = {
require(0.0 <= pHat && pHat <= 1.0, "pHat must be between 0.0 and 1.0")
require(0.0 <= confidence && confidence <= 1.0, "confidence must be between 0.0 and 1.0")
}

// Two-sided z-score: the standard normal quantile at 1 - (1 - confidence) / 2
// (e.g. confidence = 0.95 yields ~1.96).
def calculateZScore(confidence: Double): Double = Gaussian(0, 1)(Rand).inverseCdf(1 - ((1.0 - confidence)/ 2.0))
}

/** Shared constants and result type for [[ConfidenceIntervalStrategy]] implementations. */
object ConfidenceIntervalStrategy {
// 95% two-sided confidence, used when callers do not supply a level explicitly.
val defaultConfidence = 0.95

// Result container: the estimated lower and upper bound of the interval.
case class ConfidenceInterval(lowerBound: Double, upperBound: Double)
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package com.amazon.deequ.suggestions.rules.interval

import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence}

import scala.math.BigDecimal.RoundingMode

/**
 * Implements the Wald interval (normal approximation) method for creating a
 * binomial proportion confidence interval.
 *
 * Bounds are rounded outward to two decimal places, so the reported interval
 * never understates the estimated one.
 *
 * @see <a
 * href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Normal_approximation_interval">
 * Normal approximation interval (Wikipedia)</a>
 */
case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy {

  def calculateTargetConfidenceInterval(
      pHat: Double,
      numRecords: Long,
      confidence: Double = defaultConfidence
  ): ConfidenceInterval = {
    validateInput(pHat, confidence)
    // Wald interval: pHat +/- z * sqrt(pHat * (1 - pHat) / n)
    val center = BigDecimal(pHat)
    val halfWidth = BigDecimal(calculateZScore(confidence) * math.sqrt(pHat * (1 - pHat) / numRecords))
    // Round the lower bound down and the upper bound up (two decimals) so the
    // reported interval always contains the computed one.
    ConfidenceInterval(
      (center - halfWidth).setScale(2, RoundingMode.DOWN).toDouble,
      (center + halfWidth).setScale(2, RoundingMode.UP).toDouble
    )
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.amazon.deequ.suggestions.rules.interval

import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence}

import scala.math.BigDecimal.RoundingMode

/**
 * Uses the Wilson score method for creating a binomial proportion confidence
 * interval.
 *
 * Bounds are rounded outward to two decimal places, so the reported interval
 * never understates the estimated one.
 *
 * @see <a
 * href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval">
 * Wilson score interval (Wikipedia)</a>
 */
case class WilsonScoreIntervalStrategy() extends ConfidenceIntervalStrategy {

  def calculateTargetConfidenceInterval(
      pHat: Double,
      numRecords: Long,
      confidence: Double = defaultConfidence
  ): ConfidenceInterval = {
    validateInput(pHat, confidence)
    val z = calculateZScore(confidence)
    val zSqOverN = math.pow(z, 2) / numRecords
    // Wilson score: shrink the adjusted center towards 1/2 by 1 / (1 + z^2/n).
    val shrinkage = 1.0 / (1 + zSqOverN)
    val adjustedCenter = pHat + zSqOverN / 2
    val halfWidth = z * math.sqrt(pHat * (1 - pHat) / numRecords + zSqOverN / (4 * numRecords))
    // Round the lower bound down and the upper bound up (two decimals) so the
    // reported interval always contains the computed one.
    def rounded(value: Double, mode: RoundingMode.Value): Double =
      BigDecimal(value).setScale(2, mode).toDouble
    ConfidenceInterval(
      rounded(shrinkage * (adjustedCenter - halfWidth), RoundingMode.DOWN),
      rounded(shrinkage * (adjustedCenter + halfWidth), RoundingMode.UP)
    )
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package com.amazon.deequ.suggestions.rules.interval

import com.amazon.deequ.SparkContextSpec
import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval
import com.amazon.deequ.utils.FixtureSupport
import org.scalamock.scalatest.MockFactory
import org.scalatest.wordspec.AnyWordSpec

/** Spot-checks the interval strategies against known (pHat, n) -> interval values. */
class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkContextSpec
with MockFactory {

  "WaldIntervalStrategy" should {
    "be calculated correctly" in {
      val strategy = WaldIntervalStrategy()
      // (pHat, numRecords) -> expected two-decimal interval
      val expectations = Seq(
        (1.0, 20L) -> ConfidenceInterval(1.0, 1.0),
        (0.5, 100L) -> ConfidenceInterval(0.4, 0.6),
        (0.4, 100L) -> ConfidenceInterval(0.3, 0.5),
        (0.6, 100L) -> ConfidenceInterval(0.5, 0.7),
        (0.90, 100L) -> ConfidenceInterval(0.84, 0.96),
        (1.0, 100L) -> ConfidenceInterval(1.0, 1.0)
      )
      expectations.foreach { case ((pHat, numRecords), expected) =>
        assert(strategy.calculateTargetConfidenceInterval(pHat, numRecords) == expected)
      }
    }
  }

  "WilsonIntervalStrategy" should {
    "be calculated correctly" in {
      val strategy = WilsonScoreIntervalStrategy()
      // Unlike Wald, Wilson keeps a non-degenerate interval at pHat = 1.0.
      val expectations = Seq(
        (1.0, 20L) -> ConfidenceInterval(0.83, 1.0),
        (0.5, 100L) -> ConfidenceInterval(0.4, 0.6),
        (0.4, 100L) -> ConfidenceInterval(0.3, 0.5),
        (0.6, 100L) -> ConfidenceInterval(0.5, 0.7),
        (0.90, 100L) -> ConfidenceInterval(0.82, 0.95),
        (1.0, 100L) -> ConfidenceInterval(0.96, 1.0)
      )
      expectations.foreach { case ((pHat, numRecords), expected) =>
        assert(strategy.calculateTargetConfidenceInterval(pHat, numRecords) == expected)
      }
    }
  }
}

0 comments on commit 91b1728

Please sign in to comment.