Skip to content

Commit

Permalink
Replace rdd with dataframe functions in Histogram analyzer (#586)
Browse files: browse the repository at this point in the history.
Co-authored-by: Shriya Vanvari <[email protected]>
  • Loading branch information
shriyavanvari and svanvari authored Oct 1, 2024
1 parent 9002bb8 commit 0f46385
Showing 1 changed file with 18 additions and 3 deletions.
21 changes: 18 additions & 3 deletions src/main/scala/com/amazon/deequ/analyzers/Histogram.scala
Original file line number | Diff line number | Diff line change
Expand Up @@ -84,11 +84,26 @@ case class Histogram(
case Some(theState) =>
val value: Try[Distribution] = Try {

val topNRows = theState.frequencies.rdd.top(maxDetailBins)(OrderByAbsoluteCount)
val countColumnName = theState.frequencies.schema.fields
.find(field => field.dataType == LongType && field.name != column)
.map(_.name)
.getOrElse(throw new IllegalStateException(s"Count column not found in the frequencies DataFrame"))

val topNRowsDF = theState.frequencies
.orderBy(col(countColumnName).desc)
.limit(maxDetailBins)
.collect()

val binCount = theState.frequencies.count()

val histogramDetails = topNRows
.map { case Row(discreteValue: String, absolute: Long) =>
val columnName = theState.frequencies.columns
.find(_ == column)
.getOrElse(throw new IllegalStateException(s"Column $column not found"))

val histogramDetails = topNRowsDF
.map { row =>
val discreteValue = row.getAs[String](columnName)
val absolute = row.getAs[Long](countColumnName)
val ratio = absolute.toDouble / theState.numRows
discreteValue -> DistributionValue(absolute, ratio)
}
Expand Down

0 comments on commit 0f46385

Please sign in to comment.