Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overlapping histograms #114

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 169 additions & 14 deletions R/descriptives.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ Descriptives <- function(jaspResults, dataset, options) {
if (options$plotVariables) {
if(is.null(jaspResults[["distributionPlots"]])) {
jaspResults[["distributionPlots"]] <- createJaspContainer(gettext("Distribution Plots"))
jaspResults[["distributionPlots"]]$dependOn(c("plotVariables", "splitby", "binWidthType", "distPlotDensity",
"distPlotRug", "numberOfBins"))
jaspResults[["distributionPlots"]]$dependOn(c("plotVariables", "splitby", "binWidthType", "overlay",
"distPlotDensity", "distPlotRug", "numberOfBins"))
jaspResults[["distributionPlots"]]$position <- 5
}

Expand All @@ -99,9 +99,9 @@ Descriptives <- function(jaspResults, dataset, options) {
for (var in variables) {
if(is.null(distPlots[[var]])) {
if (makeSplit) {
distPlots[[var]] <- .descriptivesFrequencyPlots(dataset = splitDat.factors, options = options, variable = var)
distPlots[[var]] <- .descriptivesFrequencyPlots(dataset = splitDat.factors, options = options, variable = var, overlayFactor = splitFactor)
} else {
distPlots[[var]] <- .descriptivesFrequencyPlots(dataset = dataset.factors, options = options, variable = var)
distPlots[[var]] <- .descriptivesFrequencyPlots(dataset = dataset.factors, options = options, variable = var, overlayFactor = splitFactor)
}
}
}
Expand Down Expand Up @@ -941,24 +941,55 @@ Descriptives <- function(jaspResults, dataset, options) {
}


.descriptivesFrequencyPlots <- function(dataset, options, variable) {
.descriptivesFrequencyPlots <- function(dataset, options, variable, overlayFactor) {
if (options$splitby != "" ) {
# return a collection
split <- names(dataset)

plotResult <- createJaspContainer(title = variable)
plotResult$dependOn(options = "splitby", optionContainsValue = list(variables = variable))
if (options$overlay) {

#overlayFactor = Descriptives.splitFactor#dataset[[.v(splitName)]]

plotResult <- createJaspContainer(title = variable)
plotResult$dependOn(c("splitby", "overlay"), optionContainsValue = list(variables = variable))

#for (l in split) {
plotResult <- .descriptivesFrequencyPlots_SubFunc_Overlay(dataset = dataset, variable = variable, overlayFactor = overlayFactor, splitByName = options$splitby, width = options$plotWidth, height = options$plotHeight, displayDensity = options$distPlotDensity, rugs = options$distPlotRug, title = variable, binWidthType = options$binWidthType, numberOfBins = options$numberOfBins)
plotResult$dependOn(optionsFromObject = plotResult)
#}

return(plotResult)

} else {

plotResult <- createJaspContainer(title = variable)
plotResult$dependOn(options = "splitby", optionContainsValue = list(variables = variable))

for (l in split) {
plotResult[[l]] <- .descriptivesFrequencyPlots_SubFunc(dataset = dataset[[l]], variable = variable, width = options$plotWidth, height = options$plotHeight, displayDensity = options$distPlotDensity, rugs = options$distPlotRug, title = l, binWidthType = options$binWidthType, numberOfBins = options$numberOfBins)
plotResult[[l]]$dependOn(optionsFromObject = plotResult)
}

return(plotResult)

for (l in split) {
plotResult[[l]] <- .descriptivesFrequencyPlots_SubFunc(dataset = dataset[[l]], variable = variable, width = options$plotWidth, height = options$plotHeight, displayDensity = options$distPlotDensity, rugs = options$distPlotRug, title = l, binWidthType = options$binWidthType, numberOfBins = options$numberOfBins)
plotResult[[l]]$dependOn(optionsFromObject = plotResult)
}

return(plotResult)
} else {
column <- dataset[[.v(variable)]]
aPlot <- .descriptivesFrequencyPlots_SubFunc(dataset = dataset, variable = variable, width = options$plotWidth, height = options$plotHeight, displayDensity = options$distPlotDensity, rugs = options$distPlotRug, title = variable, binWidthType = options$binWidthType, numberOfBins = options$numberOfBins)
aPlot$dependOn(options = "splitby", optionContainsValue = list(variables = variable))

if (options$overlay == TRUE) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (options$overlay == TRUE) {
if (options[["overlay"]]) {
  1. overlay is binary already
  2. using [[ over $ avoids partial matching, so it's good to prefer that.

overlayFactor = NULL
column <- dataset[[.v(variable)]]
aPlot <- .descriptivesFrequencyPlots_SubFunc_Overlay(dataset = dataset, variable = variable, overlayFactor=overlayFactor, width = options$plotWidth, height = options$plotHeight, displayDensity = options$distPlotDensity, rugs = options$distPlotRug, title = variable, binWidthType = options$binWidthType, numberOfBins = options$numberOfBins)
aPlot$dependOn(c("splitby", "overlay"), optionContainsValue = list(variables = variable))

} else {

column <- dataset[[.v(variable)]]
aPlot <- .descriptivesFrequencyPlots_SubFunc(dataset = dataset, variable = variable, width = options$plotWidth, height = options$plotHeight, displayDensity = options$distPlotDensity, rugs = options$distPlotRug, title = variable, binWidthType = options$binWidthType, numberOfBins = options$numberOfBins)
aPlot$dependOn(options = "splitby", optionContainsValue = list(variables = variable))

}


return(aPlot)
}
Expand All @@ -981,6 +1012,25 @@ Descriptives <- function(jaspResults, dataset, options) {
return(freqPlot)
}

.descriptivesFrequencyPlots_SubFunc_Overlay <- function(dataset, variable, overlayFactor, splitByName = splitByName, width, height, displayDensity, rugs, title, binWidthType, numberOfBins) {
freqPlot <- createJaspPlot(title = title, width = width, height = height)

errorMessage <- NULL #.descriptivesCheckPlotErrors(dataset, variable, obsAmount = "< 3")
column <- dataset[[variable]]
column <- column[!is.na(column)]
#print(do.call(rbind, dataset))

isDiscrete <- is.factor(column) || is.character(column)
if (!is.null(errorMessage))
freqPlot$setError(gettextf("Plotting not possible: %s", errorMessage))
else if (length(column) > 0 && isDiscrete)
freqPlot$plotObject <- .barplotJASP(column, variable)
else #if (length(column) > 0 && !isDiscrete)
freqPlot$plotObject <- .plotMarginal_Overlay(column = do.call(rbind, dataset), variableName = variable, overlayFactor, splitByName = splitByName, displayDensity = displayDensity, rugs = rugs, binWidthType = binWidthType, numberOfBins = numberOfBins)

return(freqPlot)
}

.descriptivesSplitPlot <- function(dataset, options, variable) {
depends <- c("splitPlotColour", "splitPlotViolin", "splitPlotBoxplot", "splitPlotJitter", "splitPlotOutlierLabel")

Expand Down Expand Up @@ -1239,6 +1289,111 @@ Descriptives <- function(jaspResults, dataset, options) {
return(p)
}



.plotMarginal_Overlay <- function(column, variableName, overlayFactor, splitByName,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
.plotMarginal_Overlay <- function(column, variableName, overlayFactor, splitByName,
.plotMarginalOverlay <- function(column, variableName, overlayFactor, splitByName,

We use camelCase, so no underscores in the names, please. I know that jaspDescriptives does not follow this convention, but it's good to start doing it here.

rugs = FALSE, displayDensity = FALSE, binWidthType = c("doane", "fd", "scott", "sturges", "manual"),
numberOfBins = NA,
lwd = 1) {
binWidthType <- match.arg(binWidthType)
#column <- as.numeric(column)
variable <- column[[variableName]] #na.omit(column)

#message(variable)

if (length(variable) == 0)
return(NULL)

if (binWidthType == "doane") { # https://en.wikipedia.org/wiki/Histogram#Doane's_formula
sigma.g1 <- sqrt((6*(length(variable) - 2)) / ((length(variable) + 1)*(length(variable) + 3)))
g1 <- mean(abs(variable)^3)
k <- 1 + log2(length(variable)) + log2(1 + (g1 / sigma.g1))
binWidthType <- k
} else if (binWidthType == "fd" && nclass.FD(variable) > 10000) { # FD-method will produce extreme number of bins and crash ggplot, mention this in footnote
binWidthType <- 10000
} else if (binWidthType == "manual") {
binWidthType <- numberOfBins
}


h <- hist(variable, plot = FALSE, breaks = binWidthType)

if (!displayDensity)
yhigh <- max(h$counts)
else {
dens <- density(variable)
yhigh <- max(max(h$density), max(dens$y))
}

ylow <- 0
xticks <- base::pretty(c(variable, h$breaks), min.n = 3)

if (!displayDensity) {
p <-
jaspGraphs::drawAxis(
xName = variableName, yName = gettext("Counts"), xBreaks = xticks,
yBreaks = base::pretty(c(0, h$counts)), force = TRUE, xLabels = xticks
)
} else {
p <-
jaspGraphs::drawAxis(
xName = variableName, yName = gettext("Density"), xBreaks = xticks,
yBreaks = c(0, 1.05 * yhigh), force = TRUE, yLabels = NULL,
xLabels = xticks
)
}


if (displayDensity) {
p <- p +
ggplot2::geom_histogram(
data = data.frame(x = variable, g = overlayFactor),
mapping = ggplot2::aes(x = x, fill=factor(g), y = ..density..),#options$splitBy, y = ..density..),
Comment on lines +1350 to +1351
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps it makes sense to incorporate these changes here? If you prefer, I can also do that (either before or after this PR).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, I have a very slight preference for moving onto something else in the interest of getting broader exposure to different parts of JASP, but emphasis on the word 'slight' - I would also happily have a go at incorporating the changes in jaspGraphs. By the way I think it's worth mentioning that there were a couple of not-completely-resolved questions about whether this could be implemented more neatly, and what to do if there is no split factor. Are there differences between the required functionality or code standards in the jaspGraphs and jaspDescriptives modules?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a very slight preference for moving onto something else in the interest of getting broader exposure to different parts of JASP

Makes sense.

By the way I think it's worth mentioning that there were a couple of not-completely-resolved questions about whether this could be implemented more neatly, and what to do if there is no split factor.

This can definitely be done in one function (and should probably be done in one function).
Personally, I'd implement this somewhat like this:

.plotMarginal_Overlay <- function(column, variableName, overlayFactor = NULL, splitByName = NULL, ...) {

if (displayDensity) {
  if (is.null(overlayFactor)) {
    histogramData    <- data.frame(x = variable, g = overlayFactor)
    histogramMapping <- ggplot2::aes(x = x, fill=factor(g), y = ..density..)
  } else {
    histogramData    <- data.frame(x = variable)
    histogramMapping <- ggplot2::aes(x = x, y = ..density..)
  }

  p <- p +
    ggplot2::geom_histogram(
      data    = histogramData,
      mapping = histogramMapping,
      ...
    ) + ...
}

Are there differences between the required functionality or code standards in the jaspGraphs and jaspDescriptives modules?

Since jaspGraphs is reused by all of JASP, I tend to be a bit more strict in terms of functionality and documentation (e.g., enforcing roxygen2 for documentation). Also, all code there must pass the R CMD CHECK. but I can also do that if you prefer to work on other things!

breaks = h[["breaks"]],
position = 'identity',
alpha = 0.2,
col = "black",
size = .7
) +
ggplot2::geom_line(
data = data.frame(x = dens$x, y = dens$y),
mapping = ggplot2::aes(x = x, y = y),
lwd = lwd,
col = "black"
) + ggplot2::scale_fill_manual(name = splitByName, values = rainbow(nlevels(overlayFactor)))
Comment on lines +1358 to +1363
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would be also good to split the density plot by the group


} else {


p <- p +
ggplot2::geom_histogram(
data = data.frame(x = variable, g = overlayFactor),
mapping = ggplot2::aes(x = x, fill = factor(g)),
breaks = h[["breaks"]],
position = "identity",
alpha = 0.2,
col = "black",
size = .7
) + ggplot2::scale_fill_manual(name = splitByName, values = rainbow(nlevels(overlayFactor)))

}

if (rugs)
p <- p + ggplot2::geom_rug(data = data.frame(variable), mapping = ggplot2::aes(x = variable), sides = "b")

# JASP theme
p <- jaspGraphs::themeJasp(p, legend.position = 'right',
axisTickWidth = .7,
bty = list(type = "n", ldwX = .7, lwdY = 1))
# TODO: Fix jaspgraphs axis width X vs Y. See @vandenman.

if (displayDensity)
p <- p + ggplot2::theme(axis.ticks.y = ggplot2::element_blank())

return(p)
}


.descriptivesDotPlots <- function(dataset, options, variable){


Expand Down
1 change: 1 addition & 0 deletions inst/qml/Descriptives.qml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ Form
enabled: plotVariables.checked || plotCorrelationMatrix.checked

indent: true
CheckBox { name: "overlay"; label: qsTr("Overlay") }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
CheckBox { name: "overlay"; label: qsTr("Overlay") }
CheckBox { name: "distPlotOverlay"; label: qsTr("Overlay split levels in one plot"); enabled: splitBy.count > 0 }
  1. change of the option name: It is good to make sure that the name of the option makes it clear which output it affects. In this case, the distribution plots.
  2. change the label: be more descriptive to the user
  3. enabled: enabling the option only if splitBy is not empty makes it clearer to the user what the button does.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The question in general is whether this option should be presented here as is, as it affects only the distribution plots but not the correlation plots. The alternative option would be to implement this split for the correlation plots as well (which would be ideal, I think). But this is already a good step forward, so I would not be that opposed to let it be like this. We can revisit the correlation plots in the future.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, it would be good to add description of this button in the help file (you can find the file inside inst/help/Descriptives.md).

CheckBox { name: "distPlotDensity"; label: qsTr("Display density") }
CheckBox { name: "distPlotRug"; label: qsTr("Display rug marks") }
DropDown {
Expand Down