analysis.Rnw

% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-

\SweaveOpts{ results=hide}
\SweaveOpts{ include=FALSE}
\SweaveOpts{ echo=FALSE}
\SweaveOpts{ engine=R}
\SweaveOpts{ keep.source= TRUE}
\SweaveOpts{ eval=FALSE}
\SweaveOpts{ eval=TRUE}


\graphicspath{ {analysis/} }

\chapter{Analysis}
\label{cha:analysis}


<<init>>=

# Blank prompts and a 60-column width while the document is being built.  If
# an error occurs, the handler sets prompt, continue and width back to
# "> ", "+ " and 80.
options(
  prompt = " ",
  continue = " ",
  width = 60,
  error = function() {
    ## recover()
    options(prompt = "> ", continue = "+ ", width = 80)
  })
  
# Project helper code.
source("~/thesis/code/peel.R")
source("~/thesis/code/maps.R")

# Absolute locations of the input data, the derived rasters and the TeX
# sources.  The working directory is set to the raster directory so that
# relative raster file names resolve there.
dataPath <- path.expand("~/thesis/data")
rasterWd <- path.expand("~/thesis/data/analysis")
texWd <- path.expand("~/thesis/analysis")
setwd(rasterWd)

# Switches gating the regeneration of derived rasters and of figures.
overwriteFigures <- TRUE
overwriteRasters <- TRUE

# studyArea used to work out RMSE calcs and tables.
## studyArea <- "thumb"
studyArea <- "mlct"

# Bands are numbered from one but classes from zero.  Used for
# stacks/bricks where bands correspond to classes.
peelBands <- peelClasses + 1

                                        # mask and agland exported from GRASS
                                        # no need to mask or crop
cusaMask <- raster(file.path(dataPath, "mask_cusa.tif"))
cusaExtent <- extent(cusaMask)

# Bounding box of the "thumb" study area, each bound written as
# degrees + minutes / 60.
thumbExtent <- extent(
  -(83 + 30 / 60), -(82 + 25 / 60),
  42 + 55 / 60, 44 + 5 / 60)

# Default raster() output has geographic proj and full extent by default;
# its resolution is then set to 5 arc-minutes.
world <- raster()
res(world) <- 5 / 60

# World-grid cell numbers for every cell of the mask geometry, masked by
# cusaMask.
grid <- raster(cusaMask)
grid[] <- cellsFromExtent(world, grid)
grid <- raster::mask(grid, cusaMask)

# Constant template layers on the mask geometry.
ones <- raster(cusaMask)
ones[] <- 1

zeroes <- raster(cusaMask)
zeroes[] <- 0

nulls <- raster(cusaMask)
nulls[] <- NA

# Restrict the mask to the thumb bounding box when that study area is chosen.
if (studyArea == "thumb") {
  cusaMask <- crop(cusaMask, thumbExtent)
}

# Per-cell area in acres, stored as acres_<studyArea>.tif.  area() is scaled
# by 247.105381 (acres per km^2); when rasters are not being regenerated the
# existing file is read back instead.
acresFile <- sprintf("acres_%s.tif", studyArea)
acres <-
  if (overwriteRasters) {
    writeRaster(area(cusaMask) * 247.105381,
                filename = acresFile,
                overwrite = TRUE)
  } else {
    raster(acresFile)
  }

# Agland2000 cropland and pasture layers read as a two-layer stack.
# list.files() returns its matches sorted alphabetically, so the cropland
# file becomes layer 1 ("crop") and the pasture file layer 2 ("open").
agland <- stack(list.files(file.path(dataPath, "agland"),
                           pattern = "(cropland|pasture).tif$",
                           full.names = TRUE))
layerNames(agland) <- c("crop", "open")
agland <- setMinMax(agland)
aglandCrop <- unstack(agland)[[1]]
if (studyArea == "thumb") {
  agland <- crop(agland, thumbExtent)
  # Crop the stand-alone cropland layer as well.  Previously it kept the
  # full extent while the study-area-specific MLCT bricks it is compared
  # against covered only the thumb.
  aglandCrop <- crop(aglandCrop, thumbExtent)
}

# MLCT-derived bricks for the selected study area, located by file-name
# pattern under dataPath.  The "agg" bricks get one layer name per PEEL
# class; the "nomosaic" bricks get the class names minus the 8th
# (presumably "mosaic" -- confirm against peelClasses) plus a "total" layer.
agg05 <- brick(list.files(dataPath,
                          pattern = sprintf("%s_Amin_0.5_agg.tif", studyArea),
                          full.names = TRUE))
layerNames(agg05) <- names(peelClasses)

nomos05 <- brick(list.files(dataPath,
                            pattern = sprintf("%s_Amin_0.5_nomosaic.tif",
                                              studyArea),
                            full.names = TRUE))
layerNames(nomos05) <- c(names(peelClasses)[-8], "total")

# NOTE(review): the A_min = 1 patterns follow a different naming scheme
# ("<studyArea>1_...") than the A_min = 0.5 ones; confirm against the files.
agg1 <- brick(list.files(dataPath,
                         pattern = sprintf("%s1_agg.tif", studyArea),
                         full.names = TRUE))
layerNames(agg1) <- names(peelClasses)

nomos1 <- brick(list.files(dataPath,
                           pattern = sprintf("%s1_Amin_1_nomosaic.tif",
                                             studyArea),
                           full.names = TRUE))
layerNames(nomos1) <- c(names(peelClasses)[-8], "total")

# NLCD aggregated to 5 arc-minutes, one layer per PEEL class.  No NLCD file
# is read for the "mosaic" class, so an all-zero layer stands in for it.
# Every layer, including that placeholder, is cropped to cusaMask so that
# all layers share one extent even when the mask has been cut down to the
# thumb study area.
nlcd <-
  brick(sapply(names(peelClasses),
               function(cover) {
                 layer <-
                   if (cover == "mosaic") {
                     zeroes
                   } else {
                     raster(list.files(file.path(dataPath, "nlcd"),
                                       pattern = paste("nlcd", cover,
                                                       "5min.tif$",
                                                       sep = "_"),
                                       full.names = TRUE))
                   }
                 crop(layer, cusaMask)
               },
               # always hand brick() a plain named list of layers
               simplify = FALSE))

nlcd <- writeRaster(nlcd,
                    filename = file.path(rasterWd, "nlcd.tif"),
                    overwrite = TRUE)

# Names are assigned after the write, on the file-backed brick.
layerNames(nlcd) <- names(peelClasses)


# Names of the raster objects whose class areas are tabulated.
rasterNames <- c("agland", "nlcd", "agg05", "agg1", "nomos05", "nomos1")

# Look the objects up by name.  get() replaces eval(parse(text = ...)), and
# simplify = FALSE guarantees a named list whatever the elements are.
dataSets <- sapply(rasterNames, function(n) get(n), simplify = FALSE)

# Total acres per layer: each fraction layer is multiplied by the per-cell
# acreage and summed over all cells.
areas <- llply(dataSets, function(d) {
  totals <- cellStats(d * acres, sum)
  names(totals) <- layerNames(d)
  totals
})

# Long format, one row per (map, class).  Columns 1, 3 and 4 of the ldply
# result are kept -- presumably the list-name id, the class name and the
# value; TODO confirm against the melt() output.
areasDf <- ldply(areas, function(a) {
  melt(t(as.data.frame(a)))
})
areasDf <- areasDf[, c(1, 3, 4)]
colnames(areasDf) <- c("map", "class", "acres")
areasDf$map <- factor(areasDf$map, levels = rasterNames)

# Order in which the peelLegend entries appear in legends; the factor levels
# use the reverse of it, with "total" last.
legendOrder <- rev(c(6, 4, 2, 3, 9, 7, 5, 1, 8))

areasDf$class <-
  factor(areasDf$class,
         levels = c(names(peelLegend)[rev(legendOrder)], "total"))

# Stacked bar chart of total area (acres / 10^6) per map, filled by class;
# the "total" pseudo-class is excluded.  Only built when figures are being
# regenerated.  The x axis order rasterNames[c(1, 2, 4, 3, 6, 5)] is
# agland, nlcd, agg1, agg05, nomos1, nomos05, which is the order of the
# plotmath labels below (A_min = 1.0 before A_min = 0.5).
if( overwriteFigures) areasPlot <-
  qplot( map, acres /10^6,
        data= subset(areasDf, class != "total"),
        geom="bar", position= "stack",
        fill= class,
        stat="summary", fun.y="sum") +
  scale_fill_manual( "",
                    values= peelLegend[ legendOrder],
                    ##peelLegend[ levels( areasDf$class)[1:9]],
                    breaks= names( peelLegend)[ legendOrder]) +
  scale_y_continuous( "M acres",
      limits= c(0,2000)) +
  theme_bw( base_family= "serif") +
  scale_x_discrete( "",
      limits= rasterNames[ c( 1, 2, 4, 3, 6, 5)],
      breaks= rasterNames[ c( 1, 2, 4, 3, 6, 5)],
      # nested atop() stacks up to three text rows per tick label;
      # phantom(0) fills the third row where there is no "No Mosaic" line
      labels= expression("Agland2000", "NLCD",
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==1.0)),
               phantom(0)),
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==0.5)),
               phantom(0)),
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==1.0)),
               "No Mosaic"),
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==0.5)),
               "No Mosaic")))

# Cross-tabulate summed acres with classes as rows and maps as columns,
# leaving out the "total" pseudo-class and requesting a grand-total row.
# The first column of the cast result is dropped and the class factor
# levels are used as row names instead.
areasCt <- cast(areasDf, class ~ map,
                fun.aggregate = sum,
                margins = "grand_row",
                subset = class != "total",
                value = "acres")
areasCt <- areasCt[, -1]
rownames(areasCt) <- levels(areasDf$class)

@ %def 

In this chapter we will describe a procedure for combining information
from the data sets described in \autoref{cha:datasets} using the same
sub-pixel analysis data structure at $5'$ resolution for the
conterminous USA (cUSA) to produce a data set that exhibits high
accuracy in the distribution of agricultural production according to
the Agland2000 data set introduced in \autoref{sec:agland2000}, that
provides a realistic characterization of other uses and covers as
suggested by MLCT data from \autoref{sec:mlct} and particular aspects
of the NLCD from \autoref{sec:nlcd}.

In this chapter we will evaluate the progress of the analysis in terms
of areas given both in millions of acres (Ma) and millions of hectares
(Mha).  It is important to note that these areas cannot be computed
directly from the geographic grid in which the data is contained and
our maps are rendered.  Because these $5'$ grid cells are actually
sections of a spheroid projected onto a plane, the area that a given
cell encompasses is not constant.  A first-order approximation of
these areas can be obtained as a function of the earth's mean radius
and the cosine of a cell's latitude.  In this way areas are maximum at
the equator and approach zero as position approaches the poles.
Conveniently Hijmans' \texttt{raster} package for the \texttt{R}
analysis software provides a function \texttt{area()} that accepts any
raster data set in geographical coordinates (longitude, latitude) as
an input, producing a new raster data set whose values are the areas
of the former in km$^{\textrm{2}}$.  It is a simple matter to convert these to
acres by subsequently scaling that result by a constant.  See the
source code in the appendix for further details.

We start by tabulating the aggregate areas by PEEL class for the data
sets that we are using as inputs, MLCT, NLCD, and Agland2000.  We
evaluate the accuracy of cropland distribution in the MLCT data as a
function of two values of the $A_{min}$ parameter, which is defined as
the minimum fraction of an MLCT pixel at its native resolution
assigned to the primary class prior to aggregation to the $5'$ PEEL
model analysis grid.  $A_{min}=1.0$ represents consideration of only
the primary class.  $A_{min}=0.5$ indicates that in the hypothetical
situation of zero classification confidence the primary class would be
assigned half of the area of that particular MLCT pixel, therefore
this value represents maximum incorporation of the secondary class in
our analytical framework, as modulated by the MLCT classification
confidence data.  These intermediate results are compared on the basis
of root mean squared error (RMSE) metrics calculated relative to the
distribution of cropland given in the Agland2000 data at the end of
\autoref{sec:comparison}.

In \autoref{sec:nlcd_offsets} we describe a method for selectively
incorporating cover fractions for particular classes from the NLCD
data set due to a perceived underestimation of those classes by MLCT
due primarily to its lower resolution.  Those classes are water,
wetland, and urban.  In the PEEL classification the ``urban'' class is
broadened to include rural infrastructure that MLCT effectively counts
as cropland.  Accepting NLCD's quantification of these classes as
truth is intended to counteract a perceived overestimation of cropland
area in MLCT caused in part by a discrepancy in formulation of these
data sets, Agland2000 representing actual harvested areas and MLCT
capturing a good deal of ancillary land that may be associated with
cultivated land but is not directly involved in crop production. The
result is an adjusted version of the MLCT data as amended by these
NLCD offsets.

Section~\ref{sec:fusion} presents the results of fusing our adjusted MLCT
map with the Agland2000 data by accepting the Agland2000 value for
cropland as truth where possible and scaling other classes
proportionally to describe the remainder of the landscape for
purposes of the PEEL model.  This operation is constrained by our
decision to retain the NLCD offsets as firm figures for those classes
in order to account for varying degrees of infrastructure development
represented by the so-called urban class and water or wetland features
not resolved in the MLCT data.  Where Agland2000 conflicts with this
constraint the cropland fraction is reduced accordingly.

Finally \autoref{sec:peel} shows how information from the
\texttt{175Crops2000} data set is used to disaggregate the cropland
given by the result of the previous step in order to provide a rough
characterization of the distribution of production of major crop
commodities, corn (maize), soybean, wheat, rice, sugarcane, other
cereals, and other field crops.  This is important for the PEEL model
because the intent is to model transitions in production in response
to forecast commodity prices in addition to other drivers.

Through the offset and fusion steps we use decreasing RMSE figures to
show that our complete characterization of the landscape is improving
in accuracy with respect to \texttt{Agland2000}, the census-based
distribution of productive cropland so that when other cover classes
are scaled the distortions are minimized.


\section{Comparison of Aggregate Areas}
\label{sec:comparison}


@
<<tab_areas, results=tex, eval=TRUE>>=


# Emit the area table as LaTeX, in millions of acres.  Everything runs inside
# local() so the renamed columns do not leak into the global areasCt, which
# is still indexed by its original column names elsewhere in the document.
local({
  # LaTeX column headers, in the column order of areasCt.
  header <-
    c("Agland2000",
      "NLCD",
      "\\pbox[c][][c]{3in}{Aggregated\\\\$A_{min}=0.5$}",
      "\\pbox[c][][c]{3in}{Aggregated\\\\$A_{min}=1.0$}",
      "\\pbox[c][][c]{3in}{No Mosaic\\\\$A_{min}=0.5$}",
      "\\smallskip\\pbox[c][][c]{3in}{No Mosaic\\\\$A_{min}=1.0$}")
  colnames(areasCt) <- header

  areasXt <- xtable(areasCt / 10^6,
                    caption = "Total Acreages by Map and Cover",
                    label = "tab:areas",
                    digits = 1)

  # Extra vertical space after the header row and after the last row.
  spacing <- list(pos = list(0, nrow(areasCt)),
                  command = rep("\\noalign{\\smallskip}", times = 2))

  print(areasXt,
        add.to.row = spacing,
        size = "small",
        # headers are already LaTeX: pass them through unescaped
        sanitize.colnames.function = identity)
})


@ %def 

\begin{figure}[ht] 
  \centering

<<fig_areas>>= 

# Write the stacked-area figure into the TeX directory.  The working
# directory is switched only for the duration of the save and is put back
# afterwards, even if the save fails, so that later chunks do not silently
# inherit texWd as their working directory.
if (overwriteFigures) {
  oldWd <- setwd(texWd)
  tryCatch(
    my.ggsave(texWd, "fig_areas.pdf",
              device = pdf,
              plot = areasPlot,
              width = 6,
              height = 6),
    finally = setwd(oldWd))
}


@ 

  \includegraphics{fig_areas}
  \caption{Total Acreages by Map and Cover}
  \label{fig:areas} 
\end{figure} 


After decomposing the mosaic class MLCT indicates
\Sexpr{printAreas(areasCt["crop","nomos05"])} of cropland for
$A_{min}=0.5$ and \Sexpr{printAreas(areasCt["crop","nomos1"])} for
$A_{min}=1.0$ in the cUSA in 2001.  Agland2000 indicates roughly
\Sexpr{printAreas(areasCt["crop","agland"])} of cropland.  The
inability of the MLCT data set to resolve rural transportation
networks, minor settlements, and small water or wetland features is a
major contribution to the surplus of cropland acreage indicated by the
MLCT.  Due to its greater resolution, $\sim$30\,m vs. $\sim$500\,m, the NLCD is
better suited to discerning developed areas in rural landscapes
ranging from rural roads to farmsteads to small communities that do
not show up in the MLCT data. There is a total area of roughly
\Sexpr{printAreas(areasCt["urban","nlcd"]-areasCt["urban","agg05"])}
of development remaining after subtracting the MLCT urban class from
all developed classes in the NLCD after they have both been aggregated
to the $5'$ grid. Applying this area as an offset to the cropland
area in Agland2000 brings us closer to the expected acreage under
cultivation in 2001, although this assumes that all of that
development intersects with MLCT cropland area.

The purpose for processing the MLCT for two values of $A_{min}$ as
described in \autoref{cha:datasets} was to evaluate whether or not
information from the secondary cover type contributes positively to
the accuracy of the data set we seek to synthesize.  The primary
objective of this synthesis is to achieve accuracy in cropland
distribution.  Because the cropland layer in the Agland2000 data set
is derived from county-level production census statistics we adopt
this as the ground truth and will endeavor to adjust our product
accordingly.  Although MLCT overstates cropland acreage for both
$A_{min}=0.5$ and $A_{min}=1.0$ the discrimination between the two is made
by the distribution of errors rather than the aggregate error.

<<nomosDiff>>=

# Cell-by-cell difference maps: MLCT crop fraction (mosaic decomposed) minus
# the Agland2000 crop fraction, for A_min = 1.0 and A_min = 0.5.  Both maps
# use the reversed BrBG palette with limits running from 1 down to -1.
if (overwriteFigures) {
  # A_min = 1.0
  nomosDiff1 <- getPeelBand(nomos1, "crop") - aglandCrop
  layerNames(nomosDiff1) <- "crop"
  nomosDiffPlot1 <-
    coverMaps(nomosDiff1, classes = "crop", samp = 0.2) +
    scale_fill_gradientn("diff",
                         colours = rev(brewer.pal(11, "BrBG")),
                         limits = c(1, -1),
                         breaks = seq(1, -1, by = -0.2))

  # A_min = 0.5
  nomosDiff05 <- getPeelBand(nomos05, "crop") - aglandCrop
  layerNames(nomosDiff05) <- "crop"
  nomosDiffPlot05 <-
    coverMaps(nomosDiff05, classes = "crop", samp = 0.2) +
    scale_fill_gradientn("diff",
                         colours = rev(brewer.pal(11, "BrBG")),
                         limits = c(1, -1),
                         breaks = seq(1, -1, by = -0.2))
}

@ 

\begin{figure}[ht] 
  \centering

<<fig_nomosDiff1>>=
# Save the A_min = 1.0 difference map next to the TeX sources.
if (overwriteFigures)
  my.ggsave(texWd, "fig_nomosDiff1.png", plot = nomosDiffPlot1)
@ 

  \includegraphics{fig_nomosDiff1}
  \caption{Difference between MLCT (no mosaic, $A_{min}=1.0$) and Agland2000 crop}
  \label{fig:nomosDiff1} 
\end{figure} 

\begin{figure}[ht] 
  \centering

<<fig_nomosDiff05>>=
# Save the A_min = 0.5 difference map next to the TeX sources.
if (overwriteFigures)
  my.ggsave(texWd, "fig_nomosDiff05.png", plot = nomosDiffPlot05)
@ 

  \includegraphics{fig_nomosDiff05}
  \caption{Difference between MLCT (no mosaic, $A_{min}=0.5$) and Agland2000 crop}
  \label{fig:nomosDiff05} 
\end{figure} 

\autoref{fig:nomosDiff1} and \autoref{fig:nomosDiff05} show the
cell-by-cell differences between the MLCT-derived data set that we
have calculated after mosaic decomposition and the Agland2000 cropland
map.  To summarize and compare these errors we calculate the root of
the mean squared error (RMSE) given by:

$$
\operatorname{RMSE}=\sqrt{\frac{\sum_{i=1}^{n}(\hat\theta_i-\theta_i )^2}{n}}
$$

where $\hat\theta_i$ are the predictions derived from the respective
MLCT derivations and $\theta_i$ are the observations taken from the
Agland2000 data set.


@
<<rmse>>=


# RMSE of the MLCT crop band against the Agland2000 crop layer, one row per
# brick (A_min = 0.5 first, then A_min = 1.0).
rmseDf <- ldply(list("nomos05", "nomos1"), function(brickName) {
  mlctCrop <- getPeelBand(get(brickName), "crop")
  rmseRast(mlctCrop, aglandCrop)
})

# Prepend the A_min values and give both columns LaTeX-ready headers.
rmseDf <- cbind(c(0.5, 1.0), rmseDf)
colnames(rmseDf) <- c("$A_{min}$", "RMSE")

# One row per grid cell: MLCT crop fraction for both A_min values, the
# Agland2000 crop fraction, the masked cell area in acres, and the cell
# coordinates.
cropScatDf <-
  data.frame(as(stack(getPeelBand(nomos05, "crop"),
                      getPeelBand(nomos1, "crop"),
                      aglandCrop,
                      raster::mask(acres, cusaMask)),
                "SpatialGridDataFrame"))
colnames(cropScatDf) <-
  c("nomos05", "nomos1", "agland", "acres", "lon", "lat")

# Relative cell area in (0, 1].  acres is NA outside the mask, so the maximum
# must ignore missing values; otherwise one NA cell would turn every weight
# into NA.
cropScatDf$weight <- with(cropScatDf, acres / max(acres, na.rm = TRUE))


# Hexbin plot of the MLCT crop fraction (A_min = 1.0, y axis) against the
# Agland2000 crop fraction (x axis), with 0.025-wide bins, a log10 fill
# scale limited to 10..10000 and a faint reference line from geom_abline().
# Only built when figures are being regenerated; the fig_hexplot05 chunk
# re-uses this object with a different y aesthetic.
if( overwriteFigures) hexPlot1 <-
  ggplot( data= cropScatDf,
       aes( agland, nomos1)) +
  stat_binhex( binwidth= c( 0.025, 0.025)) +
  scale_fill_gradientn( colours= brewer.pal( 6, "YlGn"),
                       trans= "log10",
                       limits=c( 10, 10000)) +
  geom_abline( alpha=0.4) +
  # axis expansion of 0.0125 is half the 0.025 bin width
  scale_x_continuous( "Agland2000",
                     expand= c( 0,0.0125)) +
  scale_y_continuous( expression( paste("MLCT, ", A[min] == 1.0)),
                     expand= c( 0,0.0125)) +
  theme_bw( base_family= "serif") +
  coord_equal() +
  opts( panel.grid.minor= theme_blank(),
        panel.grid.major= theme_blank(),
        panel.background= theme_blank())

@


\begin{figure}[ht]
  \centering

<<fig_hexplot1>>= 

# Save the A_min = 1.0 hexbin plot as a PDF next to the TeX sources.  The
# device argument is spelled out in full (it was abbreviated to `dev`) to
# match the other my.ggsave() calls and avoid partial argument matching.
if (overwriteFigures) {
  my.ggsave(texWd, "fig_hexPlot1.pdf",
            device = pdf,
            plot = hexPlot1)
            ## width= 4.5,
            ## height= 4.5)
}

@ 

\includegraphics{fig_hexPlot1}
  \caption{Hexbin plot of MLCT crop ($A_{min}=1.0$, no mosaic) versus Agland2000 cropland}
  \label{fig:hexplot1} 
\end{figure} 

To examine the relationships between the distributions of cropland
that we derive from the MLCT data relative to the Agland2000 data we
will use ``hexbin'' plots which are essentially two-dimensional
histograms that show the number of grid cells that occur within
discrete regions of the space defined by coordinates that are cropland
fractions for the two data sets.  This operates much like a common
scatter plot but for data sets with as many observations as we wish to
include it gives a cleaner representation of that structure.  For our
plots we have chosen to employ a logarithmic scale because of the wide
range of counts calculated for the bins.  This gives a more complete
picture of the overall dispersion and local concentration of the
observations.  Our first example of such a plot is
\autoref{fig:hexplot1} which plots the crop fractions of MLCT with
$A_{min}=1$ versus those of the Agland2000 crop map.  As one would
expect there is an overall correlation among these variables,
especially given that Agland2000 provides prior probabilities to the
MLCT classification.  It is clear that the MLCT primary class
exhibits a positive bias overall, although a subset that is negatively
biased is also apparent for low values of the Agland2000 crop fraction
in the interval $[0.1,0.5]$.  Also of particular note is the drastic
decrease in correlation when Agland2000 reaches 1.0 relative to the
stronger relationship over the interval $[0.8,1.0)$.  It is difficult
to speculate on the nature of this structure, but suffice it to say
that there is something peculiar about the Agland2000 allocation
procedure that drives the crop fraction to its maximum in areas where
the remote sensing data clearly resists such a characterization.  This
may be caused by systematic errors in the agricultural census data
that drive the Agland2000 algorithm forcing unrealistically high
concentrations in order to satisfy the algorithm's constraints. 
 

\begin{figure}[ht]
  \centering


<<fig_hexplot05>>= 

# Re-use the A_min = 1.0 hexbin plot, swapping in the A_min = 0.5 series on
# the y axis together with a matching axis title, and save it as a PDF.
if (overwriteFigures) {
  my.ggsave(
    texWd, "fig_hexPlot05.pdf",
    device = pdf,
    plot = hexPlot1 +
      aes(agland, nomos05) +
      scale_y_continuous(
        expression(paste("MLCT, ", A[min] == 0.5)),
        limits = c(0, 1),
        breaks = seq(0, 1, by = 0.2),
        expand = c(0, 0.0125)))
}


@ 

\includegraphics{fig_hexPlot05}
  \caption{Hexbin plot of MLCT crop ($A_{min}=0.5$, no mosaic) versus Agland2000 cropland}
  \label{fig:hexplot05} 
\end{figure} 

@ 
<<table_rmse, results=tex, eval=TRUE>>=

# RMSE table as LaTeX: A_min printed with one decimal, RMSE with three; the
# leading 0 in digits belongs to the (suppressed) row-name column.
print(
  xtable(rmseDf,
         caption = "RMSE, MLCT vs. Agland2000 crop",
         label = "tab:rmse",
         digits = c(0, 1, 3)),
  include.rownames = FALSE,
  # headers are already LaTeX: pass them through unescaped
  sanitize.colnames.function = identity)

@ %def 

\todo{Consider also calculating statistical bias (average error)?}

We expect that setting $A_{min}=1$ will produce a maximum overall bias
and attendant error by assigning entire pixels to the cropland class
and not allowing for the possibility of mixed covers.  The results in
\autoref{tab:rmse} indicate that $A_{min}=0.5$ is more representative
of the distribution of cropland because although the total area
indicated is higher according to \autoref{tab:areas}, there is less
error on a cell-by-cell basis indicating that it does a better job of
representing the spatial distribution than $A_{min}=1.0$.  This is
reflected in the structure revealed by \autoref{fig:hexplot05} where
fewer cells in the MLCT data are set at 100\% crop because of
including the secondary class in calculating $5'$ coverage fractions.
Where crop was included in a secondary class it also caused cells of
near-zero value for MLCT to lift away from the x-axis.  The
uncorrelated observations for Agland2000 equal to 1.0 are still
present, however.  This result is adequate for our purposes to
determine that our logic in considering the secondary class in the
manner we have for $A_{min}=0.5$ is correct.  From this point forward
we will consider only the statistics derived from setting
$A_{min}=0.5$ for the aggregation of the MLCT data due to this
improved fit with Agland2000 cropland and its full consideration of
all information imparted by the MLCT data.


\section{NLCD Offsets}
\label{sec:nlcd_offsets}


From \autoref{tab:areas} it is apparent that the MLCT results are
negatively biased in the total areas assigned to water, wetland, and
urban features relative to the NLCD.  It is clear from visual
inspection that features of these classes tend to have smaller
characteristic dimensions which causes them to be overlooked in the
MLCT data due to its resolution.  The most obvious example is the
rural transportation networks in areas surveyed under the Public Land
Survey System (PLSS) where roads have been laid out on a generally
regular grid of square miles.  In the PEEL classification this
infrastructure is included in the urban class as another form of
developed land, perhaps making ``urban'' somewhat of a misnomer, but
it harks back to its origins in the IGBP classification scheme and provides
a short label, a great convenience in programming.  It is important to
represent wetlands and water features in our input to the PEEL model
because these areas have high likelihoods of being set aside for
conservation purposes, which would be represented as a constraint on
land conversion in the model.  In the event that NLCD overestimates
these areas it would be an acceptable error to carry over to the PEEL
model in order to be conservative in allowing for conservation
measures in a greater number of grid cells, absent more precise LULC
data with respect to the water and wetland classes.

To merge this information from the NLCD we begin by simply accepting
the areas for water, wetland, and urban classes in the reclassified,
$5'$-aggregated version of NLCD that we have computed as truth and
calculate offsets for those classes versus our $5'$ MLCT data by
straight subtraction.  Where NLCD is greater the difference will be
positive and so a positive offset will be added to the fraction
already present for any one of the ``truth'' classes from NLCD.  The
other classes are then adjusted so that they are present in proportion
to each other as indicated by MLCT but in the area remaining after
accepting the water, wetland, and urban areas from NLCD.  The additive
offsets needed to achieve this balance and account for the entire area
of the cell are calculated so that the effects of this process on all
classes may be considered on a common basis.  

For the calculation of the offsets we drop back to the result of
aggregating the MLCT data to $5'$ with $A_{min}=0.5$ prior to mosaic
decomposition.  Presumably there are rural roads comprised of 30m NLCD
pixels cutting through the lower-resolution MLCT pixels including
those classified as mosaic.  In fact, by its very nature as a hybrid
class made up of natural cover and agricultural land use we expect
roads to be an important component of the landscape.
\autoref{fig:offsets1} and \autoref{fig:offsets2} show the spatial
distributions of the offsets calculated based on our assumptions about
the water, wetland, and urban classes in the NLCD.  We have verified
that these offsets sum to zero for each grid cell.  Any area deducted
from one class must be added to one or more classes in the same cell
in order to conserve the total area and maintain the sum of the
fractions at 1.0.

@
<<offsets_calc>>=

# Indicator stacks with one layer per PEEL class.  nlcdKeep is 1 for the
# classes taken from NLCD (water, wetland, urban) and 0 for the rest;
# nlcdIgnore is its complement.
nlcdKeep <- stack(llply(names(peelClasses), function(cover) {
  if (cover %in% c("water", "wetland", "urban")) {
    ones
  } else {
    zeroes
  }
}))

nlcdIgnore <- stack(llply(names(peelClasses), function(cover) {
  if (cover %in% c("water", "wetland", "urban")) {
    zeroes
  } else {
    ones
  }
}))

# Offsets for the classes accepted from NLCD: NLCD minus MLCT, zeroed for
# every other class.
nlcdKeepOffsets <- (nlcd - agg05) * nlcdKeep

# MLCT fractions of the remaining classes only.
mlctKeep <- agg05 * nlcdIgnore

# Compensating offsets for the remaining classes: the negated sum of the
# NLCD offsets in a cell is shared out in proportion to each class's part of
# the remaining MLCT fractions.  Where a class and the sum of the remaining
# fractions are both zero the offset is 0.
nlcdIgnoreOffsets <-
  overlay(mlctKeep, sum(mlctKeep), sum(nlcdKeepOffsets),
          fun = function(frac, fracSum, keepOffsetSum) {
            ifelse(frac == 0 & fracSum == 0,
                   0,
                   -1 * frac / fracSum * keepOffsetSum)
          })

nlcdOffsets <- nlcdKeepOffsets + nlcdIgnoreOffsets

nlcdOffsets <- writeRaster(nlcdOffsets,
                           filename = file.path(rasterWd, "nlcdOffsets.tif"),
                           overwrite = TRUE)

# Append the per-cell sum of all offsets as a tenth, "total" layer.
nlcdOffsets <- stack(nlcdOffsets, sum(nlcdOffsets))
layerNames(nlcdOffsets) <- c(names(peelClasses), "total")

# Offsets restricted to the thumb bounding box.
thumbNlcdOffsets <- crop(nlcdOffsets, thumbExtent)

# The full-extent offset maps are split over two figures: layers 1-5, then
# layers 6-10 (the tenth layer being "total").
offsetsMap1 <-
  coverDiffMaps(nlcdOffsets,
                samp = 0.4,
                classes = layerNames(nlcdOffsets)[1:5]) +
  coord_equal()

offsetsMap2 <-
  coverDiffMaps(nlcdOffsets,
                samp = 0.4,
                classes = layerNames(nlcdOffsets)[6:10]) +
  coord_equal()

# Thumb map: every layer except the tenth ("total"), one facet per layer.
thumbOffsetsMap <-
  coverDiffMaps(thumbNlcdOffsets,
                classes = layerNames(thumbNlcdOffsets)[-10]) +
  facet_wrap(~variable)


@ 

\begin{figure}[h]
  \centering

@ 
<<fig_offsetsmap1>>=

# Save the first offsets figure next to the TeX sources.
if (overwriteFigures)
  my.ggsave(texWd, "fig_offsets1.png", plot = offsetsMap1, height = 7)

@ %def 

\includegraphics{fig_offsets1}
  \caption{NLCD offsets}
  \label{fig:offsets1} 
\end{figure} 


\begin{figure}[h]
  \centering

@ 
<<fig_offsets2>>=

# Save the second offsets figure next to the TeX sources.
if (overwriteFigures)
  my.ggsave(texWd, "fig_offsets2.png", plot = offsetsMap2, height = 7)

@ %def 

\includegraphics{fig_offsets2}
  \caption{NLCD offsets (cont.)}
  \label{fig:offsets2} 
\end{figure} 


The maps of these offsets are shown using a logarithmic scale in
order to bring attention both to areas of significant adjustment,
greater than 10\%, as well as to show the extent to which small
adjustments on the order of 1--5\% occur.  From these maps we can see
the detailed structure of drainage networks in the water class and
population centers in the urban class which could easily be confused
with the vegetative classes in the MLCT classification.  This refers,
for example, to heavily wooded suburbs where transportation
infrastructure is obscured and difficult to resolve.  The offsets for
the NLCD truth classes are generally positive, although not strictly
so because the algorithm does not preclude the possibility that MLCT
may locally overestimate these classes in particular regions and still
suffer an aggregate deficit relative to NLCD.
@ 
<<cor_offsets>>=

# Correlation matrix of the per-class offsets over all grid cells.  Only the
# first nine columns are used, and rows with any missing value are dropped.
corOffsets <-
  cor(data.frame(as(nlcdOffsets, "SpatialGridDataFrame"))[, 1:9],
      use = "complete.obs")
dimnames(corOffsets) <- list(names(peelClasses), names(peelClasses))

# Heat map of the offsets correlation matrix: melt() turns the matrix into
# long form (X1, X2, value) and each pair of classes is drawn as one tile.
# Both axes are ordered by colnames(corOffsets); the fill uses the reversed
# BrBG palette with limits running from 1 down to -1.
corOffsetsPlot <- 
  ggplot( melt( corOffsets),
         aes( x=X1, y=X2, fill= value)) +
  geom_tile() +
  theme_bw( base_family= "serif") +
  # no grid, background or axis titles; x tick labels rotated by 90 degrees
  opts( panel.grid.minor= theme_blank(),
       panel.grid.major= theme_blank(),
       panel.background= theme_blank(),
       axis.title.x= theme_blank(),
       axis.text.x= theme_text( angle= 90, hjust=1),
       axis.title.y= theme_blank()) +
  scale_x_discrete( limits= colnames( corOffsets)) +
  scale_y_discrete( limits= colnames( corOffsets)) +
  scale_fill_gradientn( "", colours= rev( brewer.pal( 11, "BrBG")), 
                       limits= c( 1.0, -1.0),
                       breaks= seq( 1.0, -1.0, by= -0.2))

# Save the correlation heat map into the TeX directory.  The previous
# working directory is restored in a `finally` clause so that a failing
# ggsave() cannot leave the session stranded in texWd.
if (overwriteFigures) {
  oldWd <- setwd(texWd)
  tryCatch(
    ggsave("fig_corOffsets.pdf",
           device = pdf,
           plot = corOffsetsPlot,
           height = 4.5,
           width = 4.5),
    finally = setwd(oldWd))
}


@ %def 


\begin{figure}[ht]
  \centering
    \includegraphics{fig_corOffsets}
  \caption{Correlation matrix of NLCD offsets}
  \label{fig:corOffsets} 
\end{figure} 

\autoref{fig:corOffsets} shows the result of calculating a matrix of
correlations among the offsets calculated for each class based on
NLCD.  Each cell in the matrix reflects the value of the statistical
correlation between the corresponding classes within the NLCD offset
maps resulting from the algorithm described above.  This gives us an
overall sense of the effect of applying these offsets by showing which
changes are strongly correlated, whether it be positively or
negatively.  It is a symmetric matrix because the classes on both axes
are from the same data set and any single class is, of course,
perfectly correlated with itself.  Going in we would expect to see
negative correlations between classes accepted as truth from NLCD,
water, wetland, and urban, and the other classes because the purpose
of applying these offsets was to bring the total areas of these
classes up, which can only happen at the expense of the other classes.
For example the wetland offsets show strong negative correlations with
forest, shrub, and mosaic.  This stands to reason as a likely problem
with classification due to fundamental differences in the remote
sensing data such as resolution, the interpretation thereof, or
disagreement/overlap in class definitions.  Many areas of forest
and shrub land can exhibit properties of a wetland when standing water
and high soil moisture are persistent.  The ``NLCD truth'' classes'
offsets are positively correlated with one another because they are
generally positive everywhere.  Likewise, non-truth classes are
positively correlated with one another because they are all being
assigned negative offsets to make room for the increased values of
water, wetland, and urban fractions.  The crop and mosaic classes are
most strongly negatively correlated with urban which reflects the
widespread adjustments to account for rural transportation networks
and smaller settlements.

The resulting offsets are added to the aggregated fractions calculated
from the MLCT with $A_{min}=0.5$.  The mosaic decomposition step is
readily applied to the adjusted data set because the adjusted
fractions are fundamentally in the same form as the intermediate form of
the MLCT data calculated in \autoref{sec:decomposition}, only the
values have changed.


<<cusa_offset>>=

## Add the NLCD-derived offsets to the A_min = 0.5 aggregation and run
## the mosaic decomposition on the adjusted fractions.
setwd( rasterWd)

                                        # reload offsets to get rid
                                        # of total layer
nlcdOffsets <- brick( file.path( rasterWd, "nlcdOffsets.tif"))
layerNames( nlcdOffsets) <- names( peelClasses)

mlctAdj <- list( Amin=0.5)
mlctAdj$agg <-
  if( overwriteRasters) {
                                        # the relative file name resolves
                                        # against the setwd( rasterWd) above
    overlay( agg05, nlcdOffsets,
            fun= sum,
            filename= "agg05Adj.tif",
            overwrite= TRUE)
                                        # reload by exact path: list.files()
                                        # treats its pattern as a regex and can
                                        # return zero or several matches (e.g.
                                        # an agg05Adj.tif.aux.xml sidecar)
  } else brick( file.path( rasterWd, "agg05Adj.tif"))

layerNames( mlctAdj$agg) <- names( peelClasses)

                                        # decomposeMosaic() is defined outside
                                        # this chunk; its result replaces the
                                        # list and is later read as $nomos
mlctAdj  <- decomposeMosaic( mlctAdj, overwrite= overwriteRasters, progress= "text")

@ 


<<areas2>>=

# reuse area table code from above; better to implement a function?

                                        # maps to tabulate, written as R
                                        # expressions; the same strings serve
                                        # as map labels and column names below
rasterNames2 <- c( "agland", "nlcd", "agg05", "nomos05",
                  "nlcdOffsets", "mlctAdj$agg", "mlctAdj$nomos")

                                        # lapply() always yields a list, whereas
                                        # the result type of sapply() depends on
                                        # its input.  eval( parse()) is kept
                                        # because some entries are `$`
                                        # expressions, not plain variable names.
dataSets2 <- lapply( rasterNames2,
  function( n) eval( parse( text=n)))
names( dataSets2) <- rasterNames2

                                        # per map: each layer times `acres`,
                                        # summed over all cells, named by layer
areas2 <- llply( dataSets2,
  function( d) {
    res <- cellStats( d *acres, sum)
    names( res) <- layerNames( d)
    res
  })

                                        # long format, one row per map and class
areasDf2 <- ldply( areas2, function( a) {
  melt( t( as.data.frame( a)))
})
                                        # keep the three columns that are
                                        # renamed map, class and acres
areasDf2 <-
  areasDf2[, c( 1, 3, 4)]
colnames( areasDf2) <-
  c( "map", "class", "acres")

                                        # fix the factor orders used by the
                                        # table and the stacked bar chart
areasDf2 <-
  transform( areasDf2,
            class= factor( class,
              levels= c( names( peelLegend)[ rev( legendOrder)], "total")),
              ## c("crop", "open",
              ##   names( peelClasses)[-c(4,6,8)],
              ##   "mosaic", "total")),
            map= factor( map,
              levels= rasterNames2))

                                        # wide table, class by map, with a grand
                                        # total row; the "total" layer is left
                                        # out of the sums
areasCt2 <- cast( areasDf2,
                 class ~ map,
                 subset= class != "total",
                 value= "acres",
                 sum,
                 margins="grand_row")

                                        # move the class column into the row
                                        # names, then order rows and columns
rownames( areasCt2) <- areasCt2[, "class"]
areasCt2 <- areasCt2[, -1]
areasCt2 <- areasCt2[ c( names( peelClasses), "(all)"), rasterNames2]

@ 


<<restack_check>>=

## check that everything balances
## output of decomposeMosaic is not brick()ed properly
## in the sense that the layer set is incomplete
## and out of order
  

restack <- function( peelBrick) {
  u <- unstack( peelBrick)
  names( u) <- layerNames( peelBrick)
  r <- do.call( stack,
          llply( names( peelClasses),
                function( cover) {
                  if( is.null( u[[ cover]]))
                    zeroes
                  else
                    u[[ cover]]
                }))
  layerNames( r) <- names( peelClasses)
  r
}
                                        # restack() takes any of the bricks/stacks from
                                        # previous functions and rearranges the layers
                                        # to match the PEEL classes, inserting layers of
                                        # zeroes as needed


restackOverlay <- function( rasterList, fun) {
  l <- llply( rasterList, restack)
  names( l) <- NULL
  do.call( overlay, c( l, fun=fun))
}
                                        # restackOverlay() runs its arguments through restack()
                                        # and applies a function to its outputs

@ 


@ 
<<table_restack_check, results=tex, eval=FALSE>>=

                                        # per-class balance: adjusted no-mosaic
                                        # fractions minus the decomposition
                                        # delta, the NLCD offsets and the
                                        # original aggregation
                                        # NOTE(review): c() here mixes a list
                                        # with raster objects -- confirm it
                                        # yields the four-element list that
                                        # restackOverlay() expects
check <- restackOverlay( c( mlctAdj[ c("nomos", "delta")],
                           nlcdOffsets,
                           agg05),
                        function( n, d, o, a) n-d-o-a)
layerNames(check) <- names( peelClasses)

                                        # one row per class with the extreme
                                        # values of the balance raster
checkTable <-
  xtable( cbind( class=peelClasses,
                min=minValue( check),
                max=maxValue(check)),
         caption= "Balance of adjustment fractions and original MLCT aggregation", 
         label= "tab:restack_check")
                                        # negative digits ask xtable for
                                        # scientific notation in min and max
digits( checkTable) <- c( 0, 0,-2,-2)
print( checkTable)
  
@ %def 

To assess whether the process of adding in the NLCD offsets has
improved overall cropland accuracy we can perform the same error
calculation from above and extend Table~\ref{tab:rmse} with the new
result, giving us Table~\ref{tab:rmse2}.

@ 
<<table_rmse2, results=tex, eval=TRUE>>=

                                        # add the RMSE for the new crop map
                                        # and an indication of the NLCD offsets' presence
                                        # The new row (A_min 0.5, RMSE of the
                                        # adjusted no-mosaic crop band against
                                        # aglandCrop) goes on top of rmseDf; the
                                        # offset flag is TRUE for it only.
                                        # NOTE(review): the three flags assume
                                        # rmseDf has exactly two rows -- confirm
  
rmseDf2 <-
  cbind( offset=c( TRUE, FALSE, FALSE),
        rbind( c( 0.5,
                 rmseRast( getPeelBand( mlctAdj$nomos, "crop"),
                          aglandCrop)),
              rmseDf))

##                                         # add the RMSE for the open class
## rmseDf2 <-
##   cbind( rmseDf2,
##         rmseOpen=ldply( list(mlctAdj$nomos, nomos05, nomos1),
##                 function( brickVar) {
##                   rmseRast( getPeelBand( brickVar, "open"),
##                            unstack( agland)[[ 2]])
##                 }))
## colnames(rmseDf2)[ c(3,4)] <- c( "$RMSE_{crop}$", "$RMSE_{open}$")

           
                                        # column names are emitted unsanitized
                                        # so that LaTeX markup in them survives
print( xtable( rmseDf2,
              caption= "RMSE, MLCT vs. Agland2000 crop with NLCD offsets",
              label= "tab:rmse2",
              digits= c( 0, 0, 1, 3)),
      include.rownames= FALSE,
      sanitize.colnames.function= function(x) x)

@ %def 

% \todo[caption=Should the RMSE tables be rearranged?]{Would it make
%   more sense to have the row order and independent variables (first
%   three) reversed in Table \ref{tab:rmse} and \ref{tab:rmse2}?}

% \todo[caption=Should references to the pasture/open data from
% Agland2000 be removed?]{That data is not used in the analysis,
%   although it is interesting to see that its RMSE declines as well.}

Seeing that this modification to the data set has improved our overall
accuracy of the distribution of croplands the next step is to examine
the total areas for all classes compared with the input data sets.  


<<tab_areas2, results=tex, eval=TRUE>>=


local({
  colnames( areasCt2) <- c( "Agland2000", "NLCD", "MLCT", 
                           "\\pbox[c][][c]{3in}{MLCT\\\\No Mosaic}",
                           "\\pbox[c][][c]{3in}{NLCD\\\\Offsets}", 
                           "\\pbox[c][][c]{3in}{MLCT\\\\Adjusted}",
                           "\\pbox[c][][c]{3in}{\\smallskip{}MLCT\\\\Adjusted\\\\No Mosaic}")
  print( xtable( areasCt2 / 10^6, 
                caption= "Effect of NLCD offsets on total acreages, $A_{min}=0.5$",
                label= "tab:areas2",
                digits= 1),
        size= "small",
        add.to.row= list( 
          pos= list( 0, nrow( areasCt)),
          command= rep("\\noalign{\\smallskip}", times= 2)),        
        sanitize.colnames.function= function(x) x)
})

                                        # Stacked bar chart of acreage by map
                                        # and class, built only when figures
                                        # are being regenerated.  The "total"
                                        # class and the nlcdOffsets map are
                                        # excluded from the data; acreages are
                                        # summed per map and class and shown
                                        # divided by 10^6.
if( overwriteFigures) areasPlotAdj <-
  qplot( map, acres /10^6,
        data= subset( areasDf2,
          class != "total" & map != "nlcdOffsets"),
        geom="bar", position= "stack",
        fill= class,
        stat="summary", fun.y="sum") +
                                        # class colours and legend order come
                                        # from peelLegend / legendOrder
  scale_fill_manual( "",
                    values= peelLegend[ legendOrder],
                    breaks= names( peelLegend)[ legendOrder]) +
  scale_y_continuous( "M acres",
      limits= c(0,2000)) +
  theme_bw( base_family= "serif") +
                                        # one plotmath label per remaining map,
                                        # in the order of rasterNames2; the
                                        # phantom(0) pads the third label to
                                        # the height of the others
  scale_x_discrete( "",
      limits= rasterNames2[ rasterNames2 != "nlcdOffsets"],
      breaks= rasterNames2[ rasterNames2 != "nlcdOffsets"],
      labels= expression("Agland2000", "NLCD",
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==0.5)),
               phantom(0)),
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==0.5)),
               "No Mosaic"),
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==0.5)),
               "Adjusted"),
          atop( atop( textstyle( "MLCT"),
                     textstyle( A[ min] ==0.5)),
               scriptstyle( "Adjusted, No Mosaic"))))

cropScatAdjDf <- 
  data.frame( as( stack(getPeelBand( mlctAdj$nomos, "crop"),
                        aglandCrop,
                        raster::mask(acres, cusaMask)),
                 "SpatialGridDataFrame"))
colnames(cropScatAdjDf) <-
  c( "mlctAdj", "agland", "acres", "lon", "lat")
cropScatAdjDf$weight <- with( cropScatAdjDf, acres/ max(acres))

@


\begin{figure}[ht]
  \centering

<<fig_offsets>>=
 
if( overwriteFigures) {
  offsetsPlot <-
    qplot( class, acres /10^6, 
        data= subset( areasDf2,
          map == "nlcdOffsets" & class != "total"),
        geom= "bar",
        fill= class) +
    scale_fill_manual( "",
        values= peelLegend, 
        breaks= names( peelLegend)) +
    scale_y_continuous( "Ma", limits=c( -50, 80)) +
    scale_x_discrete( "", breaks= c( names( peelClasses), "total")) +
    coord_flip() +
    theme_bw( base_family= "serif") +
    opts( legend.position= "none")
  setwd( texWd)
  ggsave( "fig_offsets.pdf",
         device= pdf,
         plot= offsetsPlot,
         height= 4.5,
         width= 4.5)
}


@ %def 

  \includegraphics{fig_offsets}
  \caption{Total offsets calculated from NLCD}
  \label{fig:offsets}
\end{figure}


\autoref{fig:offsets} shows the totals by class of the offsets that
result from this calculation.  The item labeled ``total'' appears
blank because a value of zero is plotted there indicating that area
was conserved in this operation, which is to say that area subtracted
from one class was reallocated to another.  As expected, the most
significant offset was for the urban class, representing the
low-density infrastructure outside of concentrations of development
large enough and dense enough to be identified in the MLCT
classification.  Water and wetland fractions were also increased to
bring the total areas of those classes in line with NLCD.  However,
the most important outcome with respect to our stated objective of
bringing total cropland areas in line with the total from Agland2000
is the reduction of crop areas by
\Sexpr{printAreas(-areasCt2["crop","nlcdOffsets"])} and mosaic areas
by \Sexpr{printAreas(-areasCt2["mosaic","nlcdOffsets"])}.  This will
result in a total reduction of
\Sexpr{printAreas(-areasCt2["crop","nlcdOffsets"]-areasCt2["mosaic","nlcdOffsets"]/2)}
of the final crop class after mosaic decomposition because mosaic land
is taken to be half cropland by definition.  \autoref{fig:areasAdj}
shows the effect of adding these offsets and subsequently performing
the mosaic decomposition operation, which brings the cropland area for
the PEEL input data set into closer agreement with Agland2000,
\Sexpr{printAreas(areasCt2["crop","mlctAdj$nomos"])} and
\Sexpr{printAreas(areasCt2["crop","agland"])} respectively. The
significance of this result is that it is in no way conditioned by the
desired cropland area estimate, but rather shows a convergence in these
estimates by selectively incorporating information about other classes
from an independent data set, namely NLCD.

\begin{figure}[ht]
  \centering


@ 
<<fig_areasAdj>>=

if( overwriteFigures) {
  my.ggsave( texWd, "fig_areasAdj.pdf",
            device= pdf,
            plot= areasPlotAdj,
            width=  6,
            height= 6)
}

@ %def
  \includegraphics{fig_areasAdj}
  \caption{Total acreages after NLCD adjustment}
  \label{fig:areasAdj}
\end{figure}


\begin{figure}[ht] 
  \centering

<<fig_hexPlotAdj>>= 

                                        # Reuse the existing hexPlot1 with the
                                        # adjusted-crop data frame swapped in
                                        # (%+%) and the axes remapped to agland
                                        # (x) versus mlctAdj (y).
if( overwriteFigures) {
  my.ggsave( texWd, "fig_hexPlotAdj.pdf",
         device= pdf,
         plot= hexPlot1 %+% cropScatAdjDf +
              aes( agland, mlctAdj) +
              scale_x_continuous( "Agland2000",
                  expand= c( 0,0.0125)) +
                                        # phantom( A[min]) adds invisible text
                                        # to the y label
                                        # NOTE(review): presumably to match the
                                        # label width of a sibling plot --
                                        # confirm
              scale_y_continuous( expression( paste( "MLCT Adjusted", phantom(A[min]))),
                  limits= c( 0, 1),
                  breaks= seq( 0, 1, by= 0.2),
                  expand= c( 0,0.0125)) +
              coord_equal())
}

@ 
    \includegraphics{fig_hexPlotAdj}
  \caption{Hexbin plot of MLCT adjusted crop versus Agland2000 cropland}
  \label{fig:hexPlotAdj} 
\end{figure} 


\section{Fusion of Adjusted MLCT and Agland2000}
\label{sec:fusion}

<<fusion>>=

## thumbAgland <- crop( agland,
##                     extent(-83.5, -(82+25/60), 42+55/60, 44+5/60),
##                     filename= "thumbAgland.tif",
##                     progress="text")

nomosCrop <- getPeelBand( mlctAdj$nomos, "crop")
aglandCrop <- unstack( agland)[[ 1]]

## nomosTruth is the sum of the classes from the NLCD offsets.
## other classes adjusted from now on cannot exceed $1 - nomosTruth

if( overwriteRasters) {
  nomosTruth <- overlay( getPeelBand( mlctAdj$nomos, "water"),
                        getPeelBand( mlctAdj$nomos, "wetland"),
                        getPeelBand( mlctAdj$nomos, "urban"),
                        fun= sum,
                        filename= "nomosTruth.tif",
                        overwrite= TRUE)
} else nomosTruth <-
  raster( list.files( dataPath,
                    patt="nomosTruth.tif",
                    full.names= TRUE)) 


                                        # class names of the adjusted no-mosaic
                                        # data, without the ninth layer
nomosClasses <- layerNames( mlctAdj$nomos)[ -9]
                                        # leaves out 'total'
                                        # mosaic is already gone

## This is the overlay() bug.  These values of peelCrop should be the same
## but they're not.
## NOTE(review): the difference may not be an overlay() bug.  If
## overlay() passes whole vectors of cell values to fun, min() collapses
## them to a single number; pmin() would be the element-wise form --
## confirm against the raster documentation.

## peelCrop <-
##   overlay( aglandCrop, nomosCrop, nomosTruth, fun=
##           function( a, n, t) {
##             ifelse( is.na( a), n, min( a, 1 -t))
##           },
##           filename= "peelCrop.tif",
##           overwrite= TRUE)

                                        # Final crop fraction: where Agland2000
                                        # is NA keep the MLCT-derived crop
                                        # value; elsewhere take the Agland2000
                                        # value, capped at one minus the
                                        # water/wetland/urban total.
peelCrop <-
  if( overwriteRasters) {
    calc( stack( aglandCrop, nomosCrop, nomosTruth), fun=
         function( st) {
                                        # st indexes the stack's layers in
                                        # order: agland, nomos crop, truth sum
           a <- st[ 1]
           n <- st[ 2]
           t <- st[ 3]
           ifelse( is.na( a), n, min( a, 1 -t))
         },
         filename= "peelCrop.tif",
         overwrite= TRUE)
                                        # otherwise reuse the file from a
                                        # previous run (relative to the current
                                        # working directory)
  } else raster( "peelCrop.tif")


offsetStack <-
  stack( llply( nomosClasses,
               function( class) {
                 if( class =="crop")
                   peelCrop
                 else
                   zeroes
               }))


noncropFactor <-
  overlay( peelCrop, nomosCrop, nomosTruth, fun=
          function( p, n, t) {
            ifelse( 1 -n -t <= 0,
                   0,
                   ( 1 -p -t) /( 1 -n -t))
          })


factorStack <- 
  stack( llply( nomosClasses,
               function( class) {
                 if( class == "crop")
                   zeroes
                 else if( class %in%
                         c( "water", "wetland", "urban"))
                   ones
                 else
                   noncropFactor
               }))

## stupid overlay bug!
##
## aglandComplete <-
##   if( overwriteRasters || TRUE) {
##     overlay( stack( unstack( mlctAdj$nomos)[ -9]),
##             factorStack,
##             offsetStack,
##             fun= function( x, m, b) m *x +b,
##             filename= "aglandComplete.tif",
##             overwrite= TRUE,
##             progress= "text")
##   } else brick( list.files( rasterWd,
##                          patt="^aglandComplete.tif$",
##                          full.names=TRUE))
## layerNames( aglandComplete) <- names(peelClasses)[-8]

aglandComplete <-
  stack( unstack( mlctAdj$nomos)[ -9]) *factorStack +offsetStack
aglandComplete <- writeRaster( aglandComplete,
                              "aglandComplete.tif",
                              overwrite= TRUE)
layerNames( aglandComplete) <- names(peelClasses)[-8]

aglandCompleteSum <- sum( aglandComplete)

## lt1AgcTotal <- extract( stack( peelCrop,
##                                nomosCrop,
##                                nomosTruth,
##                                noncropFactor,
##                                aglandComplete,
##                                aglandCompleteSum),
##                        which( aglandCompleteSum[] < 0.999))

## colnames( lt1AgcTotal)[ 1:4] <- c( "p", "n", "t", "factor")
## lt1AgcTotal <- data.frame( lt1AgcTotal)
## lt1AgcTotal <- within( lt1AgcTotal, { term1 <- n-p; term2 <- 1-n-t})

## within( head( lt1AgcShrub), { 


## normalize to fix 65 pixels missing area
## (every layer is divided by the per-cell total, then renamed)
aglandComplete <- aglandComplete /aglandCompleteSum
layerNames( aglandComplete) <- names( peelClasses)[ -8]

                                        # cover maps in two figures of four
                                        # classes each, one facet row per class
agcMap <-
  coverMaps( aglandComplete, 0.4,
            classes= layerNames( aglandComplete)[ 1:4]) +
  coord_equal() +
  facet_grid( variable ~ .)

agcMap2 <-
  coverMaps( aglandComplete, 0.4,
            classes= layerNames( aglandComplete)[ 5:8]) +
  coord_equal() +
  facet_grid( variable ~ .)
                                                                                       

@ 


By bringing the aggregate crop areas of our MLCT-derived data set into
greater agreement with Agland2000 and accepting the water, wetland,
and urban fractions indicated by the NLCD we are now ready for the
final manipulation of the remaining classes, forest, shrub, open, and
barren, that will bring our complete data set into maximum agreement
with Agland2000.  To do this we will be setting the crop fraction
equal to Agland2000's crop value everywhere that this does not
conflict with the allocation indicated by the NLCD offsets.  Anywhere
that there is a conflict such that Agland2000 indicated more cropland
than allowed by the NLCD offsets the crop fraction will be set to one
minus the total of the offsets and all other classes will be set to
zero.  Otherwise the non-offset, non-crop classes will be scaled to
fit proportionally into the remaining area left after incorporating
those classes which we have given primacy.  The assumption in this step
is that the census data behind the Agland2000 crop map is a ``ground
truth'' and Ramankutty's method for allocating that area within the
$5'$ grid is sufficiently faithful to that truth.  The one difficulty
of note in this step is the presence of cells where we have
MLCT/NLCD-derived fractions for LULC but Agland2000 is null, that is
to say that no data is given, indicating that those cells were not
included in the land mass.  In those cases we accept the crop fraction
previously calculated since there is nothing to compare it against.


\begin{figure}[ht] 
  \centering

<<fig_agc>>= 

my.ggsave( texWd, "fig_agc.png",
          plot= agcMap, width=4.5, height=8)
@ 

\includegraphics{fig_agc}
\caption{Final PEEL maps} 
\label{fig:agc} 
\end{figure} 

\begin{figure}[ht] 
  \centering

<<fig_agc2>>= 

my.ggsave( texWd, "fig_agc2.png",
          plot= agcMap2, width=4.5, height=8)
@ 

\includegraphics{fig_agc2}
\caption{Final PEEL cover maps (cont.)} 
\label{fig:agc2} 
\end{figure} 


@


@ 
<<table_rmse3, results=tex, eval=TRUE>>=

setwd( rasterWd)  

                                        # Put the RMSE of the final PEEL crop
                                        # layer on top of rmseDf2 and flag it as
                                        # the only row that uses Agland2000.
                                        # c() turns the TRUE of the new row into
                                        # a number, hence the as.logical() fix
                                        # for the offset column below.
                                        # NOTE(review): rep( FALSE, times=3)
                                        # assumes rmseDf2 has three rows --
                                        # confirm
rmseDf3 <- 
  cbind( agland=c( TRUE, rep(FALSE, times=3)),
        rbind( c( TRUE, 0.5,
                 rmseRast( getPeelBand( aglandComplete, "crop"),
                          aglandCrop)),
              rmseDf2))
rmseDf3 <- within(rmseDf3, offset <- as.logical( offset))
                                        # had to change offset column back
                                        # to true/false;  maybe this can be
                                        # avoided with list() instead of c()

rmseXt <- xtable( rmseDf3,
                 caption= "RMSE of PEEL vs. Agland2000",
                 label= "tab:rmse3",
                 digits= c( 0, 0, 0, 1, 3))
                                        # looks like some kind of bug in xtable()
                                        # manual correction:
                                        # (the two flag columns are copied back
                                        # from the data frame)
rmseXt$agland <- rmseDf3$agland
rmseXt$offset <- rmseDf3$offset

                                        # column names are emitted unsanitized
print( rmseXt,
      include.rownames= FALSE,
      sanitize.colnames.function= function(x) x)

@ %def 

<<tab_areas3, results=tex, eval=TRUE>>=

areasCt3 <- acreageTable( c( rasterNames2[ c( 1, 2, 4, 7)], "aglandComplete"))

local({
  colnames( areasCt3) <-
    c( "Agland2000", "NLCD",
      "\\pbox[c][][c]{3in}{MLCT\\\\No Mosaic}",
      "\\pbox[c][][c]{3in}{\\smallskip{}MLCT\\\\Adjusted\\\\No Mosaic}",
      "PEEL")
  print( xtable( areasCt3 / 10^6, 
                caption= "PEEL acreages, $A_{min}=0.5$",
                label= "tab:areas3",
                digits= 1),
        size= "small",
        add.to.row= list( 
          pos= list( 0, nrow( areasCt)),
          command= rep("\\noalign{\\smallskip}", times= 2)),        
        sanitize.colnames.function= function(x) x)
  ##,
  ##      floating= FALSE)
})

cropScatAgcDf <- 
  data.frame( as( stack(getPeelBand( aglandComplete, "crop"),
                        aglandCrop,
                        raster::mask(acres, cusaMask)),
                 "SpatialGridDataFrame"))
colnames(cropScatAgcDf) <-
  c( "agc", "agland", "acres", "lon", "lat")
cropScatAgcDf$weight <- with( cropScatAgcDf, acres/ max(acres))


                                        # Classify every cell for the thematic
                                        # map.  The assignments run in order and
                                        # later ones overwrite earlier ones, so
                                        # the effective categories are:
                                        #   0  agc is zero
                                        #   1  agc > 0 and agland is NA
                                        #   2  agc > 0, agc not below agland,
                                        #      and agc - agland < 0.1
                                        #   3  agc > 0 and agc below agland
                                        #   4  agland equals 1 (overrides all)
                                        # Cells matching none of these stay NA.
cropScatAgcDf <-
  within( cropScatAgcDf,
         { cat <- NA
           cat[    agc == 0] <- 0
           cat[ agc > 0 & is.na( agland)] <- 1
           cat[ agc > 0 & ( agc -agland) < 0.1] <- 2
           cat[ agc > 0 & agc  < agland] <- 3
           cat[ agland == 1] <- 4
         })

agcThemeMap <-
  ggplot( cropScatAgcDf[ !is.na( cropScatAgcDf$cat),],
         aes( x= lon, y= lat)) +
  geom_tile( aes( fill= factor( cat))) +
  scale_fill_brewer( "",
                    breaks= 0:4,
                    labels= c(
                      "PEEL = 0",
                      "PEEL > 0, Ag2k is null",
                      "PEEL > 0, PEEL = Ag2k",
                      "PEEL > 0, PEEL < Ag2k",
                      "Ag2k = 1")) +
  coord_equal() +
  theme_map


@

\todo{Discuss implications of final area tabulation}
 

\begin{figure}[ht] 
  \centering

<<fig_hexPlotAgc>>= 

if( overwriteFigures) {
  setwd( texWd)
  ggsave( "fig_hexPlotAgc.pdf",
         device= pdf,
         plot= hexPlot1 %+% cropScatAgcDf +
              aes( agland, agc) +
           scale_fill_gradientn( colours= brewer.pal( 6, "YlGn"),
                       trans= "log10",
                       limits=c( 10, 20000)) +
              scale_x_continuous( "Agland2000") +
              scale_y_continuous( "PEEL",
                  limits= c( 0, 1),
                  breaks= seq( 0, 1, by= 0.2)) +
              coord_equal(),
         height= 4.5,
         width= 4.5)
}

@ 
    \includegraphics{fig_hexPlotAgc}
  \caption{Hexbin plot of PEEL crop versus Agland2000 crop}
  \label{fig:hexPlotAgc} 
\end{figure} 


\begin{sidewaysfigure}[ht] 
  \centering

<<fig_agcThemeMap>>= 

if( overwriteFigures) {
  my.ggsave( texWd, "fig_agcThemeMap.png", width=7.5,
            plot= agcThemeMap,
            bg= "transparent")
}

@ 

    \includegraphics{fig_agcThemeMap}
  \caption{Conflicts between NLCD offsets and Agland2000}
  \label{fig:agcThemeMap} 
\end{sidewaysfigure} 


\autoref{fig:agcThemeMap} shows a thematic map that classifies the
cells in our study area according to their agreement on the cropland
fraction between Ramankutty's Agland2000 data set and the newly
created PEEL data set.  The first class indicated by ``PEEL = 0'' in
the legend represents where Agland2000 cropland fraction is zero so
there is no potential for conflict.  The second class shows where the
PEEL crop fraction is greater than zero but Agland2000 is null,
meaning no data was given for those cells.  Such cells generally occur
in coastal areas and on the shores of the Great Lakes, reflecting that
Ramankutty's criteria for counting a cell as ``dry land'' were somehow
more restrictive.  The third class shows where the NLCD ``truth''
classes (water, wetland, urban) allowed us to bring the PEEL crop
fraction in line with Agland2000 without violating the assumption that
those fractions should be carried over from the NLCD aggregation.  The
fourth class reveals where those constraints could not be
simultaneously satisfied.  Those cells correspond to the bins in
\autoref{fig:hexPlotAgc} that fall below the equality line because the
values from the NLCD offsets are given precedence and the crop
fraction is limited accordingly, which of course might mean that other
non-offset, non-crop classes could be summarily reduced to zero.  The
final class highlights pixels that Agland2000 assigns a crop fraction
of 1.0 which seems unrealistic given that some infrastructure and
uncultivated cover must be present within such large areas.

\section{Disaggregation of PEEL Crop Fractions According to 175Crops2000}
\label{sec:peel}


<<crop_cats>>=

setwd( rasterWd)

cropCats <-
  c("cereals", "field_crop", "forage", "maize",
    "rice", "shrub_crop", "soybean", "sugarcane",
    "tree_crop", "wheat")
names( cropCats) <- cropCats


cropCatsPeel <-
  list( crop= c( "cereals", "field_crop", "forage",
                "maize", "rice", "soybean",
                "sugarcane", "wheat"),
       open= NULL,
       shrub= "shrub_crop",
       forest= "tree_crop")
                     
cropCats <- llply(  cropCats, function(c) {
  raster( paste( dataPath,
                paste( c, "tif", sep="."),
                sep= "/"))
})

cropStack <- stack( cropCats)


cropSum <- overlay( stack( cropCats[ cropCatsPeel$crop]),
                   fun= sum)

cropSum[ is.na( cropSum[])
        & !is.na( getPeelBand( aglandComplete,
                              "crop")[])
        ] <- 0
cropSum <- writeRaster( cropSum,
                   file= "cropSum.tif",
                   overwrite= TRUE)

cropNormalFunc <- function( st) {
  cropCat <- st[ 1]
  cropSum <- st[ 2]
  agc <-     st[ 3]
  ifelse( cropSum == 0,
         0,
         cropCat / cropSum)
}

cropNormal <- stack( llply( cropCats[ cropCatsPeel$crop],
                           function( crop) {
                             calc( stack( crop,
                                         cropSum,
                                         getPeelBand( aglandComplete, "crop")),
                                  fun= cropNormalFunc)
                           }))

cropSubClasses <- getPeelBand( aglandComplete, "crop") *cropNormal
layerNames( cropSubClasses) <- cropCatsPeel$crop


@ 


\begin{figure}[ht] 
  \centering

<<fig_cropSubClassesMap>>= 

if( overwriteFigures) {
  cropSubClassesMap <-
    coverMaps( cropSubClasses, 0.4,
              classes= layerNames(cropSubClasses)[ 1:4]) +
    coord_equal() +
    facet_grid( variable ~ .)
  my.ggsave( texWd, "fig_cropSubClassesMap.png",
            plot= cropSubClassesMap)
}

@ 
    \includegraphics{fig_cropSubClassesMap}
  \caption{Normalized fractions for crop sub-classes}
  \label{fig:cropSubClassesMap} 
\end{figure} 

\begin{figure}[ht] 
  \centering

<<fig_cropSubClassesMap2>>= 

if( overwriteFigures) {
  cropSubClassesMap2 <-
    coverMaps( cropSubClasses, 0.4,
              classes= layerNames(cropSubClasses)[ 5:7]) +
    coord_equal() +
    facet_grid( variable ~ .)
  my.ggsave( texWd, "fig_cropSubClassesMap2.png",
            plot= cropSubClassesMap2)
}

@ 
    \includegraphics{fig_cropSubClassesMap2}
  \caption{Normalized fractions for crop sub-classes (cont.)}
  \label{fig:cropSubClassesMap2} 
\end{figure} 


We could assume that forage crops come from open class but we don't
know enough about the confusion between Agland2000 pasture and the
open class in the first place, much less enough to make an informed
speculation about how forage crops would be classified by MLCT.  The
focus here is field crops so that is the only class that we are
attempting to disaggregate and forage crops are included there for
now.  Tree and shrub crops could be taken from the corresponding cover
types, but assuming that they are caught up in that classification is
a blind leap and their areas are small.  On the other hand, their
economic impact may be disproportionate to their areas by virtue of
price, but this will have to be studied more carefully.

Double-cropping is ignored for now by normalizing the crop fractions
by the sum of all crops, which can exceed unity in instances of
intense double-cropping.  The predominant double-cropping system in
the cUSA to our knowledge is soy followed by winter wheat, but there
may be others such as multiple cropping of rice in the southern
extremes of its range.  In areas where soy and wheat are
double-cropped their areas will be underestimated in this data set
relative to that given in the 175Crops2000 data set, subsequent to the
NLCD offset adjustment.  This issue also bears further study.


@ 
<<restack_crops>>=
setwd( rasterWd)

                                        # crop fraction not covered by any crop
                                        # sub-class: the PEEL crop value where
                                        # cropSum is NA, zero elsewhere
mlctCrop <-
  overlay( getPeelBand( aglandComplete, "crop"),
	  cropSum,
        fun= function( agc, ag) {
          ifelse( is.na( ag), agc, 0)
        })

                                        # residual of the crop balance: PEEL
                                        # crop minus the sub-classes minus
                                        # mlctCrop (an earlier, immediately
                                        # overwritten version of this value
                                        # has been dropped)
check <- 
  getPeelBand( aglandComplete, "crop") -
  sum( cropSubClasses) -
  mlctCrop


peelData <- stack( aglandComplete, cropSubClasses, mlctCrop, check)
                                        # name the last two layers by position
                                        # from the end; fixed indices go stale
                                        # when the number of crop sub-classes
                                        # changes and would mislabel `check`
layerNames( peelData)[ nlayers( peelData) -1:0] <- c("other_crop", "check")

peelDf <- data.frame(peelData[])

noisyCells <- rownames( with( peelDf,
    peelDf[ !is.na( check) & check >= 0.001,]))
# 325 cells with noise above this threshold

                                        # fold the residual into mlctCrop so the
                                        # crop layers add up to the PEEL crop
                                        # fraction
mlctCrop <- mlctCrop + check

peelData <- stack( aglandComplete, cropSubClasses, mlctCrop)
                                        # the last layer is mlctCrop
layerNames( peelData)[ nlayers( peelData)] <- c("mlct_crop")

## peelData <- writeRaster( peelData,
##                         filename= sprintf( "%s/peel.tif", rasterWd),
##                         overwrite= TRUE)

peelData <- brick( peelData,
                  filename= sprintf( "%s/peel.tif", rasterWd),
                  overwrite= TRUE)
                           
layerNames( peelData) <-
    c( unlist( llply( c( aglandComplete,
                        cropSubClasses),
                     layerNames)),
      "mlct_crop")

                                        # raster's native method for writing out
                                        # a .nc file doesn't create variables
                                        # from named layers, so we have to roll
                                        # our own.
                           
## peelCdf <- brick( peelData, filename= "peel.nc", overwrite= TRUE,
##                  varname= layerNames( peelData),
##                  varunit= "" )

if( require( ncdf4)) {                             
  cdfVars <- function( r) {
    halfCell <- res( r) /2
    cdfDimVars <- {
      x <- ncdim_def( "longitude", "degrees_east",
                     vals= seq(
                       from= xmin( r) +halfCell[ 1],
                       to=   xmax( r) -halfCell[ 1],
                       by= res(r)[ 1]))
      y <- ncdim_def( "latitude", "degrees_north",
                     vals= seq(
                       from= ymax( r) -halfCell[ 2],
                       to=   ymin( r) +halfCell[ 2],
                       by= -res(r)[ 2]))
      list( x, y)
    }
    cdfVarFun <- function( layerName) {
      ncvar_def( layerName, units= "", cdfDimVars, missval= -1)
    }
    sapply( layerNames( r), cdfVarFun, simplify= FALSE)
  }
  peelCdf <- nc_create( "peel0.nc",
                       c( list( crs= ncvar_def( "crs", "",
                                  list(), prec= "integer")),
                         cdfVars( peelData)))
  ncatt_put( peelCdf, 0, "Conventions", "CF-1.6")
  ncatt_put( peelCdf, 0, "title",
            "PEEL0 LULC dataset")
  ncatt_put( peelCdf, 0, "institution",
            "Computation Institute, University of Chicago")
  ncatt_put( peelCdf, 0, "source",
            "Best, N. (2011), \"Synthesis of a complete land use/land cover data set for the conterminous United States emphasizing accuracy in area and distribution of agricultural activity\", Master’s thesis, Northeastern Illinois University.")
  ncatt_put( peelCdf, 0, "references", 
            "Best et al. (forthcoming), \"Synthesis of a complete land use/land cover dataset for the conterminous United States\", RDCEP Working Paper Series and elsewhere")
  ncatt_put( peelCdf, 0, "history", date())
  ## ncatt_put( peelCdf, 0, "spatial_ref", showWKT( projection( peelData)))

  ncatt_put( peelCdf, "crs", "grid_mapping_name", "latitude_longitude")
  ncatt_put( peelCdf, "crs", "longitude_of_prime_meridian", 0.0)
  ncatt_put( peelCdf, "crs", "semi_major_axis", 6378137.0)
  ncatt_put( peelCdf, "crs", "inverse_flattening", 298.257223563)

  for( layerName in layerNames( peelData)) {
    ncatt_put( peelCdf, layerName, "grid_mapping", "crs")
    ## ncatt_put( peelCdf, layerName, "spatial_ref",
    ##           showWKT( projection( peelData)))
    ncvar_put( peelCdf, layerName, subset( peelData, layerName, drop= TRUE)[],
              start= c( 1, 1),
              count= c( -1, -1))
  }

  nc_close( peelCdf)

  
## peelDf <- data.frame( peelData[ !is.na( grid[])])
## rownames( peelDf) <- grid[][ !is.na(grid[])]

peelDf <- data.frame( cell= grid[], peelData[])[ !is.na( grid[]), ]
peelDf$cell <- as.character( peelDf$cell)
                           
                           
write.csv( format.df( peelDf,
                     dec=3,
                     numeric.dollar=FALSE,
                     na.blank= TRUE),
          row.names= FALSE,
          file= "peel.csv",
          quote= FALSE)

## copy to data archive
## file.copy( "peel.csv", "~/see/data/cimdb/peel_thesis.csv",
##           overwrite=TRUE)


## peelData <- brick( aglandComplete, cropSubClasses)

@ %def 

@ 
<<cleanup, eval=TRUE>>=
options( prompt= "> ", continue= "+ ", width= 80)
@ %def 

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: "thesis"
%%% End: