diff --git a/NAMESPACE b/NAMESPACE
index ec61b52..b3a00fc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(agprodchange_interp)
 export(agprodchange_ref)
+export(clean_sage)
 export(clean_yield)
 export(climate_impact)
 export(colname_replace)
@@ -35,6 +36,7 @@ export(plot_yield_impact)
 export(prep_regression)
 export(regression_fixed_effects)
 export(smooth_impacts)
+export(verify_crop)
 export(weather_agg)
 export(weather_clean)
 export(weighted_climate)
diff --git a/R/crop_calendars.R b/R/crop_calendars.R
index c4aaf0d..3c9af55 100644
--- a/R/crop_calendars.R
+++ b/R/crop_calendars.R
@@ -3,45 +3,173 @@
 #' Generate planting months for each country
 #' Data from SAGE
 #'
+#' @param crop_calendar_file Default = NULL. String for the path of the crop calendar file. If crop_cal is provided, crop_select will be set to crops in crop calendar. User provided crop_calendar_file can include any crops MIRCA2000 crops: "wheat", "maize", "rice", "barley", "rye", "millet", "sorghum", "soybean", "sunflower", "root_tuber", "cassava", "sugarcane", "sugarbeet", "oil_palm", "rape_seed", "groundnuts", "pulses", "citrus", "date_palm", "grapes", "cotton", "cocoa", "coffee", "others_perennial", "fodder_grasses", "others_annual"
+#' @param crop_select Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"
 #' @param output_dir Default = file.path(getwd(), 'output'). String for output directory
 #' @returns A data frame of crop calendars
 #' @export
 
-crop_calendars <- function(output_dir = file.path(getwd(), "output")) {
+crop_calendars <- function(crop_calendar_file = NULL,
+                           crop_select = NULL,
+                           output_dir = file.path(getwd(), "output")) {
   iso <- crop <- NULL
 
   message("Starting Step: crop_calendars")
 
-  # load SAGE data
-  d <- sage
-
-  # format, subset data and replace location name
-  d$Location <- ifelse(d$Location == "Georgia" & d$Source == "USDA UPHD", "Georgia_USA", d$Location)
-  d <- subset(d, select = c("Data.ID", "Location", "Crop", "Qualifier", "Plant.start", "Plant.end", "Harvest.start", "Harvest.end"))
-  d <- gaia::colname_replace(d, "Location", "country_name")
-  d <- gaia::colname_replace(d, "Crop", "crop")
-  d$crop <- tolower(d$crop)
-
-  # Average plant and harvest month
-  d$plant <- ifelse(d$Plant.start < d$Plant.end, ((d$Plant.start + d$Plant.end) / 2 / 30), ((d$Plant.start + d$Plant.end + 365) / 2 / 30))
-  d$plant <- ceiling(ifelse(d$plant > 12, d$plant - 12, d$plant))
-  d$plant <- ifelse(d$crop == "sugarcane", 1, d$plant) # Sugarcane is a multiyear crop, will assume that 12 months preceding annual value equals growing season.
-  d$harvest <- ifelse(d$Harvest.start < d$Harvest.end, ((d$Harvest.start + d$Harvest.end) / 2) / 30, ((d$Harvest.start + d$Harvest.end + 365) / 2) / 30)
-  d$harvest <- ceiling(ifelse(d$harvest > 12, d$harvest - 12, d$harvest))
-  d$harvest <- ifelse(d$crop == "sugarcane", 12, d$harvest) # Sugarcane is a multiyear crop, will assume that 12 months preceding annual value equals growing season.
-
-  # Country codes
-  d <- merge(d, mapping_gcam_iso, by = "country_name", all.x = TRUE)
-  d <- gaia::iso_replace(d)
-
-  # Indicators for crops
-  crops <- mapping_mirca_sage
-
-  for (i in 1:nrow(crops)) {
-    d <- d %>%
-      dplyr::mutate(!!paste0(crops$crop_mirca[i]) := ifelse(!is.na(iso) & (crop == crops$crop_sage[i]), 1, 0))
+  # verify the crops provided or selected are within the MIRCA crops
+  crops <- gaia::verify_crop(
+    crop_calendar_file = crop_calendar_file,
+    crop_select = crop_select)
+
+  # use user data if provided
+  if(!is.null(crop_calendar_file)){
+
+    d <- data.table::fread(
+      file = crop_calendar_file,
+      skip = 0,
+      stringsAsFactors = FALSE,
+      header = TRUE
+    )
+
+  } else {
+
+    # load SAGE data
+    d <- sage
+
+    # format, subset data and replace location name
+    d$Location <- ifelse(d$Location == "Georgia" & d$Source == "USDA UPHD", "Georgia_USA", d$Location)
+    d <- subset(d, select = c("Data.ID", "Location", "Crop", "Qualifier", "Plant.start", "Plant.end", "Harvest.start", "Harvest.end"))
+    d <- gaia::colname_replace(d, "Location", "country_name")
+    d <- gaia::colname_replace(d, "Crop", "crop")
+    d$crop <- tolower(d$crop)
+
+    # Average plant and harvest month
+    d$plant <- ifelse(d$Plant.start < d$Plant.end, ((d$Plant.start + d$Plant.end) / 2 / 30), ((d$Plant.start + d$Plant.end + 365) / 2 / 30))
+    d$plant <- ceiling(ifelse(d$plant > 12, d$plant - 12, d$plant))
+    d$plant <- ifelse(d$crop == "sugarcane", 1, d$plant) # Sugarcane is a multiyear crop, will assume that 12 months preceding annual value equals growing season.
+    d$harvest <- ifelse(d$Harvest.start < d$Harvest.end, ((d$Harvest.start + d$Harvest.end) / 2) / 30, ((d$Harvest.start + d$Harvest.end + 365) / 2) / 30)
+    d$harvest <- ceiling(ifelse(d$harvest > 12, d$harvest - 12, d$harvest))
+    d$harvest <- ifelse(d$crop == "sugarcane", 12, d$harvest) # Sugarcane is a multiyear crop, will assume that 12 months preceding annual value equals growing season.
+
+    # Country codes
+    d <- merge(d, mapping_gcam_iso, by = "country_name", all.x = TRUE)
+    d <- gaia::iso_replace(d)
+
+    for (i in 1:nrow(crops)) {
+      d <- d %>%
+        dplyr::mutate(!!paste0(crops$crop_mirca[i]) := ifelse(!is.na(iso) & (crop == crops$crop_sage[i]), 1, 0))
+    }
+
+    # clean up crop calendar
+    d <- gaia::clean_sage(d = d)
+
+  }
+
+  # filter the crops selected
+  d.crop.cal <- subset(d, select = c("iso", crops$crop_mirca, "plant", "harvest"))
+  d.crop.cal <- subset(d.crop.cal, !is.na(iso))
+  d <- NULL
+
+  # save crop calendar data
+  save_dir <- file.path(output_dir, "data_processed")
+  if (!dir.exists(save_dir)) {
+    dir.create(save_dir, recursive = TRUE)
+  }
+  utils::write.csv(d.crop.cal, file = file.path(save_dir, "crop_calendar.csv"))
+
+  return(d.crop.cal)
+}
+
+
+# ------------------------------------------------------------------------------
+#' verify_crop
+#'
+#' Identify crops selected by users are available crop types from both SAGE and MIRCA
+#'
+#' @param crop_calendar_file Default = NULL. String for the path of the crop calendar file. If crop_cal is provided, crop_select will be set to crops in crop calendar. User provided crop_calendar_file can include any crops MIRCA2000 crops: "wheat", "maize", "rice", "barley", "rye", "millet", "sorghum", "soybean", "sunflower", "root_tuber", "cassava", "sugarcane", "sugarbeet", "oil_palm", "rape_seed", "groundnuts", "pulses", "citrus", "date_palm", "grapes", "cotton", "cocoa", "coffee", "others_perennial", "fodder_grasses", "other_annual"
+#' @param crop_select Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"
+#' @returns A data frame of crop name mapping between MIRCA and SAGE
+#' @keywords internal
+#' @export
+
+verify_crop <- function(crop_calendar_file = NULL,
+                        crop_select = c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat")){
+
+
+  # Step 1: Finalize crop_select
+  # read the user provided crop calendar
+  if(!is.null(crop_calendar_file)){
+
+    df_crop_cal <- data.table::fread(
+      file = crop_calendar_file,
+      skip = 0,
+      stringsAsFactors = FALSE,
+      header = TRUE
+    )
+
+    # check if user provided crop_select
+    if(!is.null(crop_select)){
+
+      warning('Replace crop_select with crops from user provided crop_calendar_file')
+
+      # Use crops from the crop_calendar_file
+      crop_select <- names(crop_cal)[!names(crop_cal) %in% c('iso', 'plant', 'harvest')]
+
+    }
+  } else {
+
+    # if neither crop_calendar_file nor crop_select is provided, use default crop selections
+    if(is.null(crop_select)){
+      crop_select <- mapping_mirca_sage$crop_mirca
+    } else {
+      crop_select <- unique(c(crop_select, mapping_mirca_sage$crop_mirca))
+
+    }
+
   }
 
+  # Step 2: Check crop_select in MIRCA
+
+  # if crop_calendar_file is provided, we only need to check whether those crops exist in MIRCA,
+  # if crop_select is provided based on the given list, then the selection will be within both MIRCA and SAGE datasets
+  crop_select_mirca <- crop_mirca$crop_name[crop_mirca$crop_name %in% crop_select]
+
+  if(length(crop_select_mirca) < length(crop_select)){
+
+    crop_not_mirca <- crop_select[is.na(match(crop_select, crop_mirca$crop_name))]
+
+    warning(paste0(paste(crop_not_mirca, collapse = ', '), ' are not MIRCA crops. Remove from crop calendar calculation.'))
+
+  }
+
+  # Step 3: make the mapping between MIRCA and SAGE crops
+  crop_select_sage <- crop_mirca$crop_sage[match(crop_select_mirca, crop_mirca$crop_name)]
+
+  # create mapping
+  mapping_crop <- tibble::tibble(
+    crop_mirca = crop_select_mirca,
+    crop_sage = crop_select_sage
+  )
+
+  # print the selected crops
+  print(paste0('Final selected crops are: ', paste(crop_select_mirca, collapse = ', ')))
+
+  return(mapping_crop)
+}
+
+
+# ------------------------------------------------------------------------------
+#' clean_sage
+#'
+#' Clean up the sage crop calendar by reassigning the iso name and
+#' make sure to keep one set of the plant and harvest for each country and crop
+#'
+#' @param d Default = NULL. data frame for crop calendar
+#' @returns A data frame of clean crop calendar
+#' @keywords internal
+#' @export
+
+clean_sage <- function(d = NULL){
 
   # ----------------------------------------------------------------------------
   # Deal with exceptions (countries by states or regions)
@@ -271,9 +399,11 @@ crop_calendars <- function(output_dir = file.path(getwd(), "output")) {
 
   # Somalia (two seasons, use the one with no qualifier code)
   d$cotton <- ifelse(d$country_name == "Somalia" & d$crop == "cotton" & d$Qualifier == 2, 0, d$cotton)
+  d$millet <- ifelse(d$country_name == "Somalia" & d$crop == "millet" & d$Qualifier == 2, 0, d$millet) #MZ
 
   # Kenya (two obs, nearly the same)
   d$cotton <- ifelse(d$country_name == "Kenya" & d$crop == "cotton" & d$Data.ID == 1711, 0, d$cotton)
+  d$millet <- ifelse(d$country_name == "Kenya" & d$crop == "millet" & d$Qualifier == 2, 0, d$millet) #MZ
 
   # SORGHUM CORRECTIONS
   # Dominican Republic (iso ID -- dom) listed thrice for sorghum
@@ -311,48 +441,68 @@ crop_calendars <- function(output_dir = file.path(getwd(), "output")) {
   d$iso <- ifelse((d$country_name == "North Korea (central bowl)" & d$crop == "soybeans"), "prk", d$iso)
   d$soybean <- ifelse(d$country_name == "North Korea (central bowl)" & d$crop == "soybeans", 1, d$soybean)
 
-  # # PULSES CORRECTIONS
-  # # Duplicate listing for Burundi (iso ID -- bdi)
-  # # Seasonal production of pulses in Burundi: http://www.fao.org/docrep/004/w5956e/w5956e00.htm
-  # d$iso <- ifelse( ( d$country_name == "Burundi" & d$crop == "pulses" & d$Data.ID == 83 ), "bdi", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Burundi" & d$crop == "pulses" & d$Data.ID == 83, 1, d$pulses )
-  # # Duplicate listing for Ethiopia (iso ID -- eth)
-  # # Ethiopia pulse production by month: http://www.fao.org/3/a-at305e.pdf
-  # d$iso <- ifelse( ( d$country_name == "Ethiopia" & d$crop == "pulses" & d$Data.ID == 1695 ), "eth", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Ethiopia" & d$crop == "pulses" & d$Data.ID == 1695, 1, d$pulses )
-  # # Duplicate listing for Haiti (iso ID -- hti)
-  # # Haiti seasonality for pulse production: https://www.worldpulse.com/fr/node/9391
-  # d$iso <- ifelse( ( d$country_name == "Haiti" & d$crop == "pulses" & d$Data.ID == 275 ), "hti", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Haiti" & d$crop == "pulses" & d$Data.ID == 275, 1, d$pulses )
-  # # Duplicate listing for Kenya (iso ID -- ken)
-  # # Case study of Kenya pulse production: https://www.investinkenya.co.ke/main/view_article/330
-  # d$iso <- ifelse( ( d$country_name == "Kenya" & d$crop == "pulses" & d$Data.ID == 341 ), "ken", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Kenya" & d$crop == "pulses" & d$Data.ID == 341, 1, d$pulses )
-  # # Duplicate listing for Nicaragua (iso ID -- nic)
-  # # Pulse production by month in Nicaragua: http://www.fas.usda.gov/regions/nicaragua
-  # d$iso <- ifelse( ( d$country_name == "Nicaragua" & d$crop == "pulses" & d$Data.ID == 446 ), "nic", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Nicaragua" & d$crop == "pulses" & d$Data.ID == 446, 1, d$pulses )
-  # # Duplicate listing for Rwanda (iso ID -- rwa)
-  # # Southern hemisphere; Rwanda pulse seasonality: http://allafrica.com/stories/201410090778.html
-  # d$iso <- ifelse( ( d$country_name == "Rwanda" & d$crop == "pulses" & d$Data.ID == 504 ), "rwa", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Rwanda" & d$crop == "pulses" & d$Data.ID == 504, 1, d$pulses )
-  # # Duplicate listing for El Salvador (iso ID -- slv)
-  # # Pulse study in Central America: https://www.goift.com/market/americas/central-america/el-salvador-central-america/page/7/
-  # d$iso <- ifelse( ( d$country_name == "El Salvador" & d$crop == "pulses" & d$Data.ID == 211 ), "slv", d$iso )
-  # d$pulses <- ifelse( d$country_name == "El Salvador" & d$crop == "pulses" & d$Data.ID == 211, 1, d$pulses )
-  # # Duplicate listing for Somalia (iso ID -- som)
-  # # Seasonality of pulse production in Somalia: https://conservancy.umn.edu/bitstream/handle/11299/163161/JohnsonBrittney.pdf?sequence=1&isAllowed=y
-  # d$iso <- ifelse( ( d$country_name == "Somalia" & d$crop == "pulses" & d$Data.ID == 1747 ), "som", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Somalia" & d$crop == "pulses" & d$Data.ID == 1747, 1, d$pulses )
-  # # California has the largest pulse production of all states
-  # d$iso <- ifelse( ( d$country_name == "California" & d$crop == "pulses" & d$Data.ID == 1007 ), "usa", d$iso )
-  # d$pulses <- ifelse( d$country_name == "California" & d$crop == "pulses" & d$Data.ID == 1007, 1, d$pulses )
-  # # Maharashtra, of all Indian states listed (Gujarat, Himachal Pradesh, Kerala, Orissa, Rajasthan, Tamil Nadu), has 34% share of pulse production
-  # d$iso <- ifelse( ( d$country_name == "Maharashtra" & d$crop == "pulses" & d$Data.ID == 1424 ), "ind", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Maharashtra" & d$crop == "pulses" & d$Data.ID == 1424, 1, d$pulses )
-  # # Uganda (South) should be associate with iso ID == uga and Data.ID == 611
-  # d$iso <- ifelse( ( d$country_name == "Uganda (South)" & d$crop == "pulses" & d$Data.ID == 611 ), "uga", d$iso )
-  # d$pulses <- ifelse( d$country_name == "Uganda (South)" & d$crop == "pulses" & d$Data.ID == 611, 1, d$pulses )
+  # Barley Corrections (MZ)
+  d$barley <- ifelse(d$country_name == "Kenya" & d$crop == "barley" &  d$Data.ID == 340, 0, d$barley)
+  d$barley <- ifelse(d$country_name == "United Kingdom" & d$crop == "barley" &  d$Data.ID == 787, 0, d$barley)
+  d$barley <- ifelse(d$country_name == "Germany" & d$crop == "barley" &  d$Data.ID == 713, 0, d$barley)
+
+  # Groundnuts Corrections (MZ)
+  d$groundnuts <- ifelse(d$country_name == "Nigeria" & d$crop == "groundnuts" &  d$Data.ID == 1730, 0, d$groundnuts)
+  d$groundnuts <- ifelse(d$country_name == "Somalia" & d$crop == "groundnuts" &  d$Data.ID == 1743, 0, d$groundnuts)
+
+  # Rapeseed Corrections (MZ)
+  d$rape_seed <- ifelse(d$country_name == "Ukraine" & d$crop == "rapeseed" &  d$Data.ID == 779, 0, d$rape_seed)
+
+  # PULSES CORRECTIONS
+  # Duplicate listing for Burundi (iso ID -- bdi)
+  # Seasonal production of pulses in Burundi: http://www.fao.org/docrep/004/w5956e/w5956e00.htm
+  d$iso <- ifelse( ( d$country_name == "Burundi" & d$crop == "pulses" & d$Data.ID == 83 ), "bdi", d$iso )
+  d$pulses <- ifelse( d$country_name == "Burundi" & d$crop == "pulses" & d$Data.ID == 83, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Burundi" & d$crop == "pulses" & d$Data.ID == 84, 0, d$pulses ) # MZ
+  # Duplicate listing for Ethiopia (iso ID -- eth)
+  # Ethiopia pulse production by month: http://www.fao.org/3/a-at305e.pdf
+  d$iso <- ifelse( ( d$country_name == "Ethiopia" & d$crop == "pulses" & d$Data.ID == 1695 ), "eth", d$iso )
+  d$pulses <- ifelse( d$country_name == "Ethiopia" & d$crop == "pulses" & d$Data.ID == 1695, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Ethiopia" & d$crop == "pulses" & d$Data.ID %in% c(1696, 1697), 0, d$pulses ) # MZ
+  # Duplicate listing for Haiti (iso ID -- hti)
+  # Haiti seasonality for pulse production: https://www.worldpulse.com/fr/node/9391
+  d$iso <- ifelse( ( d$country_name == "Haiti" & d$crop == "pulses" & d$Data.ID == 275 ), "hti", d$iso )
+  d$pulses <- ifelse( d$country_name == "Haiti" & d$crop == "pulses" & d$Data.ID == 275, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Haiti" & d$crop == "pulses" & d$Data.ID %in% c(276, 277), 0, d$pulses ) # MZ
+  # Duplicate listing for Kenya (iso ID -- ken)
+  # Case study of Kenya pulse production: https://www.investinkenya.co.ke/main/view_article/330
+  d$iso <- ifelse( ( d$country_name == "Kenya" & d$crop == "pulses" & d$Data.ID == 341 ), "ken", d$iso )
+  d$pulses <- ifelse( d$country_name == "Kenya" & d$crop == "pulses" & d$Data.ID == 341, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Kenya" & d$crop == "pulses" & d$Data.ID == 342, 0, d$pulses ) # MZ
+  # Duplicate listing for Nicaragua (iso ID -- nic)
+  # Pulse production by month in Nicaragua: http://www.fas.usda.gov/regions/nicaragua
+  d$iso <- ifelse( ( d$country_name == "Nicaragua" & d$crop == "pulses" & d$Data.ID == 446 ), "nic", d$iso )
+  d$pulses <- ifelse( d$country_name == "Nicaragua" & d$crop == "pulses" & d$Data.ID == 446, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Nicaragua" & d$crop == "pulses" & d$Data.ID %in% c(447, 448), 0, d$pulses ) # MZ
+  # Duplicate listing for Rwanda (iso ID -- rwa)
+  # Southern hemisphere; Rwanda pulse seasonality: http://allafrica.com/stories/201410090778.html
+  d$iso <- ifelse( ( d$country_name == "Rwanda" & d$crop == "pulses" & d$Data.ID == 504 ), "rwa", d$iso )
+  d$pulses <- ifelse( d$country_name == "Rwanda" & d$crop == "pulses" & d$Data.ID == 504, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Rwanda" & d$crop == "pulses" & d$Data.ID == 505, 0, d$pulses ) # MZ
+  # Duplicate listing for El Salvador (iso ID -- slv)
+  # Pulse study in Central America: https://www.goift.com/market/americas/central-america/el-salvador-central-america/page/7/
+  d$iso <- ifelse( ( d$country_name == "El Salvador" & d$crop == "pulses" & d$Data.ID == 211 ), "slv", d$iso )
+  d$pulses <- ifelse( d$country_name == "El Salvador" & d$crop == "pulses" & d$Data.ID == 211, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "El Salvador" & d$crop == "pulses" & d$Data.ID == 210, 0, d$pulses ) # MZ
+  # Duplicate listing for Somalia (iso ID -- som)
+  # Seasonality of pulse production in Somalia: https://conservancy.umn.edu/bitstream/handle/11299/163161/JohnsonBrittney.pdf?sequence=1&isAllowed=y
+  d$iso <- ifelse( ( d$country_name == "Somalia" & d$crop == "pulses" & d$Data.ID == 1747 ), "som", d$iso )
+  d$pulses <- ifelse( d$country_name == "Somalia" & d$crop == "pulses" & d$Data.ID == 1747, 1, d$pulses )
+  d$pulses <- ifelse( d$country_name == "Somalia" & d$crop == "pulses" & d$Data.ID == 1748, 0, d$pulses ) # MZ
+  # California has the largest pulse production of all states
+  d$iso <- ifelse( ( d$country_name == "California" & d$crop == "pulses" & d$Data.ID == 1007 ), "usa", d$iso )
+  d$pulses <- ifelse( d$country_name == "California" & d$crop == "pulses" & d$Data.ID == 1007, 1, d$pulses )
+  # Maharashtra, of all Indian states listed (Gujarat, Himachal Pradesh, Kerala, Orissa, Rajasthan, Tamil Nadu), has 34% share of pulse production
+  d$iso <- ifelse( ( d$country_name == "Maharashtra" & d$crop == "pulses" & d$Data.ID == 1424 ), "ind", d$iso )
+  d$pulses <- ifelse( d$country_name == "Maharashtra" & d$crop == "pulses" & d$Data.ID == 1424, 1, d$pulses )
+  # Uganda (South) should be associate with iso ID == uga and Data.ID == 611
+  d$iso <- ifelse( ( d$country_name == "Uganda (South)" & d$crop == "pulses" & d$Data.ID == 611 ), "uga", d$iso )
+  d$pulses <- ifelse( d$country_name == "Uganda (South)" & d$crop == "pulses" & d$Data.ID == 611, 1, d$pulses )
 
   # Remove irrigated and other subcrops
   d$maize <- ifelse(d$maize == 1 & d$Qualifier != "", 0, d$maize) # Remove irrigated and other subcrops
@@ -363,15 +513,6 @@ crop_calendars <- function(output_dir = file.path(getwd(), "output")) {
   # # only Kenya has both cassava and potato production. Potato production ~ 2x higher than cassava, so use cassava
   # d$root_tuber <- ifelse( d$iso == "ken" & d$crop == "cassava", 0, d$root_tuber )
 
-  d.crop.cal <- subset(d, select = c("iso", "wheat", "rice", "maize", "cassava", "soybean", "sugarcane", "sugarbeet", "cotton", "sorghum", "root_tuber", "sunflower", "plant", "harvest"))
-  d.crop.cal <- subset(d.crop.cal, !is.na(iso))
-  d <- NULL
+  return(d)
 
-  save_dir <- file.path(output_dir, "data_processed")
-  if (!dir.exists(save_dir)) {
-    dir.create(save_dir, recursive = TRUE)
-  }
-  utils::write.csv(d.crop.cal, file = file.path(save_dir, "crop_calendar.csv"))
-
-  return(d.crop.cal)
 }
diff --git a/R/data_aggregation.R b/R/data_aggregation.R
index 7bb5964..2f30166 100644
--- a/R/data_aggregation.R
+++ b/R/data_aggregation.R
@@ -29,6 +29,8 @@ data_aggregation <- function(climate_hist_dir = NULL,
     input_file = "crop_calendar.csv"
   )
 
+  crop_select <- names(crop_cal)[!names(crop_cal) %in% c('iso', 'plant', 'harvest')]
+
   # merge fao yield and fao irrigation equip data
   yield <- gaia::merge_data(fao_yield, fao_irr_equip, "iso", "year")
 
@@ -60,7 +62,7 @@ data_aggregation <- function(climate_hist_dir = NULL,
 
     crop_historic <- data.table::data.table()
 
-    for (crop_i in mapping_mirca_sage$crop_mirca) {
+    for (crop_i in crop_select) {
       crop_id <- crop_mirca$crop_id[grepl(crop_i, crop_mirca$crop_name)]
 
       # filter out the file for the crop
@@ -121,7 +123,7 @@ data_aggregation <- function(climate_hist_dir = NULL,
 
   crop_projection <- data.table::data.table()
 
-  for (crop_i in mapping_mirca_sage$crop_mirca) {
+  for (crop_i in crop_select) {
     crop_id <- crop_mirca$crop_id[grepl(crop_i, crop_mirca$crop_name)]
 
     # filter out the file for the crop
diff --git a/R/sysdata.rda b/R/sysdata.rda
index dc9047d..61e2f10 100644
Binary files a/R/sysdata.rda and b/R/sysdata.rda differ
diff --git a/R/yield_impact.R b/R/yield_impact.R
index 2f2fbbc..0b86b50 100644
--- a/R/yield_impact.R
+++ b/R/yield_impact.R
@@ -18,6 +18,8 @@
 #' @param gcam_version Default = 'gcam7'. String for the GCAM version. Only support gcam6 and gcam7
 #' @param gcam_timestep Default = 5. Integer for the time step of GCAM (Select either 1 or 5 years for GCAM use)
 #' @param gcamdata_dir Default = NULL. String for directory to the gcamdata folder within the specific GCAM version. The gcamdata need to be run with drake to have the CSV outputs beforehand.
+#' @param crop_calendar_file Default = NULL. String for the path of the crop calendar file. If crop_cal is provided, crop_select will be set to crops in crop calendar. User provided crop_calendar_file can include any crops MIRCA2000 crops: "wheat", "maize", "rice", "barley", "rye", "millet", "sorghum", "soybean", "sunflower", "root_tuber", "cassava", "sugarcane", "sugarbeet", "oil_palm", "rape_seed", "groundnuts", "pulses", "citrus", "date_palm", "grapes", "cotton", "cocoa", "coffee", "others_perennial", "fodder_grasses", "other_annual"
+#' @param crop_select Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"
 #' @param use_default_coeff Default = FALSE. Binary for using default regression coefficients. Set to TRUE will use the default coefficients instead of calculating coefficients from the historical climate data.
 #' @param base_year Default = 2015. Integer for the base year (for GCAM)
 #' @param start_year Default = NULL. Integer for the  start year of the projected data
@@ -47,6 +49,8 @@ yield_impact <- function(pr_hist_ncdf = NULL,
                          gcam_version = "gcam7",
                          gcam_timestep = 5,
                          gcamdata_dir = NULL,
+                         crop_calendar_file = NULL,
+                         crop_select = NULL,
                          use_default_coeff = FALSE,
                          base_year = 2015,
                          start_year = NULL,
@@ -86,7 +90,13 @@ yield_impact <- function(pr_hist_ncdf = NULL,
 
 
   # Step 2: Generate planting months for each country
-  gaia::crop_calendars(output_dir = output_dir)
+  crop_cal <- gaia::crop_calendars(
+    crop_calendar_file = crop_calendar_file,
+    crop_select = crop_select,
+    output_dir = output_dir)
+
+  # update finalized selected crops
+  crop_select <- names(crop_cal)[!names(crop_cal) %in% c('iso', 'plant', 'harvest')]
 
   # Step 3: Process multiple models to analyze historical weather variables and crop yields
   if (all(
@@ -117,6 +127,7 @@ yield_impact <- function(pr_hist_ncdf = NULL,
   # Step 4: Yield regressions and create figures
   if (!use_default_coeff) {
     gaia::yield_regression(
+      crop_select = crop_select,
       diagnostics = diagnostics,
       output_dir = output_dir
     )
@@ -133,6 +144,7 @@ yield_impact <- function(pr_hist_ncdf = NULL,
     end_year = end_year,
     gcam_timestep = gcam_timestep,
     smooth_window = smooth_window,
+    crop_select = crop_select,
     diagnostics = diagnostics,
     output_dir = output_dir
   )
diff --git a/R/yield_impacts_functions.R b/R/yield_impacts_functions.R
index e3310ba..ac1f58f 100644
--- a/R/yield_impacts_functions.R
+++ b/R/yield_impacts_functions.R
@@ -7,43 +7,89 @@
 #' Function to clean FAO yield data
 #'
 #' @param fao_yield Default = NULL. Data frame for the fao yield table
+#' @param fao_to_mirca Default = NULL. Data frame for the fao to mirca crop mapping
 #' @returns A data frame of formatted FAO yield data
 #' @keywords internal
 #' @export
 
-clean_yield <- function(fao_yield = NULL) {
+clean_yield <- function(fao_yield = NULL,
+                        fao_to_mirca = NULL) {
   AreaName <- crop <- NULL
 
-  d <- subset(fao_yield, select = c("AreaName", "ElementName", "ItemName", "Year", "Value"))
-  d <- subset(d, AreaName != "")
-  d <- subset(d, AreaName != "China, mainland")
-  d <- gaia::colname_replace(d, "AreaName", "country_name")
-  d <- gaia::colname_replace(d, "ElementName", "var")
-  d <- gaia::colname_replace(d, "ItemName", "crop")
-  d <- gaia::colname_replace(d, "Year", "year")
-  d <- gaia::colname_replace(d, "Value", "value")
-  d$crop <- tolower(d$crop)
-  d$var <- gsub("Area harvested", "area_harvest", d$var)
-  d$var <- gsub("Yield", "yield", d$var)
-  d <- subset(d, crop != "cottonseed") # Cotton seed has no data, FAO yields are for seed cotton
-  d$crop <- gsub("rice, paddy", "rice", d$crop)
-  d$crop <- gsub("soybeans", "soybean", d$crop)
-  d$crop <- gsub("sugar cane", "sugarcane", d$crop)
-  d$crop <- gsub("sugar beet", "sugarbeet", d$crop)
-  d$crop <- gsub("seed cotton", "cotton", d$crop)
-  d$crop <- gsub("sunflower seed", "sunflower", d$crop)
-  d$crop <- gsub(" ", "_", d$crop)
-  d <- data.table::dcast(d, country_name + crop + year ~ var, value.var = "value")
-  d <- merge(d, mapping_gcam_iso, by = "country_name", all.x = TRUE)
-  d <- gaia::iso_replace(d)
-  #   For these countries, root_tuber == cassava; others root_tuber == potato
-  d$crop <- ifelse(d$crop == "potatoes", "root_tuber", d$crop)
-  #   d$crop <- ifelse( ( ( d$iso == "ben" | d$iso ==  "cmr" | d$iso == "caf" | d$iso == "cog" | d$iso == "cod" | d$iso == "civ" | d$iso == "gab" | d$iso == "gha" |
-  #                           d$iso == "gin" | d$iso == "lbr" | d$iso == "nga" | d$iso == "sle" | d$iso == "som" | d$iso == "tgo" ) & d$crop == "cassava" ), "root_tuber", d$crop )
-  #   d$crop <- ifelse( ( ( d$iso == "ben" | d$iso ==  "cmr" | d$iso == "caf" | d$iso == "cog" | d$iso == "cod" | d$iso == "civ" | d$iso == "gab" | d$iso == "gha" |
-  #                           d$iso == "gin" | d$iso == "lbr" | d$iso == "nga" | d$iso == "sle" | d$iso == "som" | d$iso == "tgo" ) & d$crop == "potatoes" ), "potatoes", d$crop )
-  d <- subset(d, select = c("iso", "crop", "year", "area_harvest", "yield"))
-  return(d)
+  # clean up FAOSTAT yield and harvest area data and left join the iso code
+  df <- fao_yield %>%
+    dplyr::filter(`Item Code` %in% unique(fao_to_mirca$fao_crop_id),
+                  Element %in% c('Area harvested', 'Yield'),
+                  # Exclude entire China because FAOSTAT distinguishes China mainland, Taiwan, and Hongkong
+                  # GCAM also has Taiwan as individual region
+                  !Area %in% c('China'))  %>%
+    dplyr::select(country_code = `Area Code`, country_name = Area, var = Element,
+                  fao_crop_name = Item, fao_crop_id = `Item Code`,
+                  paste0('Y', 1961:2020)) %>%
+    dplyr::left_join(fao_iso %>% dplyr::select(country_code, iso),
+                     by = 'country_code') %>%
+    dplyr::filter(!is.na(iso))
+
+  # clean up the iso code which megers some FAOSTAT specific 'f' iso codes
+  df <- gaia::iso_replace(df)
+
+  # restructure data
+  df <- df %>%
+    tidyr::pivot_longer(cols = paste0('Y', 1961:2020),
+                        names_to = 'year', values_to = 'value') %>%
+    dplyr::mutate(year = as.integer(gsub('Y', '', year)),
+                  var = dplyr::case_when(var == 'Yield' ~ 'yield',
+                                         var == 'Area harvested' ~ 'area_harvest')) %>%
+    tidyr::pivot_wider(names_from = var, values_from = value)
+
+  # Join FAO to MIRCA2000 crops and aggregated by country, MIRCA2000 crop type
+  # for harvest areas, take sum of all FAO crops for each MIRCA2000 crop
+  # for yields, take mean of all FAO crops for each MIRCA2000 crop
+  df_mirca <- df %>%
+    dplyr::left_join(fao_to_mirca %>%
+                       tidyr::pivot_longer(cols = paste('crop', sprintf('%02d', 1:26), sep = ''),
+                                           names_to = 'crop_id', values_to = 'to_mirca') %>%
+                       dplyr::filter(to_mirca == 'X') %>%
+                       dplyr::left_join(crop_mirca %>% dplyr::select(crop_id, crop = crop_name)) %>%
+                       dplyr::select(fao_crop_id, crop) %>%
+                       dplyr::distinct(),
+                     by = 'fao_crop_id') %>%
+    dplyr::group_by(iso, crop, year) %>%
+    dplyr::summarise(area_harvest = sum(area_harvest, na.rm = T),
+                     yield = mean(yield, na.rm = T)) %>%
+    dplyr::filter(area_harvest > 0, yield > 0)
+  #
+  #
+  # d <- subset(fao_yield, select = c("Area", "Element", "Item", "Year", "Value"))
+  # d <- subset(d, Area != "")
+  # d <- subset(d, Area != "China, mainland")
+  # d <- gaia::colname_replace(d, "Area", "country_name")
+  # d <- gaia::colname_replace(d, "Element", "var")
+  # d <- gaia::colname_replace(d, "Item", "crop")
+  # d <- gaia::colname_replace(d, "Year", "year")
+  # d <- gaia::colname_replace(d, "Value", "value")
+  # d$crop <- tolower(d$crop)
+  # d$var <- gsub("Area harvested", "area_harvest", d$var)
+  # d$var <- gsub("Yield", "yield", d$var)
+  # d <- subset(d, crop != "cottonseed") # Cotton seed has no data, FAO yields are for seed cotton
+  # d$crop <- gsub("rice, paddy", "rice", d$crop)
+  # d$crop <- gsub("soybeans", "soybean", d$crop)
+  # d$crop <- gsub("sugar cane", "sugarcane", d$crop)
+  # d$crop <- gsub("sugar beet", "sugarbeet", d$crop)
+  # d$crop <- gsub("seed cotton, unginned", "cotton", d$crop)
+  # d$crop <- gsub("sunflower seed", "sunflower", d$crop)
+  # d$crop <- gsub(" ", "_", d$crop)
+  # d <- data.table::dcast(d, country_name + crop + year ~ var, value.var = "value")
+  # d <- merge(d, mapping_gcam_iso, by = "country_name", all.x = TRUE)
+  # d <- gaia::iso_replace(d)
+  # #   For these countries, root_tuber == cassava; others root_tuber == potato
+  # d$crop <- ifelse(d$crop == "potatoes", "root_tuber", d$crop)
+  # #   d$crop <- ifelse( ( ( d$iso == "ben" | d$iso ==  "cmr" | d$iso == "caf" | d$iso == "cog" | d$iso == "cod" | d$iso == "civ" | d$iso == "gab" | d$iso == "gha" |
+  # #                           d$iso == "gin" | d$iso == "lbr" | d$iso == "nga" | d$iso == "sle" | d$iso == "som" | d$iso == "tgo" ) & d$crop == "cassava" ), "root_tuber", d$crop )
+  # #   d$crop <- ifelse( ( ( d$iso == "ben" | d$iso ==  "cmr" | d$iso == "caf" | d$iso == "cog" | d$iso == "cod" | d$iso == "civ" | d$iso == "gab" | d$iso == "gha" |
+  # #                           d$iso == "gin" | d$iso == "lbr" | d$iso == "nga" | d$iso == "sle" | d$iso == "som" | d$iso == "tgo" ) & d$crop == "potatoes" ), "potatoes", d$crop )
+  # d <- subset(d, select = c("iso", "crop", "year", "area_harvest", "yield"))
+  return(df_mirca)
 }
 
 
@@ -241,7 +287,7 @@ data_merge <- function(data = NULL,
     data = d,
     save_path = file.path(output_dir, "data_processed"),
     file_name = paste0("historic_vars_", crop_name, ".csv"),
-    data_info = "Merged yield data"
+    data_info = "Merged historical yield data"
   )
 
   return(d)
@@ -671,6 +717,16 @@ climate_impact <- function(use_default_coeff = FALSE,
   # create base year vector
   baseYears <- c(paste("X", (start_year:end_year), sep = ""))
 
+  # check if the base year exists at all
+  missing_year <- setdiff(baseYears, colnames(d))
+  if(length(missing_year) > 0){
+
+    for(missing_year_i in missing_year){
+      d[[missing_year_i]] <- NA
+    }
+
+  }
+
   # if start year from the available data is 9 years or earlier before the base year
   # then historical years are determined as (base_year - 9) to base_year
   # else historical years are determined as start_year to base year
diff --git a/R/yield_regression.R b/R/yield_regression.R
index d313a5e..e5bd794 100644
--- a/R/yield_regression.R
+++ b/R/yield_regression.R
@@ -4,12 +4,14 @@
 #' using average growing season temperature and precipitation, max and min months
 #'
 #' @param formula Default = NULL. String for regression formula
+#' @param crop_select Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"
 #' @param diagnostics Default = TRUE. Logical for performing diagnostic plot
 #' @param output_dir Default = file.path(getwd(), 'output'). String for output directory
 #' @returns No return value, called for the side effects of processing and writing output files
 #' @export
 
 yield_regression <- function(formula = NULL,
+                             crop_select = NULL,
                              diagnostics = TRUE,
                              output_dir = file.path(getwd(), "output")) {
   # Decided not to use irrigation equipped land because data is so spotty
@@ -24,8 +26,12 @@ yield_regression <- function(formula = NULL,
   message(paste0("Starting step: yield_regression"))
   message(paste0("Using formula: ", formula))
 
+  if(is.null(crop_select)){
+    crop_select <- gaia::verify_crop(crop_select = crop_select)$crop_mirca
+  }
+
   # conduct regression for each crop
-  for (crop_i in mapping_mirca_sage$crop_mirca) {
+  for (crop_i in crop_select) {
     # read in the historical crop data and weather data
     d_crop <- gaia::input_data(
       folder_path = file.path(output_dir, "data_processed"),
diff --git a/R/yield_shock_projection.R b/R/yield_shock_projection.R
index 9250169..41cc741 100644
--- a/R/yield_shock_projection.R
+++ b/R/yield_shock_projection.R
@@ -11,6 +11,7 @@
 #' @param end_year Default = NULL. Integer for the end year of the data
 #' @param gcam_timestep Default = 5. Integer for the time step of GCAM (Select either 1 or 5 years for GCAM use)
 #' @param smooth_window Default = 20. Integer for smoothing window in years
+#' @param crop_select Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"
 #' @param diagnostics Default = TRUE. Logical for performing diagnostic plot
 #' @param output_dir Default = file.path(getwd(), 'output'). String for output directory
 #' @returns A data frame of formatted smoothed annual crop yield shocks under climate impacts
@@ -24,6 +25,7 @@ yield_shock_projection <- function(use_default_coeff = FALSE,
                                    end_year = NULL,
                                    gcam_timestep = 5,
                                    smooth_window = 20,
+                                   crop_select = NULL,
                                    diagnostics = TRUE,
                                    output_dir = file.path(getwd(), "output")) {
   message("Starting Step: yield_shock_projection")
@@ -40,8 +42,11 @@ yield_shock_projection <- function(use_default_coeff = FALSE,
     stringsAsFactors = FALSE
   )
 
+  if(is.null(crop_select)){
+    crop_select <- gaia::verify_crop(crop_select = crop_select)$crop_mirca
+  }
 
-  for (crop_i in mapping_mirca_sage$crop_mirca) {
+  for (crop_i in crop_select) {
     print(paste(climate_model, climate_scenario, crop_i, sep = " "))
 
     # calculate yield impact for each crop and country
diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R
index d85e1e8..47b7b84 100644
--- a/data-raw/DATASET.R
+++ b/data-raw/DATASET.R
@@ -293,17 +293,6 @@ sage <- sage %>%
   dplyr::select(-V1)
 
 
-#-------------------------------------------------------------------------------
-# FAO Data - Harvested Area
-#-------------------------------------------------------------------------------
-fao_yield <- gaia::input_data(
-  folder_path = file.path(model_data.dir, 'data_raw'),
-  input_file = 'FAO_yield_ha.csv',
-  skip_number = 0
-)
-
-fao_yield <- gaia::clean_yield(fao_yield)
-
 #-------------------------------------------------------------------------------
 # FAO Data - Irrigation Equip
 #-------------------------------------------------------------------------------
@@ -330,38 +319,84 @@ gdp <- gdp %>%
   dplyr::rename(gdp_pcap_ppp = gdp_pcap_ppp_thous) %>%
   dplyr::select(-V1)
 
+#-------------------------------------------------------------------------------
+# Mapping of FAO crops to MIRCA2000
+#-------------------------------------------------------------------------------
+
+# Source: supporting information from Portmann, F. T., Siebert, S., & Döll, P. (2010). MIRCA2000—Global monthly irrigated and rainfed crop areas around the year 2000: A new high‐resolution data set for agricultural and hydrological modeling. Global biogeochemical cycles, 24(1). https://doi.org/10.1029/2008GB003435
+# Manually added the FAO Item Code based on the FAOSTAT data https://www.fao.org/faostat/en/#data/QCL, accessed Feb 2025
+# Note that FAOSTAT has updated the crop names/groups since MIRCA2000, so there are few FAO crops listed in Portmann et al., 2010 are not available from the latest FAOSTAT.
+fao_to_mirca <- gaia::input_data(
+  folder_path = file.path(model_data.dir, 'data_ext'),
+  input_file = 'mirca2000_fao_crop_mapping_edit.csv',
+  skip_number = 0) %>%
+  dplyr::filter(!is.na(fao_crop_id))
+# colnames(fao_to_mirca) <- c('fao_crop', paste('crop', sprintf('%02d', 1:26), sep = ''))
+
+#-------------------------------------------------------------------------------
+# FAOSTAT Country ISO mapping
+#-------------------------------------------------------------------------------
+fao_iso <- gaia::input_data(
+  folder_path = file.path(model_data.dir, 'data_ext', 'FAO_Production_Crops_Livestock_E_All_Data'),
+  input_file = 'FAOSTAT_Area_Code.csv',
+  skip_number = 0) %>%
+  dplyr::select(country_code = `Country Code`,
+                country_name = Country,
+                iso = `ISO3 Code`) %>%
+  dplyr::mutate(iso = tolower(iso)) %>%
+  dplyr::distinct()
+
+
+#-------------------------------------------------------------------------------
+# FAO Data - Harvested Area
+#-------------------------------------------------------------------------------
+# fao_yield <- gaia::input_data(
+#   folder_path = file.path(model_data.dir, 'data_raw'),
+#   input_file = 'FAO_yield_ha.csv',
+#   skip_number = 0
+# )
+
+# Latest full FAO production data from https://www.fao.org/faostat/en/#data/QCL, accessed Feb 2025
+fao_yield <- gaia::input_data(
+  folder_path = file.path(model_data.dir, 'data_ext', 'FAO_Production_Crops_Livestock_E_All_Data'),
+  input_file = 'Production_Crops_Livestock_E_All_Data_NOFLAG.csv',
+  skip_number = 0
+)
+
+fao_yield <- gaia::clean_yield(fao_yield = fao_yield,
+                               fao_to_mirca = fao_to_mirca)
 
 #-------------------------------------------------------------------------------
 # MIRCA Crop Mapping
 #-------------------------------------------------------------------------------
 crop_mirca <- tibble::tribble(
-  ~ crop_id, ~ crop, ~ crop_name,
-  "crop01", "wheat", "wheat",
-  "crop02", "maize", "maize",
-  "crop03", "rice", "rice",
-  "crop04", "barley", NA,
-  "crop05", "rye", NA,
-  "crop06", "millet", NA,
-  "crop07", "sorghum", "sorghum",
-  "crop08", "soybean", "soybean",
-  "crop09", "sunflower", "sunflower",
-  "crop10", "potatoes", "root_tuber",
-  "crop11", "cassava", "cassava",
-  "crop12", "sugarcane", "sugarcane",
-  "crop13", "sugarbeet", "sugarbeet",
-  "crop14", "oil palm", NA,
-  "crop15", "rape seed,canola", NA,
-  "crop16", "groundnuts,peanuts", NA,
-  "crop17", "pulses", NA,
-  "crop18", "citrus", NA,
-  "crop19", "date palm", NA,
-  "crop20", "grapes,vine", NA,
-  "crop21", "cotton", "cotton",
-  "crop22", "cocoa", NA,
-  "crop23", "coffee", NA,
-  "crop24", "others perennial", NA,
-  "crop25", "fodder grasses", NA,
-  "crop26", "other annual", NA
+  ~ crop_id, ~ crop,              ~ crop_name,        ~ crop_sage,
+  "crop01", "wheat",              "wheat",            "wheat",
+  "crop02", "maize",              "maize",            "maize",
+  "crop03", "rice",               "rice",             "rice",
+  "crop04", "barley",             "barley",           "barley",
+  "crop05", "rye",                "rye",              "rye",
+  "crop06", "millet",             "millet",           "millet",
+  "crop07", "sorghum",            "sorghum",          "sorghum",
+  "crop08", "soybean",            "soybean",          "soybeans",
+  "crop09", "sunflower",          "sunflower",        "sunflower",
+  "crop10", "potatoes",           "root_tuber",       "potatoes",
+  "crop11", "cassava",            "cassava",          "cassava",
+  "crop12", "sugarcane",          "sugarcane",        "sugarcane",
+  "crop13", "sugarbeet",          "sugarbeet",        "sugarbeets",
+  "crop14", "oil palm",           "oil_palm",         NA,
+  "crop15", "rape seed,canola",   "rape_seed",        "rapeseed",
+  "crop16", "groundnuts,peanuts", "groundnuts",       "groundnuts",
+  "crop17", "pulses",             "pulses",           "pulses",
+  "crop18", "citrus",             "citrus",           NA,
+  "crop19", "date palm",          "date_palm",        NA,
+  "crop20", "grapes,vine",        "grapes",           NA,
+  "crop21", "cotton",             "cotton",           "cotton",
+  "crop22", "cocoa",              "cocoa",            NA,
+  "crop23", "coffee",             "coffee",           NA,
+  "crop24", "others perennial",   "others_perennial", NA,
+  "crop25", "fodder grasses",     "fodder_grasses",   NA,
+  "crop26", "others annual",      "others_annual",    NA
 )
 
 
@@ -374,24 +409,41 @@ gcam_commod <- tibble::tribble(
   "biomass", "maize", "Grass",
   "biomass", "sorghum", "Grass",
   "biomass", "sugarcane", "Grass",
+  "biomass", "fodder_grasses", "Grass",
   "Corn", "maize", "C4",
   "FiberCrop", "cotton", "",
   "FodderHerb", "maize", "C4",
   "FodderHerb", "sorghum", "C4",
   "Fruits", "rice", "",
   "Fruits", "wheat", "",
+  "Fruits", "citrus", "Tree",
+  "Fruits", "date_palm", "Tree",
+  "Fruits", "grapes", "",
   "Legumes", "maize", "",
   "Legumes", "wheat", "",
+  "Legumes", "pulses", "",
   "MiscCrop", "rice", "",
   "MiscCrop", "wheat", "",
+  "MiscCrop", "coffee", "Tree",
+  "MiscCrop", "cocoa", "Tree",
+  "MiscCrop", "others_perennial", "",
+  "MiscCrop", "others_annual", "",
   "NutsSeeds", "rice", "",
   "NutsSeeds", "wheat", "",
+  "NutsSeeds", "groundnuts", "",
   "OilCrop", "soybean", "",
+  "OilCrop", "sunflower", "",
+  "OilCrop", "rape_seed", "",
+  "OilCrop", "groundnuts", "",
   "OilPalm", "rice", "",
-  "OilPalm", 'wheat', "",
+  "OilPalm", "wheat", "",
+  "OilPalm", "oil_palm", "Tree",
   "OtherGrain", "rice", "",
   "OtherGrain", "sorghum", "C4",
   "OtherGrain", "wheat", "",
+  "OtherGrain", "barley", "",
+  "OtherGrain", "rye", "",
+  "OtherGrain", "millet", "C4",
   "Rice", "rice", "",
   "RootTuber", "root_tuber", "",
   "RootTuber", "potato", "",
@@ -542,5 +594,5 @@ usethis::use_data(mapping_country, grid_fao_glu, mapping_fao_glu, mapping_gcam_i
                   mirca_harvest_area, sage, fao_yield, fao_irr_equip, gdp,
                   waldhoff_formula, y_hat, reg_vars, weight_var, n_sig, fit_name,
                   col_scale_region, col_fill_region, theme_basic,
-                  map_country,
+                  map_country, fao_to_mirca, fao_iso,
                   internal = TRUE, overwrite = TRUE )
diff --git a/man/clean_sage.Rd b/man/clean_sage.Rd
new file mode 100644
index 0000000..96eabac
--- /dev/null
+++ b/man/clean_sage.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/crop_calendars.R
+\name{clean_sage}
+\alias{clean_sage}
+\title{clean_sage}
+\usage{
+clean_sage(d = NULL)
+}
+\arguments{
+\item{d}{Default = NULL. data frame for crop calendar}
+}
+\value{
+A data frame of clean crop calendar
+}
+\description{
+Clean up the sage crop calendar by reassigning the iso name and
+make sure to keep one set of the plant and harvest for each country and crop
+}
+\keyword{internal}
diff --git a/man/clean_yield.Rd b/man/clean_yield.Rd
index 701b70e..fdb9cd7 100644
--- a/man/clean_yield.Rd
+++ b/man/clean_yield.Rd
@@ -4,10 +4,12 @@
 \alias{clean_yield}
 \title{clean_yield}
 \usage{
-clean_yield(fao_yield = NULL)
+clean_yield(fao_yield = NULL, fao_to_mirca = NULL)
 }
 \arguments{
 \item{fao_yield}{Default = NULL. Data frame for the fao yield table}
+
+\item{fao_to_mirca}{Default = NULL. Data frame for the fao to mirca crop mapping}
 }
 \value{
 A data frame of formatted FAO yield data
diff --git a/man/crop_calendars.Rd b/man/crop_calendars.Rd
index a03b1e0..94822f1 100644
--- a/man/crop_calendars.Rd
+++ b/man/crop_calendars.Rd
@@ -4,9 +4,17 @@
 \alias{crop_calendars}
 \title{crop_calendars}
 \usage{
-crop_calendars(output_dir = file.path(getwd(), "output"))
+crop_calendars(
+  crop_calendar_file = NULL,
+  crop_select = NULL,
+  output_dir = file.path(getwd(), "output")
+)
 }
 \arguments{
+\item{crop_calendar_file}{Default = NULL. String for the path of the crop calendar file. If crop_cal is provided, crop_select will be set to crops in crop calendar. User provided crop_calendar_file can include any crops MIRCA2000 crops: "wheat", "maize", "rice", "barley", "rye", "millet", "sorghum", "soybean", "sunflower", "root_tuber", "cassava", "sugarcane", "sugarbeet", "oil_palm", "rape_seed", "groundnuts", "pulses", "citrus", "date_palm", "grapes", "cotton", "cocoa", "coffee", "others_perennial", "fodder_grasses", "others_annual"}
+
+\item{crop_select}{Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"}
+
 \item{output_dir}{Default = file.path(getwd(), 'output'). String for output directory}
 }
 \value{
diff --git a/man/verify_crop.Rd b/man/verify_crop.Rd
new file mode 100644
index 0000000..bafcd51
--- /dev/null
+++ b/man/verify_crop.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/crop_calendars.R
+\name{verify_crop}
+\alias{verify_crop}
+\title{verify_crop}
+\usage{
+verify_crop(
+  crop_calendar_file = NULL,
+  crop_select = c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum",
+    "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat")
+)
+}
+\arguments{
+\item{crop_calendar_file}{Default = NULL. String for the path of the crop calendar file. If crop_cal is provided, crop_select will be set to crops in crop calendar. User provided crop_calendar_file can include any crops MIRCA2000 crops: "wheat", "maize", "rice", "barley", "rye", "millet", "sorghum", "soybean", "sunflower", "root_tuber", "cassava", "sugarcane", "sugarbeet", "oil_palm", "rape_seed", "groundnuts", "pulses", "citrus", "date_palm", "grapes", "cotton", "cocoa", "coffee", "others_perennial", "fodder_grasses", "other_annual"}
+
+\item{crop_select}{Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"}
+}
+\value{
+A data frame of crop name mapping between MIRCA and SAGE
+}
+\description{
+Identify crops selected by users are available crop types from both SAGE and MIRCA
+}
+\keyword{internal}
diff --git a/man/yield_impact.Rd b/man/yield_impact.Rd
index 54c8101..451510a 100644
--- a/man/yield_impact.Rd
+++ b/man/yield_impact.Rd
@@ -21,6 +21,8 @@ yield_impact(
   gcam_version = "gcam7",
   gcam_timestep = 5,
   gcamdata_dir = NULL,
+  crop_calendar_file = NULL,
+  crop_select = NULL,
   use_default_coeff = FALSE,
   base_year = 2015,
   start_year = NULL,
@@ -65,6 +67,10 @@ yield_impact(
 
 \item{gcamdata_dir}{Default = NULL. String for directory to the gcamdata folder within the specific GCAM version. The gcamdata need to be run with drake to have the CSV outputs beforehand.}
 
+\item{crop_calendar_file}{Default = NULL. String for the path of the crop calendar file. If crop_cal is provided, crop_select will be set to crops in crop calendar. User provided crop_calendar_file can include any crops MIRCA2000 crops: "wheat", "maize", "rice", "barley", "rye", "millet", "sorghum", "soybean", "sunflower", "root_tuber", "cassava", "sugarcane", "sugarbeet", "oil_palm", "rape_seed", "groundnuts", "pulses", "citrus", "date_palm", "grapes", "cotton", "cocoa", "coffee", "others_perennial", "fodder_grasses", "other_annual"}
+
+\item{crop_select}{Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"}
+
 \item{use_default_coeff}{Default = FALSE. Binary for using default regression coefficients. Set to TRUE will use the default coefficients instead of calculating coefficients from the historical climate data.}
 
 \item{base_year}{Default = 2015. Integer for the base year (for GCAM)}
diff --git a/man/yield_regression.Rd b/man/yield_regression.Rd
index 89ab201..ef4d5ce 100644
--- a/man/yield_regression.Rd
+++ b/man/yield_regression.Rd
@@ -6,6 +6,7 @@
 \usage{
 yield_regression(
   formula = NULL,
+  crop_select = NULL,
   diagnostics = TRUE,
   output_dir = file.path(getwd(), "output")
 )
@@ -13,6 +14,8 @@ yield_regression(
 \arguments{
 \item{formula}{Default = NULL. String for regression formula}
 
+\item{crop_select}{Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"}
+
 \item{diagnostics}{Default = TRUE. Logical for performing diagnostic plot}
 
 \item{output_dir}{Default = file.path(getwd(), 'output'). String for output directory}
diff --git a/man/yield_shock_projection.Rd b/man/yield_shock_projection.Rd
index cd7ee22..0e03636 100644
--- a/man/yield_shock_projection.Rd
+++ b/man/yield_shock_projection.Rd
@@ -13,6 +13,7 @@ yield_shock_projection(
   end_year = NULL,
   gcam_timestep = 5,
   smooth_window = 20,
+  crop_select = NULL,
   diagnostics = TRUE,
   output_dir = file.path(getwd(), "output")
 )
@@ -34,6 +35,8 @@ yield_shock_projection(
 
 \item{smooth_window}{Default = 20. Integer for smoothing window in years}
 
+\item{crop_select}{Default = NULL. Vector of strings for the selected crops from our database. If NULL, the default crops will be used in the crop calendar: c("cassava", "cotton", "maize", "rice", "root_tuber", "sorghum", "soybean", "sugarbeet", "sugarcane", "sunflower", "wheat"). The additional crops available for selection from our crop calendar database are: "barley", "groundnuts", "millet", "pulses", "rape_seed", "rye"}
+
 \item{diagnostics}{Default = TRUE. Logical for performing diagnostic plot}
 
 \item{output_dir}{Default = file.path(getwd(), 'output'). String for output directory}
diff --git a/vignettes/vignette.R b/vignettes/vignette.R
index d861dea..6dd8d99 100644
--- a/vignettes/vignette.R
+++ b/vignettes/vignette.R
@@ -14,7 +14,8 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)
 #    download_url = 'https://zenodo.org/records/13976521/files/weighted_climate.zip?download=1',
 #    data_dir = output_dir
 #  )
-#  
+
+## ----eval=F-------------------------------------------------------------------
 #  # Path to the folder that holds cropland-area-weighted precipitation and temperature TXT files
 #  # historical climate observation
 #  climate_hist_dir <- file.path(data_dir, 'climate_hist')
@@ -36,6 +37,8 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)
 #    data_dir = output_dir
 #  )
 #  
+
+## ----eval=F-------------------------------------------------------------------
 #  # Path to the precipitation and temperature NetCDF files
 #  # NOTE: Each variable can have more than one file
 #  # projected climate data
@@ -201,6 +204,16 @@ knitr::kable(crop_cal_update[1:10],
   kable_styling(bootstrap_options = "striped", full_width = T, position = 'center') %>% 
   footnote(general = 'This only shows the first 10 lines of the example data.')
 
+## ----eval=F, echo=T-----------------------------------------------------------
+#  
+#  # Path to the output folder where you wish to save the outputs. Change it accordingly
+#  output_dir <- 'gaia_output'
+#  
+#  # crop calendars
+#  crop_cal <- crop_calendars(crop_calendar_file = 'path/to/your/crop/calendar/file',
+#                             output_dir = output_dir)
+#  
+
 ## ----eval=F, echo=T-----------------------------------------------------------
 #  
 #  # Path to the output folder where you wish to save the outputs. Change it accordingly
diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd
index 35818b6..796579f 100644
--- a/vignettes/vignette.Rmd
+++ b/vignettes/vignette.Rmd
@@ -525,6 +525,19 @@ knitr::kable(crop_cal_update[1:10],
   footnote(general = 'This only shows the first 10 lines of the example data.')
 ```
 
+If you plan to use your own crop calendar data, make sure it follows the format in [Table 3](#table3). Then, simply provide the path to your crop calendar file in the `crop_calendar` function to run and the subsequent steps will use the crop calendar data you provided.
+
+```{r eval=F, echo=T}
+
+# Path to the output folder where you wish to save the outputs. Change it accordingly
+output_dir <- 'gaia_output'
+
+# crop calendars
+crop_cal <- crop_calendars(crop_calendar_file = 'path/to/your/crop/calendar/file',
+                           output_dir = output_dir)
+
+```
+
 <br />
 
 ### data_aggregation