#' Summarize Results
#'
#' Get adjusted totals and prevalence for provided variables.
#'
#' @param df data.frame with sample and weights (if using a survey design)
#' @param vars string vector of variables to calculate prevalences for
#' @param weight_col string specifying the column with weights or NULL for unweighted
#' @param id_col string specifying the column with IDs for cluster-aware standard error (SE) calculations
#' @param strata_col string specifying the column with strata for cluster-aware SE calculations
#' @param label string label for weighting method
#'
#' @return data.frame with totals, means, and standard errors (if using a survey design)
#' 
#' @examples
#' # Prepare the NHIS data
#' calVars <- c(
#'   "SEX_A_R", "AGEP_A_R", "HISPALLP_A_R", "ORIENT_A_R", "HICOV_A_R", "EDUCP_A_R", "REGION_R",
#'   "EMPLASTWK_A_R", "HOUTENURE_A_R", "MARITAL_A_R"
#' )
#' stuVars <- "DIBTYPE_A_R"
#' vars_dummies <- c("AGEP_A_R","HISPALLP_A_R","EDUCP_A_R","REGION_R")
#' nhis_keep_vars <- c("PPSU","PSTRAT","WTFA_A")
#' nhis_imputed <- impute_data(nhis_processed, c(calVars, stuVars), nhis_keep_vars)
#' nhis_dummied <- dummies(nhis_imputed, vars=paste0(vars_dummies, '_I'))
#' factor_vars <- setdiff(names(nhis_dummied), nhis_keep_vars)
#' nhis_dummied[factor_vars] <- lapply(nhis_dummied[factor_vars], as.factor)
#' 
#' # Prepare the synthetic All of Us data
#' aou_imputed <- impute_data(aou_synthetic, c(calVars, stuVars))
#' aou_dummied <- dummies(aou_imputed, vars=paste0(vars_dummies, '_I'))
#' aou_dummied[] <- lapply(aou_dummied, as.factor)
#' 
#' # Calculate IPW weights using NHIS data and applied to All of Us
#' weights_df <- calculate_weights(
#'   nhis_dummied, 
#'   aou_dummied,
#'   'ipw',
#'   paste0(calVars, '_I'), 
#'   paste0(stuVars, '_I'), 
#'   weight='WTFA_A',
#'   strata='PSTRAT',
#'   psu='PPSU'
#' )
#' 
#' # Get IPW results
#' results_ipw <- summarize_results(
#'   weights_df,
#'   c(paste0(calVars, '_I'), paste0(stuVars, '_I')), 
#'   weight_col='ipw_weight', 
#'   label='AoU: IPW'
#' )
#' 
#' @import dplyr
#' @importFrom stats as.formula
#' @importFrom purrr map_dfr
#' @importFrom survey svytotal svymean svydesign
#' @export
summarize_results <- function(df, vars, weight_col=NULL, id_col=1, strata_col=NULL, label=NULL) {

    # Builds one row per level of every variable in `vars`.
    #   * weight_col = NULL: plain frequencies (TOT) and proportions (MEAN).
    #   * otherwise: a survey design is created and svytotal()/svymean()
    #     supply TOT/MEAN together with their standard errors (SETOT/SEMEAN).
    # When `label` is given it is stored in a `w_method` column.
    weighted <- !is.null(weight_col)

    if (weighted) {
        # The lonely-PSU adjustment is only needed while the estimates below
        # are computed. Restore the caller's setting on exit so the function
        # does not permanently change session-wide options.
        old_opts <- options(survey.lonely.psu = "adjust")
        on.exit(options(old_opts), add = TRUE)

        strata_formula <- NULL
        if (!is.null(strata_col)) {
            # Strata are treated as categorical and passed as a formula
            df[[strata_col]] <- factor(df[[strata_col]])
            strata_formula <- as.formula(paste0("~", strata_col))
        }

        survey_design <- svydesign(
            ids = as.formula(paste0("~", id_col)),
            strata = strata_formula,
            weights = as.formula(paste0("~", weight_col)),
            data = df,
            nest = TRUE
        )
    } else {
        # The unweighted path indexes columns directly, so fail early with a
        # readable message instead of a row-count error from data.frame().
        missing_vars <- setdiff(vars, names(df))
        if (length(missing_vars) > 0) {
            stop(
                "Variables not found in `df`: ",
                paste(missing_vars, collapse = ", "),
                call. = FALSE
            )
        }
    }

    # Summary rows for a single variable
    summarize_one <- function(var) {
        if (!weighted) {
            # Unweighted frequencies and proportions (table() drops NAs)
            counts <- as.data.frame(table(df[[var]]))
            return(data.frame(
                VARNAME = paste0(var, counts$Var1),
                TOT = counts$Freq,
                MEAN = counts$Freq / sum(counts$Freq)
            ))
        }

        var_formula <- as.formula(paste0("~", var))
        # Weighted totals and means, each with its standard error
        totals <- as.data.frame(svytotal(var_formula, survey_design))
        means <- as.data.frame(svymean(var_formula, survey_design))

        data.frame(
            VARNAME = rownames(totals),
            TOT = totals$total,
            SETOT = totals$SE,
            MEAN = means$mean,
            SEMEAN = means$SE
        )
    }

    results <- map_dfr(vars, summarize_one)

    if (!is.null(label)) {
        results$w_method <- label
    }

    results
}


#' Summarize Results by Group
#'
#' Get adjusted totals and prevalences for provided variables, grouped by specified variables.
#'
#' TODO: Merge into regular summarize_results function
#'
#' @param df data.frame with sample and weights (if using a survey design)
#' @param vars string vector of variables to calculate prevalences for
#' @param group_vars string vector of variables to group by
#' @param weight_col string specifying the column with weights, "nhis" or nhis survey design, or NULL for unweighted
#' @param id_col string specifying the column with IDs for cluster-aware standard error (SE) calculations
#' @param strata_col string specifying the column with strata for cluster-aware SE calculations
#' @param label string label for weighting method
#'
#' @return data.frame with one row per outcome variable, grouping variable, and group level, holding the
#'   mean (WMEAN) and its standard error (SEMEAN; set to NA when unweighted)
#' 
#' @examples
#' # Prepare the NHIS data
#' calVars <- c(
#'   "SEX_A_R", "AGEP_A_R", "HISPALLP_A_R", "ORIENT_A_R", "HICOV_A_R", "EDUCP_A_R", "REGION_R",
#'   "EMPLASTWK_A_R", "HOUTENURE_A_R", "MARITAL_A_R"
#' )
#' stuVars <- "DIBTYPE_A_R"
#' vars_dummies <- c("AGEP_A_R","HISPALLP_A_R","EDUCP_A_R","REGION_R")
#' nhis_keep_vars <- c("PPSU","PSTRAT","WTFA_A")
#' nhis_imputed <- impute_data(nhis_processed, c(calVars, stuVars), nhis_keep_vars)
#' nhis_dummied <- dummies(nhis_imputed, vars=paste0(vars_dummies, '_I'))
#' factor_vars <- setdiff(names(nhis_dummied), nhis_keep_vars)
#' nhis_dummied[factor_vars] <- lapply(nhis_dummied[factor_vars], as.factor)
#' 
#' # Prepare the synthetic All of Us data
#' aou_imputed <- impute_data(aou_synthetic, c(calVars, stuVars))
#' aou_dummied <- dummies(aou_imputed, vars=paste0(vars_dummies, '_I'))
#' aou_dummied[] <- lapply(aou_dummied, as.factor)
#' 
#' # Calculate IPW weights using NHIS data and applied to All of Us
#' weights_df <- calculate_weights(
#'   nhis_dummied, 
#'   aou_dummied, 
#'   'ipw',
#'   paste0(calVars, '_I'), 
#'   paste0(stuVars, '_I'), 
#'   weight='WTFA_A',
#'   strata='PSTRAT',
#'   psu='PPSU'
#' )
#' 
#' # Get IPW results by group
#' ipw_outcome_df <- summarize_results_by_group(
#'   weights_df, 
#'   paste0(stuVars, '_I'), 
#'   paste0(calVars, '_I'), 
#'   weight_col='ipw_weight', 
#'   label='AoU: IPW'
#' )
#' 
#' @import dplyr
#' @importFrom stats as.formula
#' @importFrom survey svytotal svymean svydesign svyby
#' @export
summarize_results_by_group <- function(df, vars, group_vars, weight_col=NULL, id_col=NULL, strata_col=NULL, label=NULL) {

    # For every (group_var, var) pair, estimates the mean of the level-"1"
    # indicator of `var` within each level of `group_var` using svyby() with
    # svymean(), then stacks all results into one long data frame with columns
    # outcome_var, group_var, level_var, WMEAN, SEMEAN (and Method if labelled).
    if (is.null(weight_col)) {
        weight_col <- '1'
        message('Calculating unweighted prevalences')
    }
    if (is.null(id_col)) {
        id_col <- '1'
    }

    # The lonely-PSU adjustment is only needed while the estimates below are
    # computed. Restore the caller's setting on exit so the function does not
    # permanently change session-wide options.
    old_opts <- options(survey.lonely.psu = "adjust")
    on.exit(options(old_opts), add = TRUE)

    strata_formula <- NULL
    if (!is.null(strata_col)) {
        # Strata are treated as categorical and passed as a formula
        df[[strata_col]] <- factor(df[[strata_col]])
        strata_formula <- as.formula(paste0('~', strata_col))
    }

    survey_design <- svydesign(
        ids = as.formula(paste0('~', id_col)),
        strata = strata_formula,
        weights = as.formula(paste0('~', weight_col)),
        data = df,
        nest = TRUE
    )

    # Repeated names are computed once, in order of first appearance.
    group_vars <- unique(group_vars)
    vars <- unique(vars)

    # Results are collected positionally: keying them by
    # paste(group_var, var, sep = "_") lets distinct pairs such as
    # ("a_b", "c") and ("a", "b_c") overwrite each other.
    pieces <- vector("list", length(group_vars) * length(vars))
    k <- 0L

    for (group_var in group_vars) {
        group_formula <- as.formula(paste0("~", group_var))

        for (var in vars) {
            est <- svyby(
                as.formula(paste0("~", var)),
                group_formula,
                survey_design,
                svymean,
                na.rm = TRUE
            )

            # Keep the group level plus the estimate/SE columns of level "1"
            est_selected <- select(est, 1, ends_with("1"))

            # Anything other than exactly (level, estimate, SE) means the
            # suffix match was ambiguous or empty; renaming by position would
            # then mislabel columns, so stop instead of returning wrong values.
            if (ncol(est_selected) != 3L) {
                stop(
                    "Expected exactly one estimate and one SE column ending in '1' for '",
                    var, "' by '", group_var, "', found ",
                    ncol(est_selected) - 1L, " matching column(s)",
                    call. = FALSE
                )
            }

            colnames(est_selected) <- c("level_var", "WMEAN", "SEMEAN")
            est_selected$outcome_var <- var
            est_selected$group_var <- group_var

            k <- k + 1L
            pieces[[k]] <- est_selected
        }
    }

    # Compile results in one dataset
    outcome_df <- select(bind_rows(pieces), outcome_var, group_var, level_var, WMEAN, SEMEAN)
    row.names(outcome_df) <- NULL

    if (weight_col == '1') {
        # Drop standard errors from unweighted to avoid confusion
        outcome_df$SEMEAN <- NA
    }

    if (!is.null(label)) {
        outcome_df$Method <- label
    }

    outcome_df
}

