## ----setup, include = FALSE-------------------------------------------------------------------------------------------------------------------------
library(knitr)

# Widen console output so printed results are not wrapped. This is a separate
# statement on purpose: options() returns the *previous* option values, which
# must not be handed to opts_chunk$set() as an unnamed chunk option.
options(width = 150)

opts_chunk$set(
  comment = "",
  fig.width = 12,
  message = FALSE,
  warning = FALSE,
  tidy.opts = list(
    keep.blank.line = TRUE,
    width.cutoff = 150
  ),
  eval = TRUE
)


## ----eval=FALSE-------------------------------------------------------------------------------------------------------------------------------------
# install.packages('FSelectorRcpp') # stable release version on CRAN
# devtools::install_github('mi2-warsaw/FSelectorRcpp') # dev version
# # Windows users should have Rtools for the devtools installation
# # https://cran.r-project.org/bin/windows/Rtools/


## ---------------------------------------------------------------------------------------------------------------------------------------------------
library(magrittr)
library(FSelectorRcpp)


## ---------------------------------------------------------------------------------------------------------------------------------------------------
# Entropy-based feature selection piped straight into a model fit.
# NOTE(review): `Species` has three levels; with family = "binomial" a factor
# response is treated as "first level" vs. "any other level" -- confirm that
# this is the intended illustration.
information_gain(
  formula = Species ~ ., # Score every attribute on the right-hand side.
  data = iris,           # Attributes must exist in the passed data.
  type = "infogain"      # Type of score to calculate.
) %>%
  cut_attrs(
    k = 2                # Keep the 2 attributes with the highest rank.
  ) %>%
  to_formula(
    attrs = .,           # Build a new formula from the most influential
    class = "Species"    # attributes.
  ) %>%
  glm(
    formula = .,         # Use that formula in any classification algorithm.
    data = iris,
    family = "binomial"
  )


## ---------------------------------------------------------------------------------------------------------------------------------------------------
# Scorer used by feature_search(): fits a linear model of `dependent` on the
# given attribute subset and returns the model's R-squared.
#
# attributes  The currently considered subset of attributes (passed to
#             to_formula() as the predictors).
# data        The dataset the model is fitted on.
# dependent   Name of the dependent variable; defaults to the first column
#             name of `data`.
#
# Returns the `r.squared` element of summary(lm(...)).
evaluator_R2_lm <- function(attributes, data, dependent = names(data)[1]) {
  model_formula <- to_formula(attributes, dependent)
  fitted_model <- lm(model_formula, data = data)
  summary(fitted_model)$r.squared
}

# Candidate attributes: every column of iris except the first, which
# evaluator_R2_lm() uses as the dependent variable by default.
candidate_attrs <- names(iris)[-1]

# feature_search() works in 2 modes - "exhaustive" and "greedy".
feature_search(
  attributes = candidate_attrs, # Attributes to build subset combinations from.
  fun = evaluator_R2_lm,        # Evaluator that scores each subset on `data`.
  data = iris,
  mode = "exhaustive",          # Check all possible subset combinations ...
  # ... of the sizes passed in `sizes`. The sizes must be derived from the
  # candidate vector itself: a bare `attributes` at this call site would refer
  # to base::attributes (a function of length 1), restricting the search to
  # single-attribute subsets.
  sizes = seq_along(candidate_attrs)
)$all