Books

Same Things, Different Words

\[ y = a + bx \]

\[ y = a + bx \]

Machine Learning Deep Learning
Intercept Bias

\[ y = a + bx \]

Machine Learning Deep Learning
Intercept Bias
Coefficients Weights

\[ \frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + 1} \]

\[ \frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + 1} \]

Machine Learning Deep Learning
Inverse Logit Sigmoid

\[ \|x\|_2^2 = x^Tx = \sum_i |x_i|^2 \]

\[ \|x\|_2^2 = x^Tx = \sum_i |x_i|^2 \]

Machine Learning Deep Learning
Ridge Weight Decay

\[ \hat{y} = \hat{f}(\tilde{x}) \]

\[ \hat{y} = \hat{f}(\tilde{x}) \]

Machine Learning Deep Learning
Prediction Inference

\[ y = f(x) \]

Machine Learning

# Build the design matrices for the training and validation sets.
# build.x/build.y come from the `useful` package; contrasts=FALSE keeps
# every level of each factor, and sparse=TRUE returns a sparse matrix,
# which both glmnet and xgboost accept directly.
x_train <- build.x(modelFormula, data = data,
                   contrasts = FALSE, sparse = TRUE)
# Response recoded to 0/1 integers (presumably a two-level factor whose
# levels map to 1/2 -- shifting down by one gives 0/1).
y_train <- as.integer(build.y(modelFormula, data = data)) - 1

x_val <- build.x(modelFormula, data = validate,
                 contrasts = FALSE, sparse = TRUE)
y_val <- as.integer(build.y(modelFormula, data = validate)) - 1

# Penalized logistic regression with the penalty strength (lambda)
# chosen by 10-fold cross-validation; alpha is left at its default.
mod_glmnet <- cv.glmnet(x = x_train, y = y_train,
                        family = 'binomial', nfolds = 10)

# Interactive coefficient-path plot over the whole lambda sequence.
coefpath(mod_glmnet)

# Coefficients at the CV-error-minimizing lambda, ordered by magnitude.
coefplot(mod_glmnet, lambda = 'lambda.min', sort = 'magnitude')

# Wrap the design matrices in xgboost's optimized DMatrix container.
xg_train <- xgb.DMatrix(data=x_train, label=y_train)
xg_val <- xgb.DMatrix(data=x_val, label=y_val)

# Gradient boosted trees for binary classification, with early stopping
# monitored on the validation set.
model_xgboost <- xgb.train(
    data=xg_train,
    booster='gbtree', objective='binary:logistic',
    nrounds=500,
    # watchlist must be a named *list* of xgb.DMatrix objects;
    # c() strips the xgb.DMatrix class from its elements, which breaks
    # per-round evaluation and early stopping.
    watchlist=list(train=xg_train, validate=xg_val),
    # stop if the validation metric has not improved in 50 rounds
    early_stopping_rounds=50, verbose=FALSE
)

# Per-feature importance computed from the fitted trees.
xgb.plot.importance(xgb.importance(model_xgboost, feature_names=colnames(xg_train)))

# Collapse the ensemble into a single summarized tree visualization.
xgb.plot.multi.trees(model_xgboost, feature_names=colnames(xg_train))

# Refit both model families through caret's unified train() interface,
# so tuning and resampling (from `controls`) are handled identically.
caret_glmnet <- train(modelFormula, data = data,
                      method = "glmnet", trControl = controls)
caret_xgboost <- train(modelFormula, data = data,
                       method = "xgbTree", trControl = controls)

Deep Learning

Extreme Non-Linear Modeling

# Rebuild the design matrices as dense matrices (sparse=FALSE): the
# deep learning frameworks below expect ordinary numeric arrays rather
# than sparse matrix objects.
x_train <- build.x(modelFormula, data = data,
                   contrasts = FALSE, sparse = FALSE)
# Response recoded to 0/1 integers, as before.
y_train <- as.integer(build.y(modelFormula, data = data)) - 1

x_val <- build.x(modelFormula, data = validate,
                 contrasts = FALSE, sparse = FALSE)
y_val <- as.integer(build.y(modelFormula, data = validate)) - 1

# Define the mxnet network symbolically, one layer at a time:
# input -> two dropout-regularized fully connected relu layers ->
# a single unit with a sigmoid (logistic) output.
net_mxnet <- mx.symbol.Variable('data')
# randomly drop 20% of the raw predictors each pass (input regularization)
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.2, name='Predictor_Dropout')
# first hidden layer: 256 units with relu activations, then 50% dropout
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=256, name='fc_1')
net_mxnet <- mx.symbol.Activation(net_mxnet, act_type='relu', name='relu_1')
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.5, name='dropout_1')
# second hidden layer: 128 units with relu activations, then 50% dropout
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=128, name='fc_2')
net_mxnet <- mx.symbol.Activation(net_mxnet, act_type='relu', name='relu_2')
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.5, name='dropout_2')
# one output unit run through the logistic-regression (sigmoid) output
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=1, name='out')
net_mxnet <- mx.symbol.LogisticRegressionOutput(net_mxnet, name='output')

# Log-loss from the Metrics package wrapped in mxnet's custom-metric
# interface so it can be used as the training evaluation metric.
mx.metric.logloss <- mx.metric.custom(
    "logloss",
    function(label, pred) Metrics::logLoss(label, pred)
)
# logger that accumulates the train/eval metric value at each epoch
logger <- mx.metric.logger$new()

# train the model
# Train the feed-forward network defined above. All arguments are
# passed by name, so their order here is purely cosmetic.
mod_mxnet <- mx.model.FeedForward.create(
    symbol             = net_mxnet,          # the network architecture
    X                  = x_train,            # predictor matrix
    y                  = y_train,            # 0/1 response
    eval.data          = list(data=x_val, label=y_val),  # held-out data
    ctx                = mx.cpu(),           # train on the CPU
    num.round          = 50,                 # number of epochs
    array.batch.size   = 128,                # rows per mini-batch
    array.layout       = "rowmajor",         # observations stored as rows
    optimizer          = "adam",             # Adam optimization method
    learning.rate      = 0.001,              # step size for Adam
    eval.metric        = mx.metric.logloss,  # score epochs with log-loss
    epoch.end.callback = mx.callback.log.train.metric(1, logger),
    verbose            = FALSE
)

# Collect the per-epoch train/validation log-loss recorded by the
# logger and chart it as an interactive dygraph.
mod_mxnet_eval <- tibble::tibble(
    Epoch = seq_along(logger$train),
    Train = logger$train,
    Validate = logger$eval
)
dygraphs::dygraph(mod_mxnet_eval, elementId='mxnetEval')

# Keras network mirroring the mxnet one: two batch-normalized,
# dropout-regularized relu layers feeding a single sigmoid output unit.
# NOTE: layer_* functions modify the model in place, so the pipe is
# building one model object, not copies.
net_keras <- keras_model_sequential() %>%
    # 512-unit relu layer; input_shape is the number of predictor
    # columns (the last element of dim(x_train))
    layer_dense(units=512, activation='relu',
                input_shape=dim(x_train)[[-1]], name='fc1') %>%
    layer_batch_normalization(name='batchnorm1') %>%
    layer_dropout(rate=0.5, name='dropout1') %>%
    # 256-unit relu layer with the same normalization/dropout pattern
    layer_dense(units=256, activation='relu', name='fc2') %>%
    layer_batch_normalization(name='batchnorm2') %>%
    layer_dropout(rate=0.5, name='dropout2') %>%
    # single sigmoid unit for binary classification
    layer_dense(units=1, activation="sigmoid", name='out')

# compile() also works in place: it attaches the optimizer, loss, and
# metrics to net_keras without needing reassignment.
net_keras %>% compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = c("accuracy")
)

# Fit the keras model, monitoring loss/accuracy on the validation set.
history <- net_keras %>% 
    fit(x=x_train, y=y_train,
        epochs=10,
        batch_size=128,
        validation_data=list(x_val, y_val),
        verbose=FALSE, view_metrics=TRUE,
        callbacks=list(
            # stop once validation accuracy stalls for 5 epochs
            # NOTE(review): TF1-era keras names this metric 'val_acc';
            # newer versions report 'val_accuracy' -- confirm against
            # the installed keras/tensorflow version.
            callback_early_stopping(monitor='val_acc',
                                    patience=5),
            # checkpoint only the weights with the best validation loss
            callback_model_checkpoint(filepath='model_deep.h5',
                                      monitor='val_loss',
                                      save_best_only=TRUE),
            # cut the learning rate 10x after 3 stagnant epochs
            callback_reduce_lr_on_plateau(monitor='val_loss',
                                          factor=0.1,
                                          patience=3),
            # write logs for viewing with tensorboard()
            callback_tensorboard("logs/run_a")
        )
    )

# Per-epoch train vs validation loss as an interactive dygraph,
# pulled from the metrics recorded in the fit history.
mod_keras_eval <- tibble::tibble(Epoch=seq_along(history$metrics$loss), 
                                 Train=history$metrics$loss, 
                                 Validate=history$metrics$val_loss)
dygraphs::dygraph(mod_keras_eval, elementId='kerasEval')

Wrapping Up

Machine Learning Deep Learning
Centuries Old Terms New Terms

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box
Strong Predictions Better Predictions?

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box
Strong Predictions Better Predictions?
Great Books Great Books

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box
Strong Predictions Better Predictions?
Great Books Great Books
A Ton of Excitement A Ton of Excitement

Thank You

Jared P. Lander