Books

Same Things, Different Words

\[ y = a + bx \]

\[ y = a + bx \]

Machine Learning Deep Learning
Intercept Bias

\[ y = a + bx \]

Machine Learning Deep Learning
Intercept Bias
Coefficients Weights

\[ \frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + 1} \]

\[ \frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + 1} \]

Machine Learning Deep Learning
Inverse Logit Sigmoid

\[ \|x\|_2^2 = x^Tx = \sum_i |x_i|^2 \]

\[ \|x\|_2^2 = x^Tx = \sum_i |x_i|^2 \]

Machine Learning Deep Learning
Ridge Weight Decay

\[ \hat{y} = \hat{f}(\tilde{x}) \]

\[ \hat{y} = \hat{f}(\tilde{x}) \]

Machine Learning Deep Learning
Prediction Inference

\[ y = f(x) \]

Machine Learning

# Build the design matrices for the training and validation sets.
# build.x/build.y come from the `useful` package; contrasts=FALSE keeps
# every level of each factor, and sparse=TRUE returns a sparse matrix,
# which both glmnet and xgboost accept directly.
x_train <- build.x(modelFormula, data = data,
                   contrasts = FALSE, sparse = TRUE)
# Response recoded to 0/1 integers (presumably a two-level factor whose
# levels map to 1/2 -- shifting down by one gives 0/1).
y_train <- as.integer(build.y(modelFormula, data = data)) - 1

x_val <- build.x(modelFormula, data = validate,
                 contrasts = FALSE, sparse = TRUE)
y_val <- as.integer(build.y(modelFormula, data = validate)) - 1

# Penalized logistic regression with the penalty strength (lambda)
# chosen by 10-fold cross-validation; alpha is left at its default.
mod_glmnet <- cv.glmnet(x = x_train, y = y_train,
                        family = 'binomial', nfolds = 10)

# Interactive coefficient-path plot over the whole lambda sequence.
coefpath(mod_glmnet)

# Coefficients at the CV-error-minimizing lambda, ordered by magnitude.
coefplot(mod_glmnet, lambda = 'lambda.min', sort = 'magnitude')

# Wrap the design matrices in xgboost's optimized DMatrix container.
xg_train <- xgb.DMatrix(data=x_train, label=y_train)
xg_val <- xgb.DMatrix(data=x_val, label=y_val)

# Gradient boosted trees for binary classification, with early stopping
# monitored on the validation set.
model_xgboost <- xgb.train(
    data=xg_train,
    booster='gbtree', objective='binary:logistic',
    nrounds=500,
    # watchlist must be a named *list* of xgb.DMatrix objects;
    # c() strips the xgb.DMatrix class from its elements, which breaks
    # per-round evaluation and early stopping.
    watchlist=list(train=xg_train, validate=xg_val),
    # stop if the validation metric has not improved in 50 rounds
    early_stopping_rounds=50, verbose=FALSE
)

# Per-feature importance computed from the fitted trees.
xgb.plot.importance(xgb.importance(model_xgboost, feature_names=colnames(xg_train)))

# Collapse the ensemble into a single summarized tree visualization.
xgb.plot.multi.trees(model_xgboost, feature_names=colnames(xg_train))

# Refit both model families through caret's unified train() interface,
# so tuning and resampling (from `controls`) are handled identically.
caret_glmnet <- train(modelFormula, data = data,
                      method = "glmnet", trControl = controls)
caret_xgboost <- train(modelFormula, data = data,
                       method = "xgbTree", trControl = controls)

Deep Learning

Extreme Non-Linear Modeling

# Rebuild the design matrices as dense matrices (sparse=FALSE): the
# deep learning frameworks below expect ordinary numeric arrays rather
# than sparse matrix objects.
x_train <- build.x(modelFormula, data = data,
                   contrasts = FALSE, sparse = FALSE)
# Response recoded to 0/1 integers, as before.
y_train <- as.integer(build.y(modelFormula, data = data)) - 1

x_val <- build.x(modelFormula, data = validate,
                 contrasts = FALSE, sparse = FALSE)
y_val <- as.integer(build.y(modelFormula, data = validate)) - 1

# Define the mxnet network symbolically, one layer at a time:
# input -> two dropout-regularized fully connected relu layers ->
# a single unit with a sigmoid (logistic) output.
net_mxnet <- mx.symbol.Variable('data')
# randomly drop 20% of the raw predictors each pass (input regularization)
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.2, name='Predictor_Dropout')
# first hidden layer: 256 units with relu activations, then 50% dropout
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=256, name='fc_1')
net_mxnet <- mx.symbol.Activation(net_mxnet, act_type='relu', name='relu_1')
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.5, name='dropout_1')
# second hidden layer: 128 units with relu activations, then 50% dropout
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=128, name='fc_2')
net_mxnet <- mx.symbol.Activation(net_mxnet, act_type='relu', name='relu_2')
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.5, name='dropout_2')
# one output unit run through the logistic-regression (sigmoid) output
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=1, name='out')
net_mxnet <- mx.symbol.LogisticRegressionOutput(net_mxnet, name='output')

# Log-loss from the Metrics package wrapped in mxnet's custom-metric
# interface so it can be used as the training evaluation metric.
mx.metric.logloss <- mx.metric.custom(
    "logloss",
    function(label, pred) Metrics::logLoss(label, pred)
)
# logger that accumulates the train/eval metric value at each epoch
logger <- mx.metric.logger$new()

# train the model
# Train the feed-forward network defined above. All arguments are
# passed by name, so their order here is purely cosmetic.
mod_mxnet <- mx.model.FeedForward.create(
    symbol             = net_mxnet,          # the network architecture
    X                  = x_train,            # predictor matrix
    y                  = y_train,            # 0/1 response
    eval.data          = list(data=x_val, label=y_val),  # held-out data
    ctx                = mx.cpu(),           # train on the CPU
    num.round          = 50,                 # number of epochs
    array.batch.size   = 128,                # rows per mini-batch
    array.layout       = "rowmajor",         # observations stored as rows
    optimizer          = "adam",             # Adam optimization method
    learning.rate      = 0.001,              # step size for Adam
    eval.metric        = mx.metric.logloss,  # score epochs with log-loss
    epoch.end.callback = mx.callback.log.train.metric(1, logger),
    verbose            = FALSE
)

# Collect the per-epoch train/validation log-loss recorded by the
# logger and chart it as an interactive dygraph.
mod_mxnet_eval <- tibble::tibble(
    Epoch = seq_along(logger$train),
    Train = logger$train,
    Validate = logger$eval
)
dygraphs::dygraph(mod_mxnet_eval, elementId='mxnetEval')

# Keras network mirroring the mxnet one: two batch-normalized,
# dropout-regularized relu layers feeding a single sigmoid output unit.
# NOTE: layer_* functions modify the model in place, so the pipe is
# building one model object, not copies.
net_keras <- keras_model_sequential() %>%
    # 512-unit relu layer; input_shape is the number of predictor
    # columns (the last element of dim(x_train))
    layer_dense(units=512, activation='relu',
                input_shape=dim(x_train)[[-1]], name='fc1') %>%
    layer_batch_normalization(name='batchnorm1') %>%
    layer_dropout(rate=0.5, name='dropout1') %>%
    # 256-unit relu layer with the same normalization/dropout pattern
    layer_dense(units=256, activation='relu', name='fc2') %>%
    layer_batch_normalization(name='batchnorm2') %>%
    layer_dropout(rate=0.5, name='dropout2') %>%
    # single sigmoid unit for binary classification
    layer_dense(units=1, activation="sigmoid", name='out')

# compile() also works in place: it attaches the optimizer, loss, and
# metrics to net_keras without needing reassignment.
net_keras %>% compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = c("accuracy")
)

# Fit the keras model, monitoring loss/accuracy on the validation set.
history <- net_keras %>% 
    fit(x=x_train, y=y_train,
        epochs=10,
        batch_size=128,
        validation_data=list(x_val, y_val),
        verbose=FALSE, view_metrics=TRUE,
        callbacks=list(
            # stop once validation accuracy stalls for 5 epochs
            # NOTE(review): TF1-era keras names this metric 'val_acc';
            # newer versions report 'val_accuracy' -- confirm against
            # the installed keras/tensorflow version.
            callback_early_stopping(monitor='val_acc',
                                    patience=5),
            # checkpoint only the weights with the best validation loss
            callback_model_checkpoint(filepath='model_deep.h5',
                                      monitor='val_loss',
                                      save_best_only=TRUE),
            # cut the learning rate 10x after 3 stagnant epochs
            callback_reduce_lr_on_plateau(monitor='val_loss',
                                          factor=0.1,
                                          patience=3),
            # write logs for viewing with tensorboard()
            callback_tensorboard("logs/run_a")
        )
    )

# Per-epoch train vs validation loss as an interactive dygraph,
# pulled from the metrics recorded in the fit history.
mod_keras_eval <- tibble::tibble(Epoch=seq_along(history$metrics$loss), 
                                 Train=history$metrics$loss, 
                                 Validate=history$metrics$val_loss)
dygraphs::dygraph(mod_keras_eval, elementId='kerasEval')

Wrapping Up

Machine Learning Deep Learning
Centuries Old Terms New Terms

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box
Strong Predictions Better Predictions?

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box
Strong Predictions Better Predictions?
Great Books Great Books

Machine Learning Deep Learning
Centuries Old Terms New Terms
Many Packages Handful of Packages
One-Line Functions Much More Code
caret caret-lite
Some Explanation Black Box
Strong Predictions Better Predictions?
Great Books Great Books
A Ton of Excitement A Ton of Excitement

Thank You

Jared P. Lander