Books
Same Things, Different Words
\[ y = a + bx \]
\[ y = a + bx \]
| Machine Learning | Deep Learning |
|---|---|
| Intercept | Bias |
\[ y = a + bx \]
| Machine Learning | Deep Learning |
|---|---|
| Intercept | Bias |
| Coefficients | Weights |
\[ \frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + 1} \]
\[ \frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + 1} \]
| Machine Learning | Deep Learning |
|---|---|
| Inverse Logit | Sigmoid |
\[ \|x\|_2^2 = x^Tx = \sum_i |x_i|^2 \]
\[ \|x\|_2^2 = x^Tx = \sum_i |x_i|^2 \]
| Machine Learning | Deep Learning |
|---|---|
| Ridge | Weight Decay |
\[ \hat{y} = \hat{f}(\tilde{x}) \]
\[ \hat{y} = \hat{f}(\tilde{x}) \]
| Machine Learning | Deep Learning |
|---|---|
| Prediction | Inference |
\[ y = f(x) \]
Machine Learning
# Build sparse design matrices and 0/1 integer responses for glmnet.
x_train <- build.x(modelFormula, data = data, contrasts = FALSE, sparse = TRUE)
y_train <- as.integer(build.y(modelFormula, data = data)) - 1
x_val <- build.x(modelFormula, data = validate, contrasts = FALSE, sparse = TRUE)
y_val <- as.integer(build.y(modelFormula, data = validate)) - 1

# Cross-validated elastic-net logistic regression (10-fold).
mod_glmnet <- cv.glmnet(
  x = x_train, y = y_train,
  family = "binomial", nfolds = 10
)

# Interactive coefficient path, then coefficients at lambda.min by magnitude.
coefpath(mod_glmnet)
coefplot(mod_glmnet, lambda = "lambda.min", sort = "magnitude")
# Wrap the design matrices in xgboost's DMatrix container.
# BUG FIX: the two assignments were fused onto one physical line,
# which does not parse in R; split into separate statements.
xg_train <- xgb.DMatrix(data=x_train, label=y_train)
xg_val <- xgb.DMatrix(data=x_val, label=y_val)

# Gradient boosted trees for binary classification, watching both the
# training and validation sets, stopping after 50 rounds without improvement.
# BUG FIX: `watchlist` must be a list of xgb.DMatrix objects per the
# xgb.train() documentation; `c()` is not the documented container.
model_xgboost <- xgb.train(
    data=xg_train,
    booster='gbtree', objective='binary:logistic',
    nrounds=500,
    watchlist=list(train=xg_train, validate=xg_val),
    early_stopping_rounds=50, verbose=FALSE
)

# Variable importance plot and a multi-tree summary visualization.
# (model argument named explicitly to avoid positional-matching surprises)
xgb.plot.importance(xgb.importance(model=model_xgboost, feature_names=colnames(xg_train)))
xgb.plot.multi.trees(model_xgboost, feature_names=colnames(xg_train))
# Fit the same two model families through caret's unified train() interface;
# tuning/resampling behavior comes from the shared `controls` object.
caret_glmnet <- train(
  modelFormula, data = data,
  method = "glmnet",
  trControl = controls
)
caret_xgboost <- train(
  modelFormula, data = data,
  method = "xgbTree",
  trControl = controls
)
Deep Learning
Extreme Non-Linear Modeling
# Rebuild the design matrices as DENSE (sparse = FALSE) for the deep
# learning frameworks, again with 0/1 integer responses.
x_train <- build.x(modelFormula, data = data, contrasts = FALSE, sparse = FALSE)
y_train <- as.integer(build.y(modelFormula, data = data)) - 1
x_val <- build.x(modelFormula, data = validate, contrasts = FALSE, sparse = FALSE)
y_val <- as.integer(build.y(modelFormula, data = validate)) - 1
# Define the mxnet network symbol layer by layer (same graph the original
# built with one long pipe): input dropout, two relu blocks, sigmoid output.
net_mxnet <- mx.symbol.Variable('data')
# drop out 20% of predictors
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.2, name='Predictor_Dropout')
# fully connected layer with 256 units, relu activation, 50% dropout
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=256, name='fc_1')
net_mxnet <- mx.symbol.Activation(net_mxnet, act_type='relu', name='relu_1')
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.5, name='dropout_1')
# fully connected layer with 128 units, relu activation, 50% dropout
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=128, name='fc_2')
net_mxnet <- mx.symbol.Activation(net_mxnet, act_type='relu', name='relu_2')
net_mxnet <- mx.symbol.Dropout(net_mxnet, p=0.5, name='dropout_2')
# single output unit with sigmoid (logistic regression) output
net_mxnet <- mx.symbol.FullyConnected(net_mxnet, num_hidden=1, name='out')
net_mxnet <- mx.symbol.LogisticRegressionOutput(net_mxnet, name='output')
# custom log-loss evaluation metric for mxnet, delegating to Metrics::logLoss
mx.metric.logloss <- mx.metric.custom("logloss", function(label, pred){
return(Metrics::logLoss(label, pred))
})
# validation logger
# BUG FIX: the assignment below had been swallowed into the comment on the
# same line, so `logger` — required by the epoch.end.callback and the
# post-training evaluation tibble — was never created.
logger <- mx.metric.logger$new()
# train the model
mod_mxnet <- mx.model.FeedForward.create(
symbol = net_mxnet, # the symbolic network
X = x_train, # the predictors
y = y_train, # the response
optimizer = "adam", # using the Adam optimization method
eval.data = list(data=x_val, label=y_val), # validation data
ctx = mx.cpu(), # use the cpu for training
eval.metric = mx.metric.logloss, # evaluate with log-loss
num.round = 50, # 50 epochs
learning.rate = 0.001, # learning rate
array.batch.size = 128, # batch size
array.layout = "rowmajor", # the data is stored in row major format
epoch.end.callback= mx.callback.log.train.metric(1, logger), # record train/eval metrics into `logger` each epoch
verbose = FALSE
)
# collect per-epoch train/validation log-loss from the logger
# (assumes `logger` was created via mx.metric.logger$new() before training)
mod_mxnet_eval <- tibble::tibble(Epoch=seq_along(logger$train),
Train=logger$train, Validate=logger$eval)
# interactive plot of training vs validation loss by epoch
dygraphs::dygraph(mod_mxnet_eval, elementId='mxnetEval')
# Define the keras network without the pipe: two relu blocks
# (dense -> batch norm -> 50% dropout) feeding one sigmoid output unit.
# layer_* functions modify the sequential model in place and return it.
net_keras <- keras_model_sequential()
net_keras <- layer_dense(net_keras, units = 512, activation = 'relu',
                         input_shape = dim(x_train)[[-1]], name = 'fc1')
net_keras <- layer_batch_normalization(net_keras, name = 'batchnorm1')
net_keras <- layer_dropout(net_keras, rate = 0.5, name = 'dropout1')
net_keras <- layer_dense(net_keras, units = 256, activation = 'relu', name = 'fc2')
net_keras <- layer_batch_normalization(net_keras, name = 'batchnorm2')
net_keras <- layer_dropout(net_keras, rate = 0.5, name = 'dropout2')
net_keras <- layer_dense(net_keras, units = 1, activation = "sigmoid", name = 'out')
# Compile in place: Adam optimizer, binary cross-entropy loss,
# tracking accuracy during training.
compile(net_keras,
        optimizer = 'adam',
        loss = 'binary_crossentropy',
        metrics = c("accuracy"))
# fit the keras model: 10 epochs, batch size 128, with early stopping,
# best-model checkpointing, learning-rate reduction on plateau, and
# TensorBoard logging
history <- net_keras %>%
fit(x=x_train, y=y_train,
epochs=10,
batch_size=128,
validation_data=list(x_val, y_val),
verbose=FALSE, view_metrics=TRUE,
callbacks=list(
# stop if validation accuracy fails to improve for 5 epochs
# NOTE(review): 'val_acc' is the metric name in older keras releases;
# newer versions report 'val_accuracy' — confirm against the installed version
callback_early_stopping(monitor='val_acc',
patience=5),
# keep only the model with the lowest validation loss on disk
callback_model_checkpoint(filepath='model_deep.h5',
monitor='val_loss',
save_best_only=TRUE),
# shrink the learning rate 10x after 3 epochs without val_loss improvement
callback_reduce_lr_on_plateau(monitor='val_loss',
factor=0.1,
patience=3),
# write logs for TensorBoard visualization
callback_tensorboard("logs/run_a")
)
)
# Tabulate per-epoch train/validation loss from the fit history,
# then plot it interactively.
mod_keras_eval <- tibble::tibble(
  Epoch = seq_along(history$metrics$loss),
  Train = history$metrics$loss,
  Validate = history$metrics$val_loss
)
dygraphs::dygraph(mod_keras_eval, elementId = 'kerasEval')
Wrapping Up
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| One-Line Functions | Much More Code |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| One-Line Functions | Much More Code |
| caret | caret-lite |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| One-Line Functions | Much More Code |
| caret | caret-lite |
| Some Explanation | Black Box |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| One-Line Functions | Much More Code |
| caret | caret-lite |
| Some Explanation | Black Box |
| Strong Predictions | Better Predictions? |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| One-Line Functions | Much More Code |
| caret | caret-lite |
| Some Explanation | Black Box |
| Strong Predictions | Better Predictions? |
| Great Books | Great Books |
| Machine Learning | Deep Learning |
|---|---|
| Centuries Old Terms | New Terms |
| Many Packages | Handful of Packages |
| One-Line Functions | Much More Code |
| caret | caret-lite |
| Some Explanation | Black Box |
| Strong Predictions | Better Predictions? |
| Great Books | Great Books |
| A Ton of Excitement | A Ton of Excitement |
Thank You