# -*- coding: utf-8 -*- """ Example on how to use the GPBoost algorithm for panel data @author: Fabio Sigrist """ import gpboost as gpb from statsmodels.datasets import grunfeld import numpy as np import matplotlib.pyplot as plt plt.style.use('ggplot') # Load data data = grunfeld.load_pandas().data # Visualize response variable plt.hist(data['invest'], bins=50) plt.title("Histogram of response variable") """ Boosting with two crossed firm and year grouped random effects """ # Define random effects model (assuming firm and year random effects) gp_model = gpb.GPModel(group_data=data[['firm', 'year']]) # Create dataset for gpb.train data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest']) # Specify boosting parameters as dict # Note: no attempt has been done to optimaly choose tuning parameters params = { 'objective': 'regression_l2', 'learning_rate': 1, 'max_depth': 6, 'min_data_in_leaf': 1, 'verbose': 0 } # Train GPBoost model bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=1800) # Estimated random effects model (variances of random effects) gp_model.summary() # Cross-validation for determining number of boosting iterations gp_model = gpb.GPModel(group_data=data[['firm', 'year']]) data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest']) cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model, use_gp_model_for_validation=True, num_boost_round=5000, early_stopping_rounds=5, nfold=2, verbose_eval=True, show_stdv=False, seed=1) print("Best number of iterations: " + str(np.argmin(cvbst['l2-mean']))) """ Linear mixed effecst model with two crossed firm and year grouped random effects """ lin_gp_model = gpb.GPModel(group_data=data[['firm', 'year']]) # Add interecept for linear model X = data[['value', 'capital']] X['intercept'] = 1 lin_gp_model.fit(y=data['invest'], X=X, params={"std_dev": True}) lin_gp_model.summary() """ Boosting with grouped firm random effects and one "global" AR(1) year random effect I.e., all firms share the same temporal AR(1) effect. """ gp_model_ar1 = gpb.GPModel(group_data=data['firm'], gp_coords=data['year'], cov_function="exponential") data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest']) # Train GPBoost model (takes a few seconds) bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model_ar1, num_boost_round=1800) # Estimated random effects model (variances of random effects and range parameters) gp_model_ar1.summary() cov_pars = gp_model_ar1.get_cov_pars() phi_hat = np.exp(-1 / cov_pars['GP_range'][0]) sigma2_hat = cov_pars['GP_var'][0] * (1. - phi_hat ** 2) print("Estimated innovation variance and AR(1) coefficient of year effect:") print([sigma2_hat ,phi_hat]) """ Boosting with grouped firm random effects and separate AR(1) year random effects per firm I.e., every firms has its own temporal AR(1) effect. This can be done using the 'cluster_ids' parameter. """ gp_model_ar1 = gpb.GPModel(group_data=data['firm'], gp_coords=data['year'], cluster_ids=data['firm'], cov_function="exponential") # Need to use the more robust option gradient_descent instead of fisher_scoring in this example gp_model_ar1.set_optim_params(params={"optimizer_cov": "gradient_descent"}) data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest']) # Train GPBoost model (takes a few seconds) bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model_ar1, num_boost_round=1800) # Estimated random effects model (variances of random effects and range parameters) gp_model_ar1.summary() cov_pars = gp_model_ar1.get_cov_pars() phi_hat = np.exp(-1 / cov_pars['GP_range'][0]) sigma2_hat = cov_pars['GP_var'][0] * (1. - phi_hat ** 2) print("Estimated innovation variance and AR(1) coefficient of year effect:") print([sigma2_hat ,phi_hat])