estimate_632_Err = function(DF, x_vars, y_var, B=1000){ ## ## Create a formula for our model and fit it to this data: ## x_pt = paste(x_vars, collapse='+') y_pt = paste(c(y_var,'~'), collapse='') fmla = as.formula(paste(c(y_pt, x_pt), collapse='')) m = lm(fmla, data=DF) ## ## Predict the insample error: ## err_bar = mean(residuals(m)^2) ## ## Generate bootstrap samples: ## N = dim(DF)[1] ## the number of samples in the original dataset SI = sample(N, N*B, replace=TRUE) SI = matrix(SI, nrow=B, ncol=N, byrow=TRUE) ## these are the sample indices that are in each bootstrap replica ## ## Build linear models using each bootstrapped sample: ## all_lms = vector (mode='list', length=B) for( bi in 1:B ){ m = lm(fmla, data=DF[SI[bi, ], ]) all_lms[[bi]] =m } ## ## Estimate Err^{(1)} (this is not optimally coded...) ## err_parts = c() for( ii in 1:N ){ ## ## Find the set C^{(-i)} the set of bootstrap samples that dont contain the observation x_i ## Cminusi = c() for( bi in 1:B ){ if( !(ii %in% SI[bi,]) ){ Cminusi = c(Cminusi, bi) } } if( length(Cminusi)==0 ){ next } ## ## Predict x_i using each of the models in C^{(-i)} ## residuals = c() for(ci in Cminusi){ y_hat = predict(all_lms[[ci]], newdata=DF[ii,]) residuals = c(residuals, DF[ii, y_var] - y_hat) } err_parts = c(err_parts, mean(residuals^2)) } err_1 = mean(err_parts) ## ## Combine: ## estimate = 0.368 * err_bar + 0.632 * err_1 result = list(B=B, err_bar=err_bar, err_1=err_1, estimate=estimate) return(result) }