Data Science: Regression, Classification, and Model Evaluation

Verified

Added on  2021/06/16

|20
|3180
|36
Homework Assignment
AI Summary
This assignment delves into various aspects of data science, encompassing regression analysis, classification models, and model evaluation techniques. The first part focuses on regression using splines, exploring model fitting, plotting, and cross-validation to assess model performance. The second part shifts to classification, utilizing decision trees, naive Bayes, and random forests to predict a binary outcome. The third part explores model selection and evaluation using cross-validation and lasso regression to refine the model's accuracy. The assignment includes code implementations, model summaries, and performance metrics to compare and contrast different approaches, providing a comprehensive understanding of data science methodologies. This assignment provides a practical exploration of various data science techniques including regression, classification, and model evaluation, offering a comprehensive understanding of data science methodologies.
Document Page
Question 1
1-A
library(splines)
library(ISLR)
## Loading required package: splines
## Loading required package: ISLR
attach(College)
collegexpend<-quantile(Expend)
expd<-seq(from=collegexpend[1],
to=collegexpend[2])
ft<-lm(Outstate ~ bs(Expend ,knots=c(25,50,75)), data=College)
summary(ft)
##
## Call:
## lm(formula = Outstate ~ bs(Expend, knots = c(25, 50, 75)), data =
College)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11428.4 -1513.1 199.9 1722.1 5932.1
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value
Pr(>|t|)
## (Intercept) 18020 2292 7.861
1.28e-14
## bs(Expend, knots = c(25, 50, 75))1 NA NA NA
NA
## bs(Expend, knots = c(25, 50, 75))2 NA NA NA
NA
## bs(Expend, knots = c(25, 50, 75))3 -14308 2378 -6.018
2.72e-09
## bs(Expend, knots = c(25, 50, 75))4 10713 2125 5.042
5.75e-07
## bs(Expend, knots = c(25, 50, 75))5 -4173 4287 -0.973
0.331
## bs(Expend, knots = c(25, 50, 75))6 NA NA NA
NA
##
## (Intercept) ***
## bs(Expend, knots = c(25, 50, 75))1
## bs(Expend, knots = c(25, 50, 75))2
## bs(Expend, knots = c(25, 50, 75))3 ***
tabler-icon-diamond-filled.svg

Paraphrase This Document

Need a fresh take? Get an instant paraphrase of this document with our AI Paraphraser
Document Page
## bs(Expend, knots = c(25, 50, 75))4 ***
## bs(Expend, knots = c(25, 50, 75))5
## bs(Expend, knots = c(25, 50, 75))6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2540 on 773 degrees of freedom
## Multiple R-squared: 0.603, Adjusted R-squared: 0.6014
## F-statistic: 391.3 on 3 and 773 DF, p-value: < 2.2e-16
1-B
#plot
#scatterplot
plot(Expend, Outstate, col="blue", xlab="Expend", ylab="Outstate")
1-C
seed <- 500
set.seed(seed)
gendt <- function(n, beeta, sgma_eps) {
eps <- rnorm(n, 0, sgma_eps)
x <- sort(runif(n, 0, 100))
X <- cbind(1, poly(x, degree = (length(beeta) - 1), raw = TRUE))
y <- as.numeric(X %*% beeta + eps)
Document Page
return(data.frame(x = x, y = y))
}
# Fit the models
library(splines)
nrp <- 100
nndegfr <- 4:7
df <- nndegfr
beeta <- c(6, -0.1, 0.003, -2e-05)
n_trn <- 50
n_tst <- 10000
sgma_eps <- 0.5
vx <- rs <- list()
vx_test <- gendt(n_tst, beeta, sgma_eps)
for (i in 1:nrp) {
vx[[i]] <- gendt(n_trn, beeta, sgma_eps)
x <- vx[[i]][, "x"]
y <- vx[[i]][, "y"]
rs[[i]] <- apply(t(df), 2, function(ndegfr) lm(y ~ ns(x, df =
ndegfr)))
}
rs[[i]]
## [[1]]
##
## Call:
## lm(formula = y ~ ns(x, df = ndegfr))
##
## Coefficients:
## (Intercept) ns(x, df = ndegfr)1 ns(x, df = ndegfr)2
## 5.8434 0.4369 1.8098
## ns(x, df = ndegfr)3 ns(x, df = ndegfr)4
## -0.6729 1.1761
##
##
## [[2]]
##
## Call:
## lm(formula = y ~ ns(x, df = ndegfr))
##
## Coefficients:
## (Intercept) ns(x, df = ndegfr)1 ns(x, df = ndegfr)2
## 5.8853 -0.4793 0.8029
## ns(x, df = ndegfr)3 ns(x, df = ndegfr)4 ns(x, df = ndegfr)5
## 1.5886 -0.6112 1.0120
##
##
Document Page
## [[3]]
##
## Call:
## lm(formula = y ~ ns(x, df = ndegfr))
##
## Coefficients:
## (Intercept) ns(x, df = ndegfr)1 ns(x, df = ndegfr)2
## 5.92146 -0.87600 -0.03049
## ns(x, df = ndegfr)3 ns(x, df = ndegfr)4 ns(x, df = ndegfr)5
## 1.06349 1.31889 -0.58014
## ns(x, df = ndegfr)6
## 0.93388
##
##
## [[4]]
##
## Call:
## lm(formula = y ~ ns(x, df = ndegfr))
##
## Coefficients:
## (Intercept) ns(x, df = ndegfr)1 ns(x, df = ndegfr)2
## 5.8995 -1.1213 -0.2474
## ns(x, df = ndegfr)3 ns(x, df = ndegfr)4 ns(x, df = ndegfr)5
## 0.2952 1.3355 1.0273
## ns(x, df = ndegfr)6 ns(x, df = ndegfr)7
## -0.4409 0.8805
## F-statistic: 391.3 on 3 and 773 DF, p-value: < 2.2e-16
2-A
library (caret)
library(rpart.plot)
library(ISLR)
attach(OJ)
trn1 <- Purchase
indx = sample(1070, 800)
purchase_train = OJ[indx,]
purchase_test = OJ[-indx,]
trgt=Purchase~WeekofPurchase+StoreID+PriceCH+PriceMM+DiscCH+DiscMM+Spe
tabler-icon-diamond-filled.svg

Paraphrase This Document

Need a fresh take? Get an instant paraphrase of this document with our AI Paraphraser
Document Page
cialCH+SalePriceCH+PriceDiff+Store7+PctDiscMM+PctDiscCH+ListPriceDiff+
STORE
tr=rpart(trgt,data=purchase_train,method="class")
rpart.plot(tr)
2-B
library(e1071)
library(caTools)
purchasenb <- naiveBayes(Purchase ~ ., data = purchase_test)
Document Page
Purchsetestprd <- predict(purchasenb, purchase_test[ ,
names(purchase_test) != "Purchase"])
matrx<- confusionMatrix(Purchsetestprd , purchase_test$Purchase)
matrx
Confusion Matrix and Statistics
Reference
Prediction CH MM
CH 128 20
MM 39 83
Accuracy : 0.7815
95% CI : (0.7274, 0.8293)
No Information Rate : 0.6185
P-Value [Acc > NIR] : 7.41e-09
Kappa : 0.5528
Mcnemar's Test P-Value : 0.01911
Sensitivity : 0.7665
Specificity : 0.8058
Pos Pred Value : 0.8649
Neg Pred Value : 0.6803
Prevalence : 0.6185
Detection Rate : 0.4741
Detection Prevalence : 0.5481
Balanced Accuracy : 0.7861
'Positive' Class : CH
Document Page
2-C
library(caret)
data(OJ)
trncntrl <- trainControl(method="LOOCV")
mdl <- train(Purchase~., data=OJ, trControl=trncntrl, method="nb")
print(mdl)
Bayes
1070 samples
17 predictor
2 classes: 'CH', 'MM'
No pre-processing
Resampling: Leave-One-Out Cross-Validation
Summary of sample sizes: 1069, 1069, 1069, 1069, 1069, 1069, ...
Resampling results across tuning parameters:
usekernel Accuracy Kappa
FALSE 0.7570093 0.5033721
TRUE 0.6953271 0.2851493
Tuning parameter 'fL' was held constant at a value of 0
Tuning
parameter 'adjust' was held constant at a value of 1
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were fL = 0, usekernel = FALSE and
adjust
tabler-icon-diamond-filled.svg

Paraphrase This Document

Need a fresh take? Get an instant paraphrase of this document with our AI Paraphraser
Document Page
= 1.
text(trii, pretty = 0)
2-D
library("rpart")
gh<-OJ
optmald<-as.data.frame(OJ)
trii <-
rpart(Purchase~WeekofPurchase+StoreID+PriceCH+PriceMM+DiscCH+DiscMM+Sp
ecialCH+SalePriceCH+PriceDiff+Store7+PctDiscMM+PctDiscCH+ListPriceDiff
+STORE,
data=optmald,
method="class")
plot(trii)
Document Page
2-F
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(trii)
Document Page
2-F
library(randomForest)
set.seed(1)
ranfr <-randomForest(Purchase~.,data=gh, ntree=500)
plot(ranfr)
tabler-icon-diamond-filled.svg

Paraphrase This Document

Need a fresh take? Get an instant paraphrase of this document with our AI Paraphraser
Document Page
2-G
seed <- 500
set.seed(1)
gdt <- function(n, beeta, sgma_es) {
es <- rnorm(n, 0, sgma_es)
x <- sort(runif(n, 0, 100))
X <- cbind(1, poly(x, degree = (length(beeta) - 1), raw = TRUE))
Document Page
y <- as.numeric(X %*% beeta + es)
return(data.frame(x = x, y = y))
}
# Fitting models
library(splines)
n_rep <- 100
n_df <- 4:7
df <- n_df
beeta <- c(8, -0.1, 0.005, 1000)
ntrn <- 50
ntst <- 10000
sgma_es <- 0.5
vx <- rs <- list()
vx_test <- gdt(ntst, beeta, sgma_es)
for (i in 1:n_rep) {
vx[[i]] <- gdt(ntrn, beeta, sgma_es)
x <- vx[[i]][, "x"]
y <- vx[[i]][, "y"]
rs[[i]] <- apply(t(df), 2, function(degf) lm(y ~ ns(x, df =
degf)))
}
pred <- list()
meanse <- te <- matrix(NA, nrow = n_df, ncol = n_rep)
for (i in 1:n_rep) {
meanse[, i] <- sapply(rs[[i]], function(obj)
deviance(obj)/nobs(obj))
pred[[i]] <- mapply(function(obj, degf) predict(obj, data.frame(x
= vx_test$x)),
rs[[i]], df)
te[, i] <- sapply(as.list(data.frame(pred[[i]])), function(y_hat)
mean((vx_test$y -
y_hat)^2))
}
# Evaluate the average training error
averagemse <- rowMeans(meanse)
error <- rowMeans(te)
error
## [1] 2.809198e+13 1.254824e+13 7.549070e+12 4.527987e+12
chevron_up_icon
1 out of 20
circle_padding
hide_on_mobile
zoom_out_icon
[object Object]