Base teórica:
https://jhklarcher.github.io/material/01_regressao_multipla.html
# Bibliotecas
library(ggplot2)
library(ggcorrplot)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Use the light ggplot2 theme for every plot that follows.
theme_set(theme_light())
Importando os dados.
# Load the built-in Longley macroeconomic dataset.
data(longley)
Verificando as relações entre as variáveis.
# Scatterplot matrix of every pair of variables in the dataset.
pairs(longley, pch = 19)
Calculando a matriz de correlação.
# Correlation matrix of the six predictor columns (Employed excluded).
cor(longley[1:6])
## GNP.deflator GNP Unemployed Armed.Forces Population
## GNP.deflator 1.0000000 0.9915892 0.6206334 0.4647442 0.9791634
## GNP 0.9915892 1.0000000 0.6042609 0.4464368 0.9910901
## Unemployed 0.6206334 0.6042609 1.0000000 -0.1774206 0.6865515
## Armed.Forces 0.4647442 0.4464368 -0.1774206 1.0000000 0.3644163
## Population 0.9791634 0.9910901 0.6865515 0.3644163 1.0000000
## Year 0.9911492 0.9952735 0.6682566 0.4172451 0.9939528
## Year
## GNP.deflator 0.9911492
## GNP 0.9952735
## Unemployed 0.6682566
## Armed.Forces 0.4172451
## Population 0.9939528
## Year 1.0000000
# Heatmap visualization of the predictor correlation matrix.
ggcorrplot(cor(longley[1:6]))
Regressão com a função lm do R.
# Full multiple linear regression: Employed on all six predictors.
modelo1 <- lm(Employed ~ GNP.deflator + GNP + Unemployed + Armed.Forces + Population + Year, data=longley)
summary(modelo1)
##
## Call:
## lm(formula = Employed ~ GNP.deflator + GNP + Unemployed + Armed.Forces +
## Population + Year, data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41011 -0.15767 -0.02816 0.10155 0.45539
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.482e+03 8.904e+02 -3.911 0.003560 **
## GNP.deflator 1.506e-02 8.492e-02 0.177 0.863141
## GNP -3.582e-02 3.349e-02 -1.070 0.312681
## Unemployed -2.020e-02 4.884e-03 -4.136 0.002535 **
## Armed.Forces -1.033e-02 2.143e-03 -4.822 0.000944 ***
## Population -5.110e-02 2.261e-01 -0.226 0.826212
## Year 1.829e+00 4.555e-01 4.016 0.003037 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3049 on 9 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9925
## F-statistic: 330.3 on 6 and 9 DF, p-value: 4.984e-10
Criando o modelo de regressão próprio.
# Design matrix X: six predictor columns plus a leading column of ones
# for the intercept; response y is the Employed column (7).
X <- data.matrix(longley[1:6])
X <- cbind(rep(1,nrow(X)),X)
y <- data.matrix(longley[7])
# Ordinary least squares via the normal equations: beta = (X'X)^-1 X'y.
beta <- solve(t(X) %*% X) %*% t(X) %*% y
# Residuals of the hand-computed fit.
resid <- (y - X %*% beta)
# Standard lm diagnostic plots, then a histogram of the residuals with
# the fitted normal density overlaid.
par(mfrow=c(2,2))
plot(modelo1)
hist(resid, breaks=6, prob = TRUE)
lines(seq(-1, 1, length=100), dnorm(seq(-1, 1, length=100), mean(resid), sd(resid)))
# Shapiro-Wilk test of residual normality.
shapiro.test(resid)
##
## Shapiro-Wilk normality test
##
## data: resid
## W = 0.9486, p-value = 0.4679
p-valor = 0.4679. A hipótese de a distribuição ser normal não é rejeitada.
# Kolmogorov-Smirnov test of the residuals against a normal distribution
# with the residuals' own mean and standard deviation.
ks.test(resid, "pnorm", mean(resid), sd(resid))
##
## One-sample Kolmogorov-Smirnov test
##
## data: resid
## D = 0.1723, p-value = 0.6672
## alternative hypothesis: two-sided
p-valor = 0.6672. A hipótese de a distribuição ser normal não é rejeitada.
# Variance inflation factors for the full model — large values signal
# severe multicollinearity among the predictors.
library(car)
## Loading required package: carData
vif(modelo1)
## GNP.deflator GNP Unemployed Armed.Forces Population Year
## 135.53244 1788.51348 33.61889 3.58893 399.15102 758.98060
# Hand-computed ANOVA for the overall significance of the regression.
# k = number of predictors, p = number of coefficients, n = sample size.
k <- ncol(X)-1
p <- k + 1
n <- nrow(X)
# Regression, error and total sums of squares.
SS_R <- t(beta) %*% t(X) %*% y - (sum(y)^2)/n
SS_E <- (t(resid) %*% resid)
SS_T <- t(y) %*% y - (sum(y)^2)/n
# Mean squares and the F statistic for H0: all slopes are zero.
MS_R <- SS_R/k
MS_E <- SS_E/(n-p)
F_0 <- MS_R/MS_E
# 5% critical value and the p-value of the F test.
F_CR <- qf(.95, df1=k, df2=(n-p))
p_val <- pf(F_0, df1=k, df2=(n-p), lower.tail=F)
\(H_0: \beta_1 = \beta_2 = \dots = \beta_k = 0\).
\(H_1: \beta_j \neq 0\), para pelo menos um \(j\).
Como \(F_0 > F_{CR}\), a hipótese nula é rejeitada.
# Coefficients of determination: plain and adjusted R-squared.
R2 <- SS_R/SS_T
R2_ajust <- 1 - (SS_E * (n-1))/(SS_T * (n-p))
# Estimated residual variance and (X'X)^-1, used for the coefficient
# standard errors.
sigma_chapeu <- SS_E/(n-p)
C <- solve(t(X) %*% X)
# Two-sided 5% critical t value.
T_CR <- qt(.025, df = (n-p), lower.tail = FALSE)
# Per-coefficient t statistics, two-sided p-values and standard errors.
T_0 <- vector()
p_val_t <- vector()
SE <- vector()
# Iterate over every column of X instead of a hard-coded count (the
# original `1:7` would silently break if the predictor set changed);
# compute the standard error once and reuse it.
for (i in seq_len(ncol(X))) {
  SE[i] <- sqrt(sigma_chapeu * C[i, i])
  T_0[i] <- abs(beta[i] / SE[i])
  p_val_t[i] <- pt(T_0[i], df = (n-p), lower.tail = FALSE) * 2
}
# Residual standard error.
RSE <- sqrt(MS_E)
# Print a summary of a multiple linear regression fitted by ordinary
# least squares, mimicking the layout of summary.lm().
#
# Arguments:
#   X: design matrix whose first column is ones (intercept).
#   y: response as a vector or one-column matrix.
#
# Prints: residual quantiles, a coefficient table (estimate, standard
# error, t statistic, two-sided p-value), the residual standard error,
# R-squared / adjusted R-squared and the overall F test.
sumario <- function(X, y) {
  # OLS estimate via the normal equations, and its residuals.
  beta <- solve(t(X) %*% X) %*% t(X) %*% y
  resid <- (y - X %*% beta)
  # Degrees of freedom and the ANOVA sums of squares.
  k <- ncol(X) - 1   # number of predictors
  p <- k + 1         # number of estimated coefficients
  n <- nrow(X)       # number of observations
  SS_R <- t(beta) %*% t(X) %*% y - (sum(y)^2)/n
  SS_E <- (t(resid) %*% resid)
  SS_T <- t(y) %*% y - (sum(y)^2)/n
  MS_R <- SS_R/k
  MS_E <- SS_E/(n - p)
  F_0 <- MS_R/MS_E                                              # F statistic
  p_val <- pf(F_0, df1 = k, df2 = (n - p), lower.tail = FALSE)  # ANOVA p-value
  R2 <- SS_R/SS_T
  R2_ajust <- 1 - (SS_E * (n - 1))/(SS_T * (n - p))
  sigma_chapeu <- SS_E/(n - p)   # residual variance estimate
  C <- solve(t(X) %*% X)
  T_0 <- vector()       # t statistics of the coefficients
  p_val_t <- vector()   # two-sided p-values of the t tests
  SE <- vector()        # standard errors of the coefficients
  for (i in seq_len(ncol(X))) {
    SE[i] <- sqrt(sigma_chapeu * C[i, i])
    T_0[i] <- abs(beta[i] / SE[i])
    p_val_t[i] <- pt(T_0[i], df = (n - p), lower.tail = FALSE) * 2
  }
  RSE <- sqrt(MS_E)     # residual standard error
  # Assemble the data frame printed as the coefficient table.
  rownames(beta)[1] <- "Intercessão"
  coeficientes <- as.data.frame(rownames(beta))
  coeficientes <- cbind(coeficientes, beta, SE, T_0, p_val_t)
  colnames(coeficientes) <- c("Variável", "Valor", "Err. Padr.", "t", "p-val")
  # Print the summary.
  cat("\nResíduos:\n")
  prmatrix(t(quantile(resid)), rowlab = rep("", nrow(t(quantile(resid)))))
  cat("\n")
  prmatrix(coeficientes, rowlab = rep("", nrow(coeficientes)), quote = FALSE)
  cat("\nErro padrão residual:", RSE, "com", (n - p), "graus de liberdade.")
  cat("\nR-quadrado:", R2, "e R-quadrado ajustado", R2_ajust, ".")
  # BUG FIX: report the F degrees of freedom in the conventional
  # "numerator (k) and denominator (n-p)" order, matching summary.lm()
  # ("F-statistic: ... on 3 and 12 DF"); the original printed them swapped.
  cat("\nEstatística-F:", F_0[1, 1], "com", (k), "e", (n - p), "GL, p-val:", p_val, ".\n")
}
# Reduced model: keep only the predictors that were significant in the
# full fit (Unemployed, Armed.Forces, Year).
modelo2 <- lm(Employed ~ Unemployed + Armed.Forces + Year, data=longley)
summary(modelo2)
##
## Call:
## lm(formula = Employed ~ Unemployed + Armed.Forces + Year, data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57285 -0.11989 0.04087 0.13979 0.75303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.797e+03 6.864e+01 -26.183 5.89e-12 ***
## Unemployed -1.470e-02 1.671e-03 -8.793 1.41e-06 ***
## Armed.Forces -7.723e-03 1.837e-03 -4.204 0.00122 **
## Year 9.564e-01 3.553e-02 26.921 4.24e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3321 on 12 degrees of freedom
## Multiple R-squared: 0.9928, Adjusted R-squared: 0.9911
## F-statistic: 555.2 on 3 and 12 DF, p-value: 3.916e-13
# Design matrix for the reduced model: intercept plus Unemployed,
# Armed.Forces and Year (columns 3, 4 and 6 of longley).
X2 <- data.matrix(cbind(longley[3:4], longley[6]))
X2 <- cbind(rep(1,nrow(X2)),X2)
y2 <- data.matrix(longley[7])
# Hand-computed OLS fit and residuals for the reduced model.
beta2 <- solve(t(X2) %*% X2) %*% t(X2) %*% y2
resid2 <- (y2 - X2 %*% beta2)
# Print the summary with the custom function.
sumario(X2, y2)
##
## Resíduos:
## 0% 25% 50% 75% 100%
## -0.5728522 -0.1198888 0.04087275 0.1397908 0.7530348
##
## Variável Valor Err. Padr. t p-val
## Intercessão -1.797221e+03 68.641552627 26.182699 5.891622e-12
## Unemployed -1.469671e-02 0.001671374 8.793189 1.410344e-06
## Armed.Forces -7.722815e-03 0.001837150 4.203694 1.223839e-03
## Year 9.563798e-01 0.035524822 26.921453 4.241120e-12
##
## Erro padrão residual: 0.3320844 com 12 graus de liberdade.
## R-quadrado: 0.992847 e R-quadrado ajustado 0.9910588 .
## Estatística-F: 555.209 com 12 e 3 GL, p-val: 3.915929e-13 .
# Influence diagnostics (Cook's distance) for the reduced model.
k <- ncol(X2) - 1
p <- k + 1
n <- nrow(X2)
# BUG FIX: the error sum of squares must use the residuals of the
# reduced model (resid2); the original reused `resid` from the full model.
SS_E <- (t(resid2) %*% resid2)
sigma_chapeu <- SS_E/(n - p)   # residual variance estimate (MSE)
# Hat matrix; its diagonal gives the leverage of each observation.
H <- X2 %*% solve(t(X2) %*% X2) %*% t(X2)
h <- diag(H)
# Standardized residuals: r_i = e_i / sqrt(MSE * (1 - h_ii)).
# BUG FIX: sigma_chapeu is already the estimated variance, so it must
# not be squared inside the square root (the original used sigma_chapeu^2).
r <- as.vector(resid2) / sqrt(as.vector(sigma_chapeu) * (1 - h))
# Cook's distance: D_i = r_i^2 * h_ii / (p * (1 - h_ii)).
D <- r^2 * h / (p * (1 - h))
# Plot D per year; use a name other than `p` so the degrees-of-freedom
# count above is not clobbered by the plot object.
plot_cook <- ggplot() +
  geom_point(aes(longley$Year, D, color = (D < 1)), size = 2) +
  scale_color_manual(values = c("Red", "darkgreen"))
ggplotly(plot_cook)
Valores acima de 1 são considerados pontos influentes.
\[ \boldsymbol{\hat{y}} = \boldsymbol{X\hat{\beta}} \] Visualizando as previsões com relação à variável ano:
# Fitted values of the hand-computed reduced model, joined with the
# year and the observed response for plotting.
y_hat <- X2 %*% beta2
y_hat <- cbind(as.data.frame(y_hat), longley$Year, longley$Employed)
colnames(y_hat) <-c("y_hat", "Year", "y")
# Observed Employed (line) versus fitted values (points) over the years.
p <- ggplot() +
geom_line(data=y_hat, aes(Year, y, group="Valor", col="Valor"), size=1) +
geom_point(data=y_hat, aes(Year, y_hat, group="Previsão", col="Previsão"), size=1.5) +
ylab("Employed") +
xlab("Year") +
theme(legend.title=element_blank()) +
scale_color_manual(values=c("Black", "Red"))
ggplotly(p)