# Preview the first six rows of the training data.
head(training_set, n = 6L)
## USED | 2007 | Honda | Accord | EX-L V-6 |  96715 | 12950
## USED | 2006 | Honda | Accord | EX V-6   | 102644 | 10998
## USED | 2005 | Honda | Accord | LX       | 111639 |  7688
## USED | 2001 | Honda | Accord | EX       | 103771 |  5788
## USED | 2008 | Honda | Accord | EX-L V-6 |  39029 | 16998
## USED | 2012 | Honda | Accord | EX-L V-6 |  40131 | 19888
# Scatter plot of price against mileage for the training data.
p <- ggplot(training_set, aes(x = mileage, y = price)) +
  geom_point()
p

# The same scatter plot with a least-squares straight line overlaid
# (the intercept is implicit in the formula).
p +
  geom_smooth(method = "lm", formula = y ~ x)
#' Root-mean-square error between predictions and observed values
#'
#' @param pred Predicted values.
#' @param value Observed values. Must have the same length as `pred`, unless
#'   one of the two has length 1 (a scalar is compared against every element
#'   of the other).
#' @param na.rm If `TRUE`, squared errors that are `NA` are dropped before
#'   averaging. The default `FALSE` keeps the original behaviour, where any
#'   missing squared error makes the result `NA`.
#' @return A single number: `sqrt(mean((pred - value)^2))`.
rmse <- function(pred, value, na.rm = FALSE) {
  n_pred <- length(pred)
  n_value <- length(value)
  # R would silently recycle the shorter vector, which produces a meaningless
  # error figure; only scalar recycling is allowed through.
  if (n_pred != n_value && n_pred != 1L && n_value != 1L) {
    stop(
      "`pred` and `value` must have the same length (got ",
      n_pred, " and ", n_value, ").",
      call. = FALSE
    )
  }
  squared_error <- (pred - value)^2
  sqrt(mean(squared_error, na.rm = na.rm))
}
# Model 1: straight-line fit of price on mileage (intercept implicit),
# followed by its RMSE on the data it was fitted to.
model_1 <- lm(formula = price ~ mileage, data = training_set)
rmse(
  predict(model_1, newdata = training_set),
  training_set$price
)
## [1] 2355
# Overlay a quadratic least-squares curve on the training scatter plot.
p +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))

# Model 2: the same quadratic fitted with lm(), then its training RMSE.
model_2 <- lm(
  formula = price ~ mileage + I(mileage^2),
  data = training_set
)
rmse(
  predict(model_2, newdata = training_set),
  training_set$price
)
## [1] 2331
# Overlay a degree-5 polynomial least-squares curve on the training scatter
# plot.
p +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5)
  )

# Model 5: the same degree-5 polynomial fitted with lm(), then its training
# RMSE.
model_5 <- lm(
  formula = price ~ mileage + I(mileage^2) + I(mileage^3) +
    I(mileage^4) + I(mileage^5),
  data = training_set
)
rmse(
  predict(model_5, newdata = training_set),
  training_set$price
)
## [1] 2142
# Training vs. test error ----
# Model 1: RMSE on the training set, then on the test set.
vapply(
  list(training_set, test_set),
  function(dat) rmse(predict(model_1, newdata = dat), dat$price),
  numeric(1)
)
## [1] 2355 2464
# Model 2: RMSE on the training set, then on the test set.
vapply(
  list(training_set, test_set),
  function(dat) rmse(predict(model_2, newdata = dat), dat$price),
  numeric(1)
)
## [1] 2331 2337
# Model 5: RMSE on the training set, then on the test set.
vapply(
  list(training_set, test_set),
  function(dat) rmse(predict(model_5, newdata = dat), dat$price),
  numeric(1)
)
## [1] 2142 4715
# Degree-5 curve fitted to the training data in `p`, with the test-set
# observations added as red points.
p +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5)
  ) +
  geom_point(data = test_set, colour = "red")