# Preview the first six rows of the training data.
head(training_set, n = 6L)
## type year  make  model     trim mileage price
## USED 2007 Honda Accord EX-L V-6   96715 12950
## USED 2006 Honda Accord   EX V-6  102644 10998
## USED 2005 Honda Accord       LX  111639  7688
## USED 2001 Honda Accord       EX  103771  5788
## USED 2008 Honda Accord EX-L V-6   39029 16998
## USED 2012 Honda Accord EX-L V-6   40131 19888

# Scatter plot of price against mileage for the training data.
p <- ggplot(training_set, aes(mileage, price)) + geom_point()
p


# Overlay a straight-line (degree-1) lm fit on the scatter plot.
p + geom_smooth(formula = y ~ 1 + x, method = "lm")


#' Root mean squared error between predictions and observed values.
#'
#' @param pred Numeric vector of predicted values.
#' @param value Numeric vector of observed values; the same length as
#'   `pred`, or length 1.
#' @param na.rm If `TRUE`, squared errors that are `NA` (missing prediction
#'   or observation) are dropped before averaging. With the default `FALSE`
#'   any `NA` propagates to the result, as before.
#' @return A single numeric value, `sqrt(mean((pred - value)^2))`.
rmse <- function(pred, value, na.rm = FALSE) {
  n_pred <- length(pred)
  n_value <- length(value)
  # R recycles the shorter vector without complaint when the lengths divide
  # evenly, which would yield a meaningless error figure; reject that case.
  if (n_pred != n_value && n_pred != 1L && n_value != 1L) {
    stop(
      "`pred` and `value` must have the same length (or length 1).",
      call. = FALSE
    )
  }
  sqrt(mean((pred - value)^2, na.rm = na.rm))
}

# Linear fit of price on mileage, scored on the training data.
model_1 <- lm(formula = price ~ 1 + mileage, data = training_set)
rmse(
  pred = predict(model_1, newdata = training_set),
  value = training_set$price
)
## [1] 2355

# Overlay a quadratic (degree-2) lm fit on the scatter plot.
p + geom_smooth(formula = y ~ 1 + x + I(x^2), method = "lm")


# Quadratic fit of price on mileage, scored on the training data.
model_2 <- lm(
  formula = price ~ 1 + mileage + I(mileage^2),
  data = training_set
)
rmse(
  pred = predict(model_2, newdata = training_set),
  value = training_set$price
)
## [1] 2331

# Overlay a degree-5 polynomial lm fit on the scatter plot.
p + geom_smooth(
  formula = y ~ 1 + x + I(x^2) + I(x^3) + I(x^4) + I(x^5),
  method = "lm"
)


# Degree-5 polynomial fit of price on mileage, scored on the training data.
model_5 <- lm(
  formula = price ~ 1 + mileage + I(mileage^2) + I(mileage^3) +
    I(mileage^4) + I(mileage^5),
  data = training_set
)
rmse(
  pred = predict(model_5, newdata = training_set),
  value = training_set$price
)
## [1] 2142

# Training vs. test error ----

# For each model, report the RMSE on the training set followed by the RMSE
# on the test set.

# Linear model.
vapply(
  list(training_set, test_set),
  function(d) rmse(predict(model_1, d), d$price),
  numeric(1)
)
## [1] 2355 2464

# Quadratic model.
vapply(
  list(training_set, test_set),
  function(d) rmse(predict(model_2, d), d$price),
  numeric(1)
)
## [1] 2331 2337

# Degree-5 model.
vapply(
  list(training_set, test_set),
  function(d) rmse(predict(model_5, d), d$price),
  numeric(1)
)
## [1] 2142 4715

# Degree-5 lm fit over the training scatter plot, with the test
# observations added as red points.
p +
  geom_smooth(
    formula = y ~ 1 + x + I(x^2) + I(x^3) + I(x^4) + I(x^5),
    method = "lm"
  ) +
  geom_point(color = "red", data = test_set)