#
# µû¶óÇÏ¸ç ¹è¿ì´Â ÆÄÀ̽ã°ú µ¥ÀÌÅÍ°úÇÐ(»ý´ÉÃâÆÇ»ç 2020)
# 14.25 °£´ÜÇÑ È¸±Í¸ðµ¨À» ¸¸µéÀÚ, 395ÂÊ
#
import pandas as pd
import seaborn as sns # ½Ã°¢È¸¦ À§ÇÏ¿© Seaborn ¶óÀ̺귯¸®¸¦ ÀÌ¿ëÇÔ
import matplotlib.pyplot as plt
import numpy as np
life = pd.read_csv('life_expectancy.csv')
life.dropna(inplace = True)
X = life[['Alcohol', 'Percentage expenditure', 'Polio',
'BMI', 'GDP', 'Thinness 1-19 years']]
y = life['Life expectancy']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_test_predict = lin_model.predict(X_test)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_test_predict))
print('RMSE =', rmse)
plt.scatter(y_test, y_test_predict)
plt.show()