import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from scipy.stats import kurtosis, skew
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter may also hide deprecation warnings raised
# by the plotting and modelling calls below — consider narrowing it.
warnings.filterwarnings("ignore")
# IPython magic (notebook-only syntax): render matplotlib figures inline.
%matplotlib inline

# Show the CRISP-DM process diagram hosted on Wikimedia Commons.
from IPython.display import Image
Image(url="https://upload.wikimedia.org/wikipedia/commons/thumb/b/b9/CRISP-DM_Process_Diagram.png/800px-CRISP-DM_Process_Diagram.png")

# Location of the Ames Housing CSV.
# NOTE(review): absolute, machine-specific path — the notebook only runs
# where this exact file exists.
path = "/home/wsl/projetos/mestrado/AEDI/dados/AmesHousing.csv"

# Load the dataset and preview its first rows.
data = pd.read_csv(path)
data.head()

def visaogeral(df, messagem):
    """Print a quick structural overview of a DataFrame.

    Args:
        df: DataFrame to summarise.
        messagem: Title printed on the first line of the report.

    The report lists the number of rows and columns, the column names, the
    total number of missing cells and the 40 columns with the fewest distinct
    values (low cardinality hints at categorical attributes).
    """
    n_rows, n_cols = df.shape
    missing_total = df.isnull().sum().values.sum()
    unique_counts = df.nunique().sort_values(ascending=True).head(40)

    print(f'{messagem}:\n')
    print("Qtd Observações:", n_rows)
    print("\nQtd Atributos:", n_cols)
    print("\nAtributos:")
    print(df.columns.tolist())
    print("\nQtd Valores missing:", missing_total)
    print("\nValores Unicos: indicativo de valores categóricos")
    print(unique_counts)

visaogeral(data,'Visão Geral dataSet treino')

Visão Geral dataSet treino:

Qtd Observações: 2930

Qtd Atributos: 82

Atributos:
['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Pool QC', 'Fence', 'Misc Feature', 'Misc Val', 'Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'SalePrice']

Qtd Valores missing: 15749

Valores Unicos: indicativo de valores categóricos
Street            2
Alley             2
Central Air       2
Land Slope        3
Bsmt Half Bath    3
Half Bath         3
Garage Finish     3
Utilities         3
Paved Drive       3
Lot Shape         4
Bsmt Exposure     4
Kitchen AbvGr     4
Mas Vnr Type      4
Land Contour      4
Exter Qual        4
Bsmt Full Bath    4
Pool QC           4
Fence             4
Heating QC        5
Electrical        5
Bsmt Cond         5
Bldg Type         5
Fireplace Qu      5
Kitchen Qual      5
Lot Config        5
Bsmt Qual         5
Misc Feature      5
Yr Sold           5
Garage Qual       5
Full Bath         5
Fireplaces        5
Exter Cond        5
Garage Cond       5
Garage Cars       6
Heating           6
BsmtFin Type 1    6
Roof Style        6
Foundation        6
Garage Type       6
BsmtFin Type 2    6
dtype: int64

data.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF',
       'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Pool QC',
       'Fence', 'Misc Feature', 'Misc Val', 'Mo Sold', 'Yr Sold', 'Sale Type',
       'Sale Condition', 'SalePrice'],
      dtype='object')

# Create the new HouseAge variable.
# NOTE(review): the reference year 2024 is hard-coded; the dataset also has a
# 'Yr Sold' column, which would give the age at the time of sale — confirm
# which definition is intended.
data["HouseAge"] = 2024 - data["Year Built"]

# Explanatory variables kept for the analysis and the model.
selected_features = [
    "Gr Liv Area", "Garage Cars", "Total Bsmt SF", "Lot Area", "Overall Qual",
    "Kitchen Qual", "Overall Cond", "Fireplaces", "Wood Deck SF", "Bedroom AbvGr", "TotRms AbvGrd", "HouseAge",
    "Full Bath", "Bldg Type"
]

# Feature matrix (explanatory variables) and target vector (sale price).
X = data[selected_features]
y = data["SalePrice"]

def missing_values_table(df):
    """Summarise the missing values of each column of a DataFrame.

    Prints how many columns the DataFrame has and how many of them contain
    missing values, then returns the per-column detail.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns 'Missing Values'
        (count) and '% of Total Values' (percentage, rounded to one decimal),
        restricted to columns with at least one missing value and sorted by
        percentage in descending order.
    """
    # Count once and reuse for both the absolute and the relative figures.
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)

    table = pd.concat([missing_count, missing_percent], axis=1)
    table = table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Filter on the count, not on the percentage: with zero rows the
    # percentage is 0/0 = NaN, and `NaN != 0` would wrongly keep every column.
    table = (
        table[table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(
        f"Seu dataframe tem {df.shape[1]} colunas.\n"
        f"Há {table.shape[0]} colunas que possuem valores ausentes."
    )

    return table

y.isnull().sum()

np.int64(0)

missing_values_table(X)

Seu dataframe tem 14 colunas.
Há 2 colunas que possuem valores ausentes.

# Discard observations with any missing explanatory value, then keep only the
# targets whose index labels survived.
X = X.dropna()
y = y.loc[X.index]

y.describe()

count      2928.000000
mean     180841.033811
std       79889.904415
min       12789.000000
25%      129500.000000
50%      160000.000000
75%      213500.000000
max      755000.000000
Name: SalePrice, dtype: float64

y.quantile([0.80,0.96,0.97])

0.80    230000.00
0.96    354920.00
0.97    377440.06
Name: SalePrice, dtype: float64

# Plot the target values sorted in ascending order.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.sort(y), label="SalePrice", color='purple')
ax.set_title("Amplitude de SalePrice")
ax.set_xlabel("Índice Ordenado")
ax.set_ylabel("Preço de Venda")
ax.legend()
plt.show()

# Box plot of the target.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y=y, color='cyan', ax=ax)
ax.set_title("Boxplot de SalePrice")
ax.set_xlabel("SalePrice")
plt.show()

# QQ-plot to check the normality of the target.
# qqplot() opens a figure of its own unless it receives an Axes, which left
# the sized figure created beforehand empty; drawing on an explicit Axes keeps
# the requested 10x6 size and avoids the stray blank figure.
fig, ax = plt.subplots(figsize=(10, 6))
sm.qqplot(y, line='s', fit=True, ax=ax)
ax.set_title("QQ-Plot de SalePrice")
plt.show()

<Figure size 1000x600 with 0 Axes>

# Additional descriptive statistics of the target: asymmetry and tail weight.
# NOTE(review): scipy's kurtosis() uses Fisher's definition by default (excess
# kurtosis, 0 for a normal distribution) — confirm that is the intended scale.
print("Skewness (Assimetria):", skew(y))
print("Kurtosis (Curtose):", kurtosis(y))

Skewness (Assimetria): 1.7426236420109267
Kurtosis (Curtose): 5.107524847568088

# Histogram of the target (30 bins) with a kernel density estimate on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y, kde=True, bins=30, color='blue', ax=ax)
ax.set_title("Histograma de SalePrice")
ax.set_xlabel("Preço de Venda")
ax.set_ylabel("Frequência")
plt.show()

X.head()

# Line plot of SalePrice against each selected feature, one figure per feature.
# `palette` was dropped: without a `hue` variable seaborn ignores it (and may
# warn about it), so it never affected the plot.
for feature in selected_features:
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=X[feature], y=y)
    plt.title(f"Relação entre {feature} e SalePrice")
    plt.xlabel(feature)
    plt.ylabel("SalePrice")
    plt.show()

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2928 entries, 0 to 2929
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Gr Liv Area    2928 non-null   int64  
 1   Garage Cars    2928 non-null   float64
 2   Total Bsmt SF  2928 non-null   float64
 3   Lot Area       2928 non-null   int64  
 4   Overall Qual   2928 non-null   int64  
 5   Kitchen Qual   2928 non-null   object 
 6   Overall Cond   2928 non-null   int64  
 7   Fireplaces     2928 non-null   int64  
 8   Wood Deck SF   2928 non-null   int64  
 9   Bedroom AbvGr  2928 non-null   int64  
 10  TotRms AbvGrd  2928 non-null   int64  
 11  HouseAge       2928 non-null   int64  
 12  Full Bath      2928 non-null   int64  
 13  Bldg Type      2928 non-null   object 
dtypes: float64(2), int64(10), object(2)
memory usage: 343.1+ KB

# Explanatory variables treated as numeric in the exploratory analysis.
numerical_features = [
    "Gr Liv Area",
    "Garage Cars",
    "Total Bsmt SF",
    "Lot Area",
    "Fireplaces",
    "Wood Deck SF",
    "Bedroom AbvGr",
    "TotRms AbvGrd",
    "HouseAge",
    "Full Bath",
]

# One figure per numeric feature: the raw points plus a fitted linear trend.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=X[feature], y=y, color='orange', label='Dados de dispersão', ax=ax
    )
    sns.regplot(
        x=X[feature], y=y, scatter=False, color='blue',
        label='Regressão Linear', ax=ax
    )
    ax.set_title(f"Relação entre {feature} e SalePrice")
    ax.set_xlabel(feature)
    ax.set_ylabel("SalePrice")
    ax.legend()
    plt.show()

X[numerical_features].corr()

# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
correlation_matrix = X[numerical_features].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Matriz de Correlação - Variáveis Numéricas")
plt.show()

# Variance Inflation Factor (VIF) of each numeric explanatory variable.
# variance_inflation_factor() regresses column i on the remaining columns of
# the matrix exactly as given. Without an intercept column those auxiliary
# regressions are uncentred and the VIFs come out inflated, so a constant is
# added to the design matrix and then skipped (index 0) in the report.
vif_design = sm.add_constant(
    X[numerical_features], has_constant="add"
).astype(float).values

vif_data = pd.DataFrame()
vif_data["Feature"] = numerical_features
vif_data["VIF"] = [
    variance_inflation_factor(vif_design, i)
    for i in range(1, vif_design.shape[1])
]

print("\nVariance Inflation Factor (VIF):")
print(vif_data)

Variance Inflation Factor (VIF):
         Feature        VIF
0    Gr Liv Area  40.391979
1    Garage Cars  10.312104
2  Total Bsmt SF   8.848557
3       Lot Area   3.078425
4     Fireplaces   2.575616
5   Wood Deck SF   1.751656
6  Bedroom AbvGr  25.504382
7  TotRms AbvGrd  64.253768
8       HouseAge   4.314102
9      Full Bath  16.892883

# Explanatory variables treated as categorical (ordinal scores and labels).
categorical_features = ["Overall Qual", "Kitchen Qual", "Overall Cond", "Bldg Type"]

# Distribution of SalePrice per category, one figure per feature.
# The categories are read from X rather than from the raw `data` frame so that
# x and y come from the same cleaned set of rows.
# NOTE(review): `palette` is passed without `hue`; recent seaborn releases
# deprecate that combination — confirm against the installed version.
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=X[feature], y=y, palette="viridis")
    plt.title(f"Relação entre {feature} e SalePrice")
    plt.xlabel(feature)
    plt.ylabel("SalePrice")
    plt.show()

X

# Text-valued categorical columns to be one-hot encoded.
categorical_features_dummies = ['Kitchen Qual','Bldg Type']

# Replace each listed column by indicator columns; drop_first=True omits the
# first level of each variable, which becomes the reference category.
X = pd.get_dummies(X, columns=categorical_features_dummies, drop_first=True)

X

# Rows with missing values were already dropped (and y realigned) before the
# dummy encoding, so no further cleaning is done here.

# Hold out 20% of the observations for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

print(X.dtypes)

Gr Liv Area           int64
Garage Cars         float64
Total Bsmt SF       float64
Lot Area              int64
Overall Qual          int64
Overall Cond          int64
Fireplaces            int64
Wood Deck SF          int64
Bedroom AbvGr         int64
TotRms AbvGrd         int64
HouseAge              int64
Full Bath             int64
Kitchen Qual_Fa        bool
Kitchen Qual_Gd        bool
Kitchen Qual_Po        bool
Kitchen Qual_TA        bool
Bldg Type_2fmCon       bool
Bldg Type_Duplex       bool
Bldg Type_Twnhs        bool
Bldg Type_TwnhsE       bool
dtype: object

model.fit(X_train, y_train)

LinearRegression()

LinearRegression()

# Pair every feature name with its fitted coefficient; keep the intercept too.
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_})
intercept = model.intercept_

# Predictions for both splits.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Residuals are computed on the test split only (observed minus predicted).
residuals = y_test - y_pred_test

print("Coeficientes do modelo:")
print(coefficients)

Coeficientes do modelo:
             Feature   Coefficient
0        Gr Liv Area  5.284067e+01
1        Garage Cars  9.268205e+03
2      Total Bsmt SF  2.267557e+01
3           Lot Area  4.781547e-01
4       Overall Qual  1.386263e+04
5       Overall Cond  5.613652e+03
6         Fireplaces  6.895560e+03
7       Wood Deck SF  2.425044e+01
8      Bedroom AbvGr -7.058272e+03
9      TotRms AbvGrd  3.912949e+02
10          HouseAge -5.117012e+02
11         Full Bath -7.757054e+02
12   Kitchen Qual_Fa -4.978184e+04
13   Kitchen Qual_Gd -5.009302e+04
14   Kitchen Qual_Po  1.818989e-11
15   Kitchen Qual_TA -5.701537e+04
16  Bldg Type_2fmCon -5.484163e+03
17  Bldg Type_Duplex -1.590632e+04
18   Bldg Type_Twnhs -2.308489e+04
19  Bldg Type_TwnhsE -1.510283e+04

# Root mean squared error (square root of the MSE) for each split.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Lower is better. The message now names the metric that is actually
# computed (RMSE); it previously called it MSE.
print(f'O RMSE do modelo é: {train_rmse} para os dados de treino e {test_rmse} para os dados de teste!')

O MSE do modelo é: 30689.400546900368 para os dados de treino e 37548.64604586386 para os dados de teste!

# Mean absolute error for each split.
# The MAE is already expressed in the unit of the target; the square root that
# was applied here belongs to the RMSE only and made the reported value wrong.
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

# Lower is better.
print(f'O MAE do modelo é: {train_mae} para os dados de treino e {test_mae} para os dados de teste!')

O MAE do modelo é: 138.84418172919277 para os dados de treino e 144.42266773369224 para os dados de teste!

# Coefficient of determination (R²) for the training and the test split.
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
print(f'O R2 do modelo é: {train_r2} para os dados de treino e {test_r2} para os dados de teste!')

O R2 do modelo é: 0.8408284871126857 para os dados de treino e 0.8285043729939088 para os dados de teste!

# Consolidated training-split metrics.
# The first value is the root mean squared error (train_rmse), so it is
# labelled RMSE instead of MSE.
print("O RMSE do modelo é:", train_rmse)
print("O MAE do modelo é:", train_mae)
print("O R² do modelo é:", train_r2)

O MSE do modelo é: 30689.400546900368
O MAE do modelo é: 138.84418172919277
O R² do modelo é: 0.8408284871126857

# Test-split residuals against predicted values, with a zero reference line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred_test, y=residuals, color="blue", ax=ax)
ax.axhline(0, linestyle="--", color="red")
ax.set_title("Resíduos vs. Valores Preditos")
ax.set_xlabel("Valores Preditos")
ax.set_ylabel("Resíduos")
plt.show()

# Histogram of the residuals with a kernel density estimate on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, color="purple", ax=ax)
ax.set_title("Distribuição dos Resíduos")
ax.set_xlabel("Resíduos")
ax.set_ylabel("Frequência")
plt.show()

# QQ-plot of the residuals; qqplot() returns the figure it draws on.
qq_fig = sm.qqplot(residuals, line="s")
qq_fig.axes[0].set_title("QQ-Plot dos Resíduos")
plt.show()

# Mean and population standard deviation (ddof=0) of the residuals.
residual_mean = residuals.mean()
residual_std = residuals.std(ddof=0)
print("Média dos resíduos:", residual_mean)
print("Desvio padrão dos resíduos:", residual_std)
Média dos resíduos: 1820.1445778434927
Desvio padrão dos resíduos: 37504.504977313256

	Order	PID	MS SubClass	MS Zoning	Lot Frontage	Lot Area	Street	Alley	Lot Shape	Land Contour	...	Pool QC	Fence	Misc Feature	Misc Val	Mo Sold	Yr Sold	Sale Type	Sale Condition	SalePrice
0	1	526301100	20	RL	141.0	31770	Pave	NaN	IR1	Lvl	...	NaN	NaN	NaN	0	5	2010	WD	Normal	215000
1	2	526350040	20	RH	80.0	11622	Pave	NaN	Reg	Lvl	...	NaN	MnPrv	NaN	0	6	2010	WD	Normal	105000
2	3	526351010	20	RL	81.0	14267	Pave	NaN	IR1	Lvl	...	NaN	NaN	Gar2	12500	6	2010	WD	Normal	172000
3	4	526353030	20	RL	93.0	11160	Pave	NaN	Reg	Lvl	...	NaN	NaN	NaN	0	4	2010	WD	Normal	244000
4	5	527105010	60	RL	74.0	13830	Pave	NaN	IR1	Lvl	...	NaN	MnPrv	NaN	0	3	2010	WD	Normal	189900

	Gr Liv Area	Garage Cars	Total Bsmt SF	Lot Area	Fireplaces	Wood Deck SF	Bedroom AbvGr	TotRms AbvGrd	HouseAge	Full Bath
Gr Liv Area	1.000000	0.488621	0.444819	0.285517	0.455029	0.249830	0.516607	0.807800	-0.241949	0.630107
Garage Cars	0.488621	1.000000	0.437608	0.179368	0.320990	0.241035	0.091036	0.355230	-0.537312	0.477997
Total Bsmt SF	0.444819	0.437608	1.000000	0.253577	0.333007	0.230049	0.051969	0.280721	-0.407479	0.325135
Lot Area	0.285517	0.179368	0.253577	1.000000	0.256865	0.157141	0.136421	0.216403	-0.023044	0.127326
Fireplaces	0.455029	0.320990	0.333007	0.256865	1.000000	0.228134	0.076737	0.302503	-0.170046	0.229850
Wood Deck SF	0.249830	0.241035	0.230049	0.157141	0.228134	1.000000	0.029422	0.154494	-0.229263	0.179224
Bedroom AbvGr	0.516607	0.091036	0.051969	0.136421	0.076737	0.029422	1.000000	0.672529	0.055334	0.359251
TotRms AbvGrd	0.807800	0.355230	0.280721	0.216403	0.302503	0.154494	0.672529	1.000000	-0.111429	0.528506
HouseAge	-0.241949	-0.537312	-0.407479	-0.023044	-0.170046	-0.229263	0.055334	-0.111429	1.000000	-0.469936
Full Bath	0.630107	0.477997	0.325135	0.127326	0.229850	0.179224	0.359251	0.528506	-0.469936	1.000000

	Gr Liv Area	Garage Cars	Total Bsmt SF	Lot Area	Overall Qual	Kitchen Qual	Overall Cond	Fireplaces	Wood Deck SF	Bedroom AbvGr	TotRms AbvGrd	HouseAge	Full Bath	Bldg Type
0	1656	2.0	1080.0	31770	6	TA	5	2	210	3	7	64	1	1Fam
1	896	1.0	882.0	11622	5	TA	6	0	140	2	5	63	1	1Fam
2	1329	1.0	1329.0	14267	6	Gd	6	0	393	3	6	66	1	1Fam
3	2110	2.0	2110.0	11160	7	Ex	5	2	0	3	8	56	2	1Fam
4	1629	2.0	928.0	13830	5	TA	5	1	212	3	6	27	2	1Fam
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2925	1003	2.0	1003.0	7937	6	TA	6	0	120	3	6	40	1	1Fam
2926	902	2.0	864.0	8885	5	TA	5	0	164	2	5	41	1	1Fam
2927	970	0.0	912.0	10441	5	TA	5	0	80	3	6	32	1	1Fam
2928	1389	2.0	1389.0	10010	5	TA	5	1	240	2	6	50	1	1Fam
2929	2000	3.0	996.0	9627	7	TA	5	1	190	3	9	31	2	1Fam

Programa de Pós-graduação em Computação Aplicada – PPCA (UnB)

Análise Estatística de Dados e Informações

Professor: João Gabriel de Moraes Souza

Aluno: Angelo Donizete Buso Júnior

Índice ¶

1. Compreensão Problema de Negócio ¶

1.1 Dicionário dados ¶

2. Coleta Dados ¶

2.1 Carga dados ¶

3. Análise Exploratória ¶

3.1 Visão Geral dos dados ¶

3.1.1 Variáveis Explanatórias e Variável Dependente ¶

3.1.2 Missing Values ¶

3.1.3 Análise Variável Dependente ¶

3.1.3.1 Amplitude ¶

3.1.3.2 Plot - boxplot ¶

3.1.3.3 Plots Variável Dependente ¶

3.1.3.4 Distribuição dos Dados - Skewness/Kurtosis ¶

3.1.3.5 Relações Variável Dependente ¶

3.1.4 Análise das Variáveis Explanatórias ¶

3.1.4.1 Classificação das variáveis por tipo de dados ¶

3.1.4.2 Explanatórias Numéricas ¶

3.1.4.2.1 Plot - scatterplots ¶

3.1.4.2.2 Correlação Atributos ¶

3.1.4.2.3 Plot - Matriz de Correlação ¶

3.1.4.2.4 Avaliando a Multicolinearidade ¶

3.1.4.3 Explanatórias Categóricas ¶

4. Pré-Processamento dados ¶

4.1 Feature Engineering ¶

4.1.1 Dummies ¶

4.1.2 Imputação Missing ¶

4.2 Split dados ¶

4.2.1 Estático - nível linha ¶

5. Seleção Algoritmos ¶

5.1 Algoritmos Regressores ¶

5.1.1 Regressão Linear ¶

5.2 Performance para Regressores ¶

5.2.1 MSE ¶

5.2.2 MAE ¶

5.2.3 R² ¶

Consolidando as principais técnicas de performance para regressores ¶

5.3 Distribuição dos resíduos ¶

6. Conclusões ¶

7. Bibliografia ¶