# Análise de Dados e Visualização**
import pandas as pd               # Manipulação de dados
import numpy as np                # Operações numéricas
import matplotlib.pyplot as plt   # Visualização gráfica
import seaborn as sns             # Visualização gráfica avançada
from scipy import stats           # Estatísticas e testes estatísticos

# Pré-Processamento de Dados**
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Normalização e codificação
from imblearn.under_sampling import RandomUnderSampler          # Balanceamento (undersampling)
from imblearn.over_sampling import SMOTE                        # Balanceamento (oversampling)
from collections import Counter                                 # Contagem de classes
from statsmodels.stats.outliers_influence import variance_inflation_factor  # VIF para multicolinearidade

# Algoritmos de Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Bibliotecas para Treinamento e Validação**
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold  # Divisão de dados e validação cruzada

# Avaliação de Modelos**
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, classification_report, roc_auc_score, roc_curve, ConfusionMatrixDisplay
)  # Métricas de avaliação
from scikitplot.metrics import plot_confusion_matrix, plot_roc  # Visualização de métricas

# Ignorar Warnings**
import warnings
warnings.filterwarnings("ignore")

# Verificação das Versões das Bibliotecas**
print("Versão do Seaborn:", sns.__version__)
print("Versão do Pandas:", pd.__version__)

Versão do Seaborn: 0.13.2
Versão do Pandas: 2.2.3

path = "/home/buso/mestrado/aedi-ppca/dados/marketing_campaign.csv"
df = pd.read_csv(path, sep="\t")
print('Os dados possuem {} linhas e {} colunas'.format(df.shape[0], df.shape[1]))
df.head()

Os dados possuem 2240 linhas e 29 colunas

def visaogeral(df, mensagem):
    print(f'{mensagem}:\n')
    print("Qtd Observações:", df.shape[0])
    print("\nQtd Atributos:", df.shape[1])
    print("\nAtributos:")
    print(df.columns.values)
    # print(df.columns.tolist())
    print("\nQtd Valores missing:", df.isnull().sum().values.sum())
    print("\nValores Unicos:")
    print(df.nunique())

visaogeral(df,'Visão Geral do dataSet')

Visão Geral do dataSet:

Qtd Observações: 2240

Qtd Atributos: 29

Atributos:
['ID' 'Year_Birth' 'Education' 'Marital_Status' 'Income' 'Kidhome'
 'Teenhome' 'Dt_Customer' 'Recency' 'MntWines' 'MntFruits'
 'MntMeatProducts' 'MntFishProducts' 'MntSweetProducts' 'MntGoldProds'
 'NumDealsPurchases' 'NumWebPurchases' 'NumCatalogPurchases'
 'NumStorePurchases' 'NumWebVisitsMonth' 'AcceptedCmp3' 'AcceptedCmp4'
 'AcceptedCmp5' 'AcceptedCmp1' 'AcceptedCmp2' 'Complain' 'Z_CostContact'
 'Z_Revenue' 'Response']

Qtd Valores missing: 24

Valores Unicos:
ID                     2240
Year_Birth               59
Education                 5
Marital_Status            8
Income                 1974
Kidhome                   3
Teenhome                  3
Dt_Customer             663
Recency                 100
MntWines                776
MntFruits               158
MntMeatProducts         558
MntFishProducts         182
MntSweetProducts        177
MntGoldProds            213
NumDealsPurchases        15
NumWebPurchases          15
NumCatalogPurchases      14
NumStorePurchases        14
NumWebVisitsMonth        16
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp5              2
AcceptedCmp1              2
AcceptedCmp2              2
Complain                  2
Z_CostContact             1
Z_Revenue                 1
Response                  2
dtype: int64

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  AcceptedCmp3         2240 non-null   int64  
 21  AcceptedCmp4         2240 non-null   int64  
 22  AcceptedCmp5         2240 non-null   int64  
 23  AcceptedCmp1         2240 non-null   int64  
 24  AcceptedCmp2         2240 non-null   int64  
 25  Complain             2240 non-null   int64  
 26  Z_CostContact        2240 non-null   int64  
 27  Z_Revenue            2240 non-null   int64  
 28  Response             2240 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB

df.head()

df.tail()

print('Os dados possuem {:.2f} linhas e {} colunas'.format(df.shape[0], df.shape[1]))

Os dados possuem 2240.00 linhas e 29 colunas

# checando índice (endereço de cada obs) e sua distribuição
df.index

RangeIndex(start=0, stop=2240, step=1)

# checando os nomes das colunas
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

df['Complain'].describe()

count    2240.000000
mean        0.009375
std         0.096391
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Complain, dtype: float64

df['Complain'].unique()

array([0, 1])

print(df['Complain'].value_counts())
print("\nObserva-se que {:.4f}% do dataSet fizeram reclamações.".format((df.groupby('Complain').size()[1] / df.shape[0])*100))
print("Enquanto que {:.4f}% dos dados não realizaram nenhuma reclamação.".format((df.groupby('Complain').size()[0] / df.shape[0])*100))

Complain
0    2219
1      21
Name: count, dtype: int64

Observa-se que 0.9375% do dataSet fizeram reclamações.
Enquanto que 99.0625% dos dados não realizaram nenhuma reclamação.

%matplotlib inline
# seto algumas caracteristicas para os plots. Padornizar Plots
sns.set_theme(style='darkgrid')
sns.set_palette("hls", 3)

balData = pd.DataFrame(df.Complain.value_counts())
balData['% total'] = round(100*balData['count']/df.shape[0], 2)

print(balData)
y_plot = sns.countplot(data=df, x='Complain', order=df.Complain.value_counts().index, hue= 'Complain')

plt.title('Distribuição das classes')
plt.ylabel('Quantidade')
plt.xlabel('Reclamações')

plt.tight_layout()
plt.show()

          count  % total
Complain                
0          2219    99.06
1            21     0.94

df_graph_customer = df[['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer','Recency','Complain']]
df_graph_customer.head()

df_graph_customer.groupby('Marital_Status').size().plot(kind='barh', color=sns.palettes.mpl_palette('magma'))
plt.gca().spines[['top', 'right',]].set_visible(False)

sns.countplot(x='Marital_Status', hue='Complain', data=df)
plt.title('Reclamações por Nível de Educação')
plt.xlabel('Nível de Educação')
plt.ylabel('Contagem')
plt.show()

df['Marital_Status'].groupby(df['Complain']).value_counts()

Complain  Marital_Status
0         Married           856
          Together          575
          Single            474
          Divorced          230
          Widow              77
          Alone               3
          Absurd              2
          YOLO                2
1         Married             8
          Single              6
          Together            5
          Divorced            2
Name: count, dtype: int64

df_graph_customer.groupby('Education').size().plot(kind='barh', color=sns.palettes.mpl_palette('magma'))
plt.gca().spines[['top', 'right',]].set_visible(False)

sns.countplot(x='Education', hue='Complain', data=df)
plt.title('Reclamações por Nível de Educação')
plt.xlabel('Nível de Educação')
plt.ylabel('Contagem')
plt.show()

df['Education'].groupby(df['Complain']).value_counts()

Complain  Education 
0         Graduation    1113
          PhD            485
          Master         368
          2n Cycle       199
          Basic           54
1         Graduation      14
          2n Cycle         4
          Master           2
          PhD              1
Name: count, dtype: int64

df['Year_Birth'].describe()

count    2240.000000
mean     1968.805804
std        11.984069
min      1893.000000
25%      1959.000000
50%      1970.000000
75%      1977.000000
max      1996.000000
Name: Year_Birth, dtype: float64

df['Age']= 2025 - df['Year_Birth']
df.head()

df['Age'].describe()

count    2240.000000
mean       56.194196
std        11.984069
min        29.000000
25%        48.000000
50%        55.000000
75%        66.000000
max       132.000000
Name: Age, dtype: float64

df['Year_Birth'].plot(kind='hist', bins=20, title='Year_Birth')
plt.gca().spines[['top', 'right',]].set_visible(False)

plt.figure(figsize=(10, 6))
sns.histplot(df['Age'], bins=20, kde=True)
plt.title('Idade')
plt.gca().spines[['top', 'right']].set_visible(False)
plt.show()

df['Dt_Customer'].describe()

count           2240
unique           663
top       31-08-2012
freq              12
Name: Dt_Customer, dtype: object

df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], dayfirst=True)

df['Dt_Customer']

0      2012-09-04
1      2014-03-08
2      2013-08-21
3      2014-02-10
4      2014-01-19
          ...    
2235   2013-06-13
2236   2014-06-10
2237   2014-01-25
2238   2014-01-24
2239   2012-10-15
Name: Dt_Customer, Length: 2240, dtype: datetime64[ns]

df['Dt_Customer'].isnull().sum()

0

current_date = pd.to_datetime(datetime.now().date())
df['Years_Since_Customer'] = round((current_date - df['Dt_Customer']).dt.days / 365, 0)

df.Years_Since_Customer.describe()

count    2240.000000
mean       11.519196
std         0.499743
min        11.000000
25%        11.000000
50%        12.000000
75%        12.000000
max        12.000000
Name: Years_Since_Customer, dtype: float64

df['Years_Since_Customer'].groupby(df['Complain']).describe()

df['Complain'].loc[df['Year_Birth'] < 1932].value_counts()

Complain
0    2
1    1
Name: count, dtype: int64

df['Age'].loc[df['Year_Birth'] >= 1932].hist(bins=20);

sns.boxplot(y='Income', data=df)
plt.title('Renda Anual dos Clientes')

plt.tight_layout()
plt.show()

df['Income'].describe()

count      2216.000000
mean      52247.251354
std       25173.076661
min        1730.000000
25%       35303.000000
50%       51381.500000
75%       68522.000000
max      666666.000000
Name: Income, dtype: float64

df['Complain'].loc[df['Income'] == 666666]

2233    0
Name: Complain, dtype: int64

sns.boxplot(y='Income', data=df[df['Income'] < 666666])
plt.title('Renda Anual dos Clientes')

plt.tight_layout()
plt.show()

df['Income'][df['Income'] < 666666].describe()

count      2215.000000
mean      51969.861400
std       21526.320095
min        1730.000000
25%       35284.000000
50%       51373.000000
75%       68487.000000
max      162397.000000
Name: Income, dtype: float64

df[['Kidhome','Teenhome']].describe()

df['Total_filhos'] = df['Kidhome'] + df['Teenhome']
df[['Kidhome','Teenhome','Total_filhos']].head()

df[['Kidhome','Teenhome','Total_filhos']].describe()

df.head()

produtos = ['MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds']

df[produtos].describe()

df.groupby('Complain')[['MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds']].mean().rename(index={0: 'nao_reclamou', 1: 'reclamou'})

for produto in produtos:
    gastou = (df[produto] > 0).sum()
    nao_gastou = (df[produto] == 0).sum()
    total = len(df)
    print(f"\n{produto}:")
    print(f"Clientes que gastaram (>0): {gastou} ({(gastou / total) * 100:.2f}%)")
    print(f"Clientes que não gastaram (0): {nao_gastou} ({(nao_gastou / total) * 100:.2f}%)")

MntFruits:
Clientes que gastaram (>0): 1840 (82.14%)
Clientes que não gastaram (0): 400 (17.86%)

MntMeatProducts:
Clientes que gastaram (>0): 2239 (99.96%)
Clientes que não gastaram (0): 1 (0.04%)

MntFishProducts:
Clientes que gastaram (>0): 1856 (82.86%)
Clientes que não gastaram (0): 384 (17.14%)

MntSweetProducts:
Clientes que gastaram (>0): 1821 (81.29%)
Clientes que não gastaram (0): 419 (18.71%)

MntGoldProds:
Clientes que gastaram (>0): 2179 (97.28%)
Clientes que não gastaram (0): 61 (2.72%)

fig, ax =  plt.subplots(1, 5, figsize=(25,8))
df[['MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds']].hist(bins=20, ax=ax)
plt.tight_layout()

df.plot(kind='scatter', x='MntMeatProducts', y='MntWines', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

sns.boxplot(x='Complain', y='MntWines', data=df, hue='Complain')
plt.title('Gasto em Vinhos vs Reclamações')
plt.xlabel('Reclamou?')
plt.ylabel('Gasto com Vinhos')
plt.xticks([0, 1], ['Não', 'Sim'])
plt.show()

df[['NumDealsPurchases','AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','Response','Complain']]

df[['NumDealsPurchases','AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','Response']].describe()

df[['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','Response']].sum()

AcceptedCmp1    144
AcceptedCmp2     30
AcceptedCmp3    163
AcceptedCmp4    167
AcceptedCmp5    163
Response        334
dtype: int64

df.groupby('Complain')[['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','Response']].sum().rename(index={0: 'nao_reclamou', 1: 'reclamou'})

promocoes = ['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5', 'Response']

for promocao in promocoes:
    aceitou = (df[promocao] == 1).sum()
    nao_aceitou = (df[promocao] == 0).sum()
    total = len(df)
    print(f"\n")
    print(f"Clientes que aceitaram a promocao {promocao}: {aceitou} ({(aceitou / total) * 100:.2f}%)")
    print(f"Clientes que não aceitaram a promocao {promocao}: {nao_aceitou} ({(nao_aceitou / total) * 100:.2f}%)")


Clientes que aceitaram a promocao AcceptedCmp1: 144 (6.43%)
Clientes que não aceitaram a promocao AcceptedCmp1: 2096 (93.57%)


Clientes que aceitaram a promocao AcceptedCmp2: 30 (1.34%)
Clientes que não aceitaram a promocao AcceptedCmp2: 2210 (98.66%)


Clientes que aceitaram a promocao AcceptedCmp3: 163 (7.28%)
Clientes que não aceitaram a promocao AcceptedCmp3: 2077 (92.72%)


Clientes que aceitaram a promocao AcceptedCmp4: 167 (7.46%)
Clientes que não aceitaram a promocao AcceptedCmp4: 2073 (92.54%)


Clientes que aceitaram a promocao AcceptedCmp5: 163 (7.28%)
Clientes que não aceitaram a promocao AcceptedCmp5: 2077 (92.72%)


Clientes que aceitaram a promocao Response: 334 (14.91%)
Clientes que não aceitaram a promocao Response: 1906 (85.09%)

df[['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']].sum()

NumWebPurchases         9150
NumCatalogPurchases     5963
NumStorePurchases      12970
NumWebVisitsMonth      11909
dtype: int64

df[['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']].groupby(df['Complain']).sum()

print(f'A quantidade de visitas no site da empresa foi de: {df["NumWebVisitsMonth"].sum()}')

A quantidade de visitas no site da empresa foi de: 11909

df['NumWebVisitsMonth'].groupby(df['Complain']).sum()

Complain
0    11787
1      122
Name: NumWebVisitsMonth, dtype: int64

df.head()

df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Years_Since_Customer', 'Total_filhos'],
      dtype='object')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   ID                    2240 non-null   int64         
 1   Year_Birth            2240 non-null   int64         
 2   Education             2240 non-null   object        
 3   Marital_Status        2240 non-null   object        
 4   Income                2216 non-null   float64       
 5   Kidhome               2240 non-null   int64         
 6   Teenhome              2240 non-null   int64         
 7   Dt_Customer           2240 non-null   datetime64[ns]
 8   Recency               2240 non-null   int64         
 9   MntWines              2240 non-null   int64         
 10  MntFruits             2240 non-null   int64         
 11  MntMeatProducts       2240 non-null   int64         
 12  MntFishProducts       2240 non-null   int64         
 13  MntSweetProducts      2240 non-null   int64         
 14  MntGoldProds          2240 non-null   int64         
 15  NumDealsPurchases     2240 non-null   int64         
 16  NumWebPurchases       2240 non-null   int64         
 17  NumCatalogPurchases   2240 non-null   int64         
 18  NumStorePurchases     2240 non-null   int64         
 19  NumWebVisitsMonth     2240 non-null   int64         
 20  AcceptedCmp3          2240 non-null   int64         
 21  AcceptedCmp4          2240 non-null   int64         
 22  AcceptedCmp5          2240 non-null   int64         
 23  AcceptedCmp1          2240 non-null   int64         
 24  AcceptedCmp2          2240 non-null   int64         
 25  Complain              2240 non-null   int64         
 26  Z_CostContact         2240 non-null   int64         
 27  Z_Revenue             2240 non-null   int64         
 28  Response              2240 non-null   int64         
 29  Age                   2240 non-null   int64         
 30  Years_Since_Customer  2240 non-null   float64       
 31  Total_filhos          2240 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(27), object(2)
memory usage: 560.1+ KB

df_proc = df.copy()
df_proc.head()

df_proc.shape

(2240, 32)

df_proc = df_proc[df_proc['Year_Birth'] >= 1932]

df_proc.shape

(2237, 32)

df_proc.drop(['ID', 'Z_CostContact', 'Z_Revenue', 'Year_Birth', 'Kidhome', 'Teenhome' , 'Dt_Customer'], axis= 1, inplace= True)
df_proc.head()

df_proc.shape

(2237, 25)

df_proc = df_proc[df_proc['Income'] != 666666]

df_proc.shape

(2236, 25)

from sklearn.impute import SimpleImputer
import numpy as np

# Criando o imputador com estratégia de mediana
imp_num = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True)

# Aplicando a imputação nos dados
dados_imputados = imp_num.fit_transform(dados)

df_proc.isnull().sum()

Education                0
Marital_Status           0
Income                  24
Recency                  0
MntWines                 0
MntFruits                0
MntMeatProducts          0
MntFishProducts          0
MntSweetProducts         0
MntGoldProds             0
NumDealsPurchases        0
NumWebPurchases          0
NumCatalogPurchases      0
NumStorePurchases        0
NumWebVisitsMonth        0
AcceptedCmp3             0
AcceptedCmp4             0
AcceptedCmp5             0
AcceptedCmp1             0
AcceptedCmp2             0
Complain                 0
Response                 0
Age                      0
Years_Since_Customer     0
Total_filhos             0
dtype: int64

df_proc.shape

(2236, 25)

df['Complain'].loc[df['Income'].isnull()].value_counts()

Complain
0    24
Name: count, dtype: int64

df_proc = df_proc.dropna(subset=['Income'])
df_proc.shape

(2212, 25)

df_proc.isnull().sum()

Education               0
Marital_Status          0
Income                  0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Response                0
Age                     0
Years_Since_Customer    0
Total_filhos            0
dtype: int64

label_encoder = LabelEncoder()

df_proc['Education'] = label_encoder.fit_transform(df_proc['Education'])
df_proc['Marital_Status'] = label_encoder.fit_transform(df_proc['Marital_Status'])

print(df_proc[['Education', 'Marital_Status']].head())

   Education  Marital_Status
0          2               4
1          2               4
2          2               5
3          2               5
4          4               3

df_proc.shape

(2212, 25)

df_proc.head()

df_proc.columns

Index(['Education', 'Marital_Status', 'Income', 'Recency', 'MntWines',
       'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Years_Since_Customer',
       'Total_filhos'],
      dtype='object')

X = df_proc.drop(columns= 'Complain', axis= 1)
y = df_proc.Complain

y.shape

(2212,)

print('DataSet original com {} atributos e {} observações'.format(df.shape[1], df.shape[0]))
print('As Variáveis Explicatórias possuem {} atributos e {} observações'.format(X.shape[1], X.shape[0]))
print('A Variável Alvo possuem {} observações'.format(y.shape))

DataSet original com 32 atributos e 2240 observações
As Variáveis Explicatórias possuem 24 atributos e 2212 observações
A Variável Alvo possuem (2212,) observações

X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, stratify=y, shuffle=True)

print('tamanho total dos dados são {}, para treino temos {} e teste são {}'.format(df_proc.shape[0], len(X_treino), len(X_teste)))

tamanho total dos dados são 2212, para treino temos 1659 e teste são 553

# a. instanciar as classes dos métodos pretendidos
smote = SMOTE(random_state=42)

# b. usam método fit_sample() para usar a técnica
X_resampled, y_resampled = smote.fit_resample(X_treino, y_treino)

# Verificar a nova distribuição das classes
print("Distribuição antes SMOTE:", Counter(y_treino))
print("Distribuição após SMOTE:", Counter(y_resampled))

Distribuição antes SMOTE: Counter({0: 1644, 1: 15})
Distribuição após SMOTE: Counter({0: 1644, 1: 1644})

X_resampled.isnull().sum()

Education               0
Marital_Status          0
Income                  0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Response                0
Age                     0
Years_Since_Customer    0
Total_filhos            0
dtype: int64

# X_resampled = X_resampled.replace([np.inf, -np.inf], np.nan)
# X_resampled = X_resampled.dropna()

vif_data = pd.DataFrame()
vif_data['Feature'] = X_resampled.columns
vif_data['VIF'] = [variance_inflation_factor(X_resampled.values, i) for i in range(X_resampled.shape[1])]

print("Valores de VIF para cada variável:")
print(vif_data)

Valores de VIF para cada variável:
                 Feature         VIF
0              Education    8.068871
1         Marital_Status   16.821575
2                 Income   40.592027
3                Recency    5.972475
4               MntWines    7.080768
5              MntFruits    4.332739
6        MntMeatProducts    5.726406
7        MntFishProducts    4.348500
8       MntSweetProducts    2.740258
9           MntGoldProds    2.438703
10     NumDealsPurchases    4.340176
11       NumWebPurchases    8.099010
12   NumCatalogPurchases    7.083520
13     NumStorePurchases   12.706127
14     NumWebVisitsMonth   20.584871
15          AcceptedCmp3    1.230701
16          AcceptedCmp4    1.464427
17          AcceptedCmp5    1.762467
18          AcceptedCmp1    1.483911
19          AcceptedCmp2    1.196208
20              Response    1.580624
21                   Age   37.142746
22  Years_Since_Customer  102.437615
23          Total_filhos    5.287758

# (a) instancia as classes dos algoritmos

algoritmos = [
              RandomForestClassifier(),
              DecisionTreeClassifier(),
              SGDClassifier(),
              SVC(),
              LogisticRegression(),
              #XGBClassifier(),
              LGBMClassifier()
              ]

# (b) com dados balanceado com a técinca de OverSampling
score_modelo = []
for classificador in algoritmos:
    score = cross_val_score(classificador,
                            X_resampled,
                            y_resampled,
                            scoring='recall')
    score_modelo.append(
        {'Estimator':classificador.__class__.__name__, 'Score médio':score.mean()}
        )
    print('Usando dados bancalanceados com a técnica de OverSampling')
pd.DataFrame(score_modelo)

Usando dados bancalanceados com a técnica de OverSampling
Usando dados bancalanceados com a técnica de OverSampling
Usando dados bancalanceados com a técnica de OverSampling
Usando dados bancalanceados com a técnica de OverSampling
Usando dados bancalanceados com a técnica de OverSampling
[LightGBM] [Info] Number of positive: 1315, number of negative: 1315
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1826
[LightGBM] [Info] Number of data points in the train set: 2630, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1315, number of negative: 1315
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 2630, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1315, number of negative: 1315
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1835
[LightGBM] [Info] Number of data points in the train set: 2630, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1316, number of negative: 1315
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1833
[LightGBM] [Info] Number of data points in the train set: 2631, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500190 -> initscore=0.000760
[LightGBM] [Info] Start training from score 0.000760
[LightGBM] [Info] Number of positive: 1315, number of negative: 1316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1824
[LightGBM] [Info] Number of data points in the train set: 2631, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499810 -> initscore=-0.000760
[LightGBM] [Info] Start training from score -0.000760
Usando dados bancalanceados com a técnica de OverSampling

# Instanciar o modelo
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=1,               
    min_samples_split=2,       
    min_samples_leaf=1,        
    max_features='sqrt',       
    random_state=42,
    class_weight='balanced'
)

# Treinar o modelo
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(class_weight='balanced', max_depth=1, random_state=42)

RandomForestClassifier(class_weight='balanced', max_depth=1, random_state=42)

# Extrair as importâncias das variáveis
feature_importances = rf.feature_importances_

# Criar um DataFrame com os nomes das variáveis e suas importâncias
coef_df = pd.DataFrame({'Feature': X_resampled.columns, 'Importance': feature_importances})

# Ordenar pelas importâncias
print(coef_df.sort_values(by='Importance', ascending=False))

                 Feature  Importance
0              Education        0.17
4               MntWines        0.12
1         Marital_Status        0.11
12   NumCatalogPurchases        0.11
13     NumStorePurchases        0.10
20              Response        0.09
6        MntMeatProducts        0.08
2                 Income        0.05
22  Years_Since_Customer        0.04
3                Recency        0.03
7        MntFishProducts        0.03
17          AcceptedCmp5        0.02
8       MntSweetProducts        0.02
21                   Age        0.02
16          AcceptedCmp4        0.01
19          AcceptedCmp2        0.00
18          AcceptedCmp1        0.00
14     NumWebVisitsMonth        0.00
15          AcceptedCmp3        0.00
5              MntFruits        0.00
11       NumWebPurchases        0.00
10     NumDealsPurchases        0.00
9           MntGoldProds        0.00
23          Total_filhos        0.00

# Probabilidades previstas para a classe positiva (1)
y_prob = rf.predict_proba(X_teste)[:, 1] 

# Curva ROC
fpr, tpr, thresholds = roc_curve(y_teste, y_prob)
auc = roc_auc_score(y_teste, y_prob)

# Plot da Curva ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}', color='blue')
plt.plot([0, 1], [0, 1], 'k--', label='Modelo Aleatório')
plt.title('Curva ROC - Random Forest')
plt.xlabel('Taxa de Falsos Positivos (FPR)')
plt.ylabel('Taxa de Verdadeiros Positivos (TPR)')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# Previsões de classe
y_pred = rf.predict(X_teste)

# Relatório de classificação
print("\nRelatório de Classificação:\n", classification_report(y_teste, y_pred))

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.99      0.71      0.83       548
           1       0.01      0.40      0.02         5

    accuracy                           0.70       553
   macro avg       0.50      0.55      0.42       553
weighted avg       0.98      0.70      0.82       553

# Matriz de confusão
cm = confusion_matrix(y_teste, y_pred)
tn, fp, fn, tp = cm.ravel()

# Especificidade
specificity = tn / (tn + fp)
print(f"Especificidade: {specificity:.2f}")

# Exibição da Matriz de Confusão
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf.classes_)
disp.plot(cmap='Blues', values_format='d')
plt.title('Matriz de Confusão - Random Forest')
plt.grid(False)
plt.show()

Especificidade: 0.71

	MntFruits	MntMeatProducts	MntFishProducts	MntSweetProducts	MntGoldProds
Complain
nao_reclamou	26.322217	167.465525	37.636773	27.153222	44.178459
reclamou	24.190476	112.476190	25.761905	17.523810	27.476190

Programa de Pós-graduação em Computação Aplicada – PPCA (UnB)

Análise Estatística de Dados e Informações

Professor: João Gabriel de Moraes Souza

Aluno: Angelo Donizete Buso Júnior

1. Compreensão do Problema de Negócio (Business Understanding)¶

1.1 Dicionário de Dados¶

Informações sobre os atributos:¶

2. Coleta Dados¶

2.1 Carga dados¶

3. Análise Exploratória¶

3.1 Visão Geral dataSet¶

3.1.1 Variável Alvo¶

3.1.1.1 Amplitude¶

3.1.1.2 Balanceamento dos Dados¶

3.1.1.3 Plots Variável Alvo¶

3.1.2 Grupo Cliente¶

3.1.3 Grupo Produto¶

3.1.5 Grupo Local¶

4. Pré-Processamento dos Dados¶

Preparando os Dados para Machine Learning¶

4.1 Transformação de Variáveis/Atributos¶

4.1.1 Excluindo atributos sem relevancia¶

4.2 Feature Engineer - I¶

4.2.1 Imputação de Valores Ausentes¶

4.2.2 Codificação de Variáveis Categóricas (Dummies)¶

4.3 Split dados¶

4.3.1 Estático¶

4.4 Feature Engineer¶

4.4.1.Balanceamento de classes¶

4.4.1.1 Usando Oversampling¶

4.5 Avaliação de Multicolinearidade¶

5. Seleção de Algoritmos¶

5.1 Algoritmos de Classificação¶

Conclusões¶

	ID	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	...	NumWebVisitsMonth	Z_CostContact	Z_Revenue	Response
0	5524	1957	Graduation	Single	58138.0	0	0	04-09-2012	58	635	...	7	3	11	1
1	2174	1954	Graduation	Single	46344.0	1	1	08-03-2014	38	11	...	5	3	11	0
2	4141	1965	Graduation	Together	71613.0	0	0	21-08-2013	26	426	...	4	3	11	0
3	6182	1984	Graduation	Together	26646.0	1	0	10-02-2014	26	11	...	6	3	11	0
4	5324	1981	PhD	Married	58293.0	1	0	19-01-2014	94	173	...	5	3	11	0

	ID	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	...	NumWebVisitsMonth	AcceptedCmp4	AcceptedCmp1	Z_CostContact	Z_Revenue	Response
2235	10870	1967	Graduation	Married	61223.0	0	1	13-06-2013	46	709	...	5	0	0	3	11	0
2236	4001	1946	PhD	Together	64014.0	2	1	10-06-2014	56	406	...	7	0	1	3	11	0
2237	7270	1981	Graduation	Divorced	56981.0	0	0	25-01-2014	91	908	...	6	1	0	3	11	0
2238	8235	1956	Master	Together	69245.0	0	1	24-01-2014	8	428	...	3	0	0	3	11	0
2239	9405	1954	PhD	Married	52869.0	1	1	15-10-2012	40	84	...	7	0	0	3	11	1

	count	mean	std	min	25%	50%	75%	max
Complain
0	2219.0	11.518702	0.499763	11.0	11.0	12.0	12.0	12.0
1	21.0	11.571429	0.507093	11.0	11.0	12.0	12.0	12.0

	Kidhome	Teenhome
count	2240.000000	2240.000000
mean	0.444196	0.506250
std	0.538398	0.544538
min	0.000000	0.000000
25%	0.000000	0.000000
50%	0.000000	0.000000
75%	1.000000	1.000000
max	2.000000	2.000000

	Estimator	Score médio
0	RandomForestClassifier	0.995135
1	DecisionTreeClassifier	0.992097
2	SGDClassifier	0.400000
3	SVC	0.630163
4	LogisticRegression	0.794414
5	LGBMClassifier	0.995137

	MntFruits	MntMeatProducts	MntFishProducts	MntSweetProducts	MntGoldProds
count	2240.000000	2240.000000	2240.000000	2240.000000	2240.000000
mean	26.302232	166.950000	37.525446	27.062946	44.021875
std	39.773434	225.715373	54.628979	41.280498	52.167439
min	0.000000	0.000000	0.000000	0.000000	0.000000
25%	1.000000	16.000000	3.000000	1.000000	9.000000
50%	8.000000	67.000000	12.000000	8.000000	24.000000
75%	33.000000	232.000000	50.000000	33.000000	56.000000
max	199.000000	1725.000000	259.000000	263.000000	362.000000

	NumWebPurchases	NumCatalogPurchases	NumStorePurchases
Complain
0	9074	5920	12860
1	76	43	110

Programa de Pós-graduação em Computação Aplicada – PPCA (UnB)

Análise Estatística de Dados e Informações

Professor: João Gabriel de Moraes Souza

Aluno: Angelo Donizete Buso Júnior

1. Compreensão do Problema de Negócio (Business Understanding)¶

1.1 Dicionário de Dados¶

Informações sobre os atributos:¶

2. Coleta Dados¶

2.1 Carga dados¶

3. Análise Exploratória¶

3.1 Visão Geral dataSet¶

3.1.1 Variável Alvo¶

3.1.1.1 Amplitude¶

3.1.1.2 Balanceamento dos Dados¶

3.1.1.3 Plots Variável Alvo¶

3.1.2 Grupo Cliente¶

3.1.3 Grupo Produto¶

3.1.4 Grupo Promoção¶

3.1.5 Grupo Local¶

4. Pré-Processamento dos Dados¶

Preparando os Dados para Machine Learning¶

4.1 Transformação de Variáveis/Atributos¶

4.1.1 Excluindo atributos sem relevancia¶

4.2 Feature Engineer - I¶

4.2.1 Imputação de Valores Ausentes¶

4.2.2 Codificação de Variáveis Categóricas (Dummies)¶

4.3 Split dados¶

4.3.1 Estático¶

4.4 Feature Engineer¶

4.4.1.Balanceamento de classes¶

4.4.1.1 Usando Oversampling¶

4.5 Avaliação de Multicolinearidade¶

5. Seleção de Algoritmos¶

5.1 Algoritmos de Classificação¶

Conclusões¶