import warnings
warnings.filterwarnings("ignore")

# Manipulação de Dados
import pandas as pd
from IPython.display import display
import numpy as np

# Visualização de Dados
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Estatística e Testes Estatísticos
import shap
from scipy import stats
from scipy.stats import kurtosis, skew, f_oneway, shapiro, levene, kruskal
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Pré-Processamento de Dados
from sklearn.model_selection import KFold, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Algoritmos de Machine Learning
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC

# Treinamento e Validação de Modelos
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

# Avaliação de Modelos
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    confusion_matrix, accuracy_score, precision_score, recall_score,
    classification_report, roc_auc_score, roc_curve, ConfusionMatrixDisplay
)
from scikitplot.metrics import plot_confusion_matrix, plot_roc

# Optional settings: two-decimal float rendering, whitegrid plot theme and
# a default figure size of 10x6.
pd.options.display.float_format = '{:.2f}'.format
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Report the library versions in use
print(f"Versão do Seaborn: {sns.__version__}")
print(f"Versão do Pandas: {pd.__version__}")

Versão do Seaborn: 0.13.2
Versão do Pandas: 2.2.3

# Load the datasets.
# All three CSV files live in the same directory; keeping the location in one
# place avoids path typos such as a doubled leading slash.
DATA_DIR = "/home/buso/mestrado/aedi-ppca/dados"

try:
    df_2010_2011 = pd.read_csv(f"{DATA_DIR}/Year 2010-2011.csv", encoding='latin1')
    df_2009_2010 = pd.read_csv(f"{DATA_DIR}/Year 2009-2010.csv", encoding='latin1')
    df_vendas = pd.read_csv(
        f"{DATA_DIR}/online_retail_II.csv",
        encoding='latin1',
        on_bad_lines='skip',
        delimiter=';',
    )
except FileNotFoundError:
    print("Arquivo não encontrado. Por favor, verifique o caminho do dataset.")
    # Every later step needs these frames; stop here instead of failing
    # further down with an unrelated NameError.
    raise
else:
    # A UTF-8 byte-order mark decoded as latin1 shows up as the three
    # characters "\xef\xbb\xbf" glued to the first header; strip it so the
    # column is addressable as plain 'Invoice'.
    df_vendas.columns = df_vendas.columns.str.replace("\xef\xbb\xbf", "", regex=False)

# Inspect the column labels of the 2009-2010 extract
df_2009_2010.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

# Inspect the column labels of the 2010-2011 extract
df_2010_2011.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

# Inspect the column labels of the semicolon-delimited combined file
df_vendas.columns

Index(['ï»¿Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

# Same inspection of df_vendas column labels, repeated
df_vendas.columns

Index(['ï»¿Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

# Stack the two yearly extracts into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_2009_2010, df_2010_2011], ignore_index=True)
df.head()

# 1. General information about columns and dtypes
print("\nInformações gerais sobre o dataset:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

Informações gerais sobre o dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB
None

# 2. Dataset dimensionality as a (rows, columns) tuple
dataset_shape = df.shape
print(f"Dimensões do dataset: {dataset_shape}")

Dimensões do dataset: (1067371, 8)

# 3. First rows of the dataset
print("\nPrimeiras 5 linhas do dataset:")
df.head(n=5)

Primeiras 5 linhas do dataset:

# Render, in this order: the first rows, the per-column count of missing
# values, and the descriptive statistics of the numeric columns.
overview_tables = (
    df.head(),
    df.isna().sum(),
    df.describe(),
)
for table in overview_tables:
    display(table)

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

# 4. Graphical exploration: one box plot per numeric measure, split by country.
for measure, chart_title in (
    ("Quantity", "Distribuição da Quantidade Vendida por País"),
    ("Price", "Distribuição do Preço Unitário por País"),
):
    plt.figure(figsize=(14, 6))
    sns.boxplot(x="Country", y=measure, data=df)
    plt.title(chart_title)
    plt.xticks(rotation=45)  # tilt the country labels on the x axis
    plt.show()

# Number of distinct product descriptions. The subscript uses double quotes
# because reusing the f-string's own quote character inside a replacement
# field is a SyntaxError on Python versions before 3.12.
print(f'Nos dados são comercializados cerca de {df["Description"].nunique()} produtos.')

Nos dados são comercializados cerca de 5698 produtos.

# Ten most frequent product descriptions
description_counts = df['Description'].value_counts()
description_counts.head(10)

Description
WHITE HANGING HEART T-LIGHT HOLDER    5918
REGENCY CAKESTAND 3 TIER              4412
JUMBO BAG RED RETROSPOT               3469
ASSORTED COLOUR BIRD ORNAMENT         2958
PARTY BUNTING                         2765
STRAWBERRY CERAMIC TRINKET BOX        2613
LUNCH BAG  BLACK SKULL.               2529
JUMBO STORAGE BAG SUKI                2434
HEART OF WICKER SMALL                 2319
JUMBO SHOPPER VINTAGE RED PAISLEY     2297
Name: count, dtype: int64

# Descriptive statistics restricted to the two numeric measures
quantity_price_summary = df[['Quantity', 'Price']].describe()
print(quantity_price_summary)

        Quantity      Price
count 1067371.00 1067371.00
mean        9.94       4.65
std       172.71     123.55
min    -80995.00  -53594.36
25%         1.00       1.25
50%         3.00       2.10
75%        10.00       4.15
max     80995.00   38970.00

# Number of rows per country, most frequent first
rows_per_country = df['Country'].value_counts()
print(rows_per_country)

Country
United Kingdom          981330
EIRE                     17866
Germany                  17624
France                   14330
Netherlands               5140
Spain                     3811
Switzerland               3189
Belgium                   3123
Portugal                  2620
Australia                 1913
Channel Islands           1664
Italy                     1534
Norway                    1455
Sweden                    1364
Cyprus                    1176
Finland                   1049
Austria                    938
Denmark                    817
Unspecified                756
Greece                     663
Japan                      582
Poland                     535
USA                        535
United Arab Emirates       500
Israel                     371
Hong Kong                  364
Singapore                  346
Malta                      299
Iceland                    253
Canada                     228
Lithuania                  189
RSA                        169
Bahrain                    126
Brazil                      94
Thailand                    76
Korea                       63
European Community          61
Lebanon                     58
West Indies                 54
Bermuda                     34
Nigeria                     32
Czech Republic              30
Saudi Arabia                10
Name: count, dtype: int64

# Mean quantity and mean unit price per country, with Country as a column
media_quantidade_preco = (
    df.groupby('Country')[['Quantity', 'Price']]
    .mean()
    .reset_index()
)

# Countries ranked by mean quantity, largest first
ranked_by_quantity = media_quantidade_preco.sort_values(by='Quantity', ascending=False)
print(ranked_by_quantity)

                 Country  Quantity  Price
10               Denmark    287.90   2.93
4                Bermuda     82.29   2.49
26           Netherlands     74.31   3.05
35                Sweden     64.42   6.39
21                 Japan     54.68   2.16
0              Australia     54.21   4.24
37              Thailand     33.58   3.00
33             Singapore     20.19  73.65
9         Czech Republic     19.73   2.94
17             Hong Kong     19.44  57.57
28                Norway     18.63  28.27
11                  EIRE     18.55   7.01
36           Switzerland     16.42   3.78
6                 Canada     16.04   4.64
24             Lithuania     15.65   2.62
19                Israel     14.78   3.57
13               Finland     13.65   4.99
39  United Arab Emirates     13.46   5.11
18               Iceland     12.99   2.55
14                France     12.91   4.69
15               Germany     12.74   3.83
7        Channel Islands     12.30   5.44
1                Austria     12.05   4.46
34                 Spain     11.85   5.39
31                   RSA     11.66  19.85
16                Greece     11.62   3.92
3                Belgium     11.25   4.73
30              Portugal     10.71   6.54
29                Poland     10.55   3.63
2                Bahrain     10.12   3.49
20                 Italy      9.98   5.53
22                 Korea      9.49   2.38
8                 Cyprus      9.09   5.39
41           Unspecified      8.88   4.52
40        United Kingdom      8.86   4.54
25                 Malta      8.33  21.99
12    European Community      8.15   4.82
23               Lebanon      7.88   6.18
32          Saudi Arabia      7.50   2.41
42           West Indies      7.31   2.27
38                   USA      6.92   3.28
5                 Brazil      5.80   2.73
27               Nigeria      1.75   3.42

# 5. Missing values: keep only the columns that have any, largest count first
print("\nResumo de valores ausentes por coluna:")
null_counts = df.isna().sum()
missing_data = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
print(missing_data)

Resumo de valores ausentes por coluna:
Customer ID    243007
Description      4382
dtype: int64

# 6. Descriptive statistics of the numeric variables, one variable per row
print("\nEstatísticas descritivas das variáveis numéricas:")
numeric_summary = df.describe()
print(numeric_summary.T)

Estatísticas descritivas das variáveis numéricas:
                 count     mean     std       min      25%      50%      75%  \
Quantity    1067371.00     9.94  172.71 -80995.00     1.00     3.00    10.00   
Price       1067371.00     4.65  123.55 -53594.36     1.25     2.10     4.15   
Customer ID  824364.00 15324.64 1697.46  12346.00 13975.00 15255.00 16797.00   

                 max  
Quantity    80995.00  
Price       38970.00  
Customer ID 18287.00

# 7. List the categorical variables (columns stored with object dtype)
object_typed = df.select_dtypes(include=['object'])
categorical_cols = object_typed.columns
print("\nVariáveis categóricas:", categorical_cols, sep="\n")

Variáveis categóricas:
Index(['Invoice', 'StockCode', 'Description', 'InvoiceDate', 'Country'], dtype='object')

# Distinct-value count per column, smallest first
distinct_per_column = df.nunique()
distinct_per_column.sort_values()

Country           43
Quantity        1057
Price           2807
StockCode       5305
Description     5698
Customer ID     5942
InvoiceDate    47635
Invoice        53628
dtype: int64

# Bias-corrected sample skewness of the unit price
price_values = df["Price"]
saleprice_skewness = stats.skew(price_values, bias=False)
print(f"Skewness de Price: {saleprice_skewness}")

Skewness de Price: -69.16473385725452

price = df["Price"]

# Horizontal box plot of the unit price to expose outliers
plt.figure(figsize=(8, 6))
sns.boxplot(x=price, color="orange")
plt.title("Boxplot de Price para Identificação de Outliers")
plt.xlabel("Preço de Venda (Price)")
plt.show()

# Summary statistics, followed by the 95th percentile
print(price.describe())
print(price.quantile([0.95]))

count   1067371.00
mean          4.65
std         123.55
min      -53594.36
25%           1.25
50%           2.10
75%           4.15
max       38970.00
Name: Price, dtype: float64
0.95   9.95
Name: Price, dtype: float64

# Working frame for the ANOVA: the grouping factor plus the two measures,
# with any row containing a missing value removed.
anova_columns = ["Country", "Price", "Quantity"]
df_anova = df.loc[:, anova_columns].dropna(how="any")

print(missing_data)

Customer ID    243007
Description      4382
dtype: int64

# Per-column count of missing values remaining in df_anova
df_anova.isnull().sum()

Country     0
Price       0
Quantity    0
dtype: int64

# Inspect the column labels of df_anova
df_anova.columns

Index(['Country', 'Price', 'Quantity'], dtype='object')

# Show df_anova
df_anova

from scipy.stats import f_oneway

# Split the frame by country once. sort=False keeps the countries in order of
# first appearance (the same order Series.unique() gives), and a single
# groupby replaces one full-length boolean mask per country per measure.
country_groups = df.groupby('Country', sort=False)

# One-way ANOVA for Quantity
quantity_groups = [values for _, values in country_groups['Quantity']]
f_statistic_quantity, p_value_quantity = f_oneway(*quantity_groups)

# One-way ANOVA for Price
price_groups = [values for _, values in country_groups['Price']]
f_statistic_price, p_value_price = f_oneway(*price_groups)

print("Resultados ANOVA para Quantidade:")
print("Estatística F:", f_statistic_quantity)
print("P-valor:", p_value_quantity)

print("\nResultados ANOVA para Preço:")
print("Estatística F:", f_statistic_price)
print("P-valor:", p_value_price)

Resultados ANOVA para Quantidade:
Estatística F: 77.46534225726585
P-valor: 0.0

Resultados ANOVA para Preço:
Estatística F: 5.893433324083531
P-valor: 6.260090985019275e-31

# One-way ANOVA tables (Type II sums of squares) with Country as the factor,
# fitted on the full frame through the statsmodels formula interface.

# Quantity
anova_quantity = ols(formula='Quantity ~ C(Country)', data=df).fit()
result_quantity = sm.stats.anova_lm(anova_quantity, typ=2)

# Price
anova_price = ols(formula='Price ~ C(Country)', data=df).fit()
result_price = sm.stats.anova_lm(anova_price, typ=2)

display(result_quantity)
display(result_price)

# Refit both one-way models on df_anova to obtain the residuals used by the
# assumption checks.
anova_price = ols('Price ~ C(Country)', data=df_anova).fit()
anova_quantity = ols('Quantity ~ C(Country)', data=df_anova).fit()

residuals_price = anova_price.resid
residuals_quantity = anova_quantity.resid


def _shapiro_pvalue(residuals, max_n=5000, seed=42):
    """Return the Shapiro-Wilk p-value for *residuals*.

    SciPy documents that the Shapiro-Wilk p-value may be inaccurate for
    N > 5000, so larger inputs are reduced to a reproducible random
    subsample of ``max_n`` observations before testing.
    """
    residuals = pd.Series(residuals)
    if len(residuals) > max_n:
        residuals = residuals.sample(n=max_n, random_state=seed)
    return shapiro(residuals)[1]


# Normality of the residuals
shapiro_tests = {
    "Resíduos de Price": _shapiro_pvalue(residuals_price),
    "Resíduos de Quantity": _shapiro_pvalue(residuals_quantity)
}

# Homogeneity of variances across countries. The frame is split once
# (first-appearance order) instead of building one boolean mask per country.
anova_by_country = df_anova.groupby("Country", sort=False)
levene_tests = {
    "Price por Country": levene(*[values for _, values in anova_by_country["Price"]]).pvalue,
    "Quantity por Country": levene(*[values for _, values in anova_by_country["Quantity"]]).pvalue
}

# The two dicts have different keys, so each column of the resulting table
# is NaN on the rows that belong to the other test.
results = pd.DataFrame({
    "Teste de Normalidade (Shapiro-Wilk p-values)": shapiro_tests,
    "Teste de Homogeneidade (Levene p-values)": levene_tests
})

# Show the results
display(results)

# Count the rows that the Price > 0 filter discards, and their share of df_anova.
rows_total = len(df_anova)
rows_kept = len(df_anova[df_anova["Price"] > 0])
rows_dropped = rows_total - rows_kept

print(f'Para o item (i) será eliminado {rows_dropped} variaveis com valores negativos.')
print(f'Representa {(rows_dropped / rows_total) * 100:.2f}% do dataset.')

Para o item (i) será eliminado 6207 variaveis com valores negativos.
Representa 0.58% do dataset.

df_anova_pressupostos = df_anova.copy()

# Keep only rows where BOTH measures are strictly positive before taking logs.
# The original filtered Price alone and then took log1p of the unfiltered
# df_anova["Quantity"]; log1p is undefined for values <= -1, so negative
# quantities became NaN/-inf in the transformed column.
positive_rows = (df_anova_pressupostos["Price"] > 0) & (df_anova_pressupostos["Quantity"] > 0)
# .copy() makes the filtered frame independent so the assignments below write
# to it directly rather than to a slice of the previous frame.
df_anova_pressupostos = df_anova_pressupostos[positive_rows].copy()

# log(1 + x) transform of both measures, computed from the filtered frame itself
df_anova_pressupostos["Price"] = np.log1p(df_anova_pressupostos["Price"])
df_anova_pressupostos["Quantity"] = np.log1p(df_anova_pressupostos["Quantity"])
df_anova_pressupostos

# Refit both one-way models on the log-transformed frame to obtain the
# residuals used by the assumption checks.
anova_price = ols('Price ~ C(Country)', data=df_anova_pressupostos).fit()
anova_quantity = ols('Quantity ~ C(Country)', data=df_anova_pressupostos).fit()

residuals_price = anova_price.resid
residuals_quantity = anova_quantity.resid


def _shapiro_pvalue(residuals, max_n=5000, seed=42):
    """Return the Shapiro-Wilk p-value for *residuals*.

    SciPy documents that the Shapiro-Wilk p-value may be inaccurate for
    N > 5000, so larger inputs are reduced to a reproducible random
    subsample of ``max_n`` observations before testing.
    """
    residuals = pd.Series(residuals)
    if len(residuals) > max_n:
        residuals = residuals.sample(n=max_n, random_state=seed)
    return shapiro(residuals)[1]


# Normality of the residuals
shapiro_tests = {
    "Resíduos de Price": _shapiro_pvalue(residuals_price),
    "Resíduos de Quantity": _shapiro_pvalue(residuals_quantity)
}

# Homogeneity of variances across countries. The frame is split once
# (first-appearance order) instead of building one boolean mask per country.
log_by_country = df_anova_pressupostos.groupby("Country", sort=False)
levene_tests = {
    "Price por Country": levene(*[values for _, values in log_by_country["Price"]]).pvalue,
    "Quantity por Country": levene(*[values for _, values in log_by_country["Quantity"]]).pvalue
}

# The two dicts have different keys, so each column of the resulting table
# is NaN on the rows that belong to the other test.
results = pd.DataFrame({
    "Teste de Normalidade com LOG (Shapiro-Wilk p-values)": shapiro_tests,
    "Teste de Homogeneidade com LOG (Levene p-values)": levene_tests
})

# Show the results
display(results)

from scipy.stats import kruskal

# Split df_anova by country once, keeping the countries in order of first
# appearance, and hand each country's values to the test as one sample.
anova_by_country = df_anova.groupby('Country', sort=False)

# Kruskal-Wallis for Price grouped by Country
price_samples = [values for _, values in anova_by_country['Price']]
kruskal_price = kruskal(*price_samples)

# Kruskal-Wallis for Quantity grouped by Country
quantity_samples = [values for _, values in anova_by_country['Quantity']]
kruskal_quantity = kruskal(*quantity_samples)

# Report the results
print("Resultados do Teste de Kruskal-Wallis:")
print(f"Price por Country: H-statistic = {kruskal_price.statistic:.4f}, p-value = {kruskal_price.pvalue:.4f}")
print(f"Quantity por Country: H-statistic = {kruskal_quantity.statistic:.4f}, p-value = {kruskal_quantity.pvalue:.4f}")

Resultados do Teste de Kruskal-Wallis:
Price por Country: H-statistic = 2061.4633, p-value = 0.0000
Quantity por Country: H-statistic = 46577.0281, p-value = 0.0000

df_anova_pressupostos_np = df_anova.copy()
# Boolean indexing returns a new frame, so the result has to be assigned.
# The original evaluated the filter and discarded it, which made this run
# repeat the test on the unfiltered data.
df_anova_pressupostos_np = df_anova_pressupostos_np[df_anova_pressupostos_np["Price"] > 0]

# Split the filtered frame by country once, in order of first appearance.
filtered_by_country = df_anova_pressupostos_np.groupby('Country', sort=False)

# Kruskal-Wallis for Price grouped by Country
kruskal_price = kruskal(*[values for _, values in filtered_by_country['Price']])

# Kruskal-Wallis for Quantity grouped by Country
kruskal_quantity = kruskal(*[values for _, values in filtered_by_country['Quantity']])

# Report the results
print("Resultados do Teste de Kruskal-Wallis:")
print(f"Price por Country: H-statistic = {kruskal_price.statistic:.4f}, p-value = {kruskal_price.pvalue:.4f}")
print(f"Quantity por Country: H-statistic = {kruskal_quantity.statistic:.4f}, p-value = {kruskal_quantity.pvalue:.4f}")

Resultados do Teste de Kruskal-Wallis:
Price por Country: H-statistic = 2061.4633, p-value = 0.0000
Quantity por Country: H-statistic = 46577.0281, p-value = 0.0000

	Invoice	StockCode	Description	Quantity	InvoiceDate	Price	Customer ID	Country
0	489434	85048	15CM CHRISTMAS GLASS BALL 20 LIGHTS	12	12/1/2009 7:45	6.95	13085.00	United Kingdom
1	489434	79323P	PINK CHERRY LIGHTS	12	12/1/2009 7:45	6.75	13085.00	United Kingdom
2	489434	79323W	WHITE CHERRY LIGHTS	12	12/1/2009 7:45	6.75	13085.00	United Kingdom
3	489434	22041	RECORD FRAME 7" SINGLE SIZE	48	12/1/2009 7:45	2.10	13085.00	United Kingdom
4	489434	21232	STRAWBERRY CERAMIC TRINKET BOX	24	12/1/2009 7:45	1.25	13085.00	United Kingdom

	Invoice	StockCode	Description	Quantity	InvoiceDate	Price	Customer ID	Country
0	489434	85048	15CM CHRISTMAS GLASS BALL 20 LIGHTS	12	12/1/2009 7:45	6.95	13085.00	United Kingdom
1	489434	79323P	PINK CHERRY LIGHTS	12	12/1/2009 7:45	6.75	13085.00	United Kingdom
2	489434	79323W	WHITE CHERRY LIGHTS	12	12/1/2009 7:45	6.75	13085.00	United Kingdom
3	489434	22041	RECORD FRAME 7" SINGLE SIZE	48	12/1/2009 7:45	2.10	13085.00	United Kingdom
4	489434	21232	STRAWBERRY CERAMIC TRINKET BOX	24	12/1/2009 7:45	1.25	13085.00	United Kingdom

	Invoice	StockCode	Description	Quantity	InvoiceDate	Price	Customer ID	Country
0	489434	85048	15CM CHRISTMAS GLASS BALL 20 LIGHTS	12	12/1/2009 7:45	6.95	13085.00	United Kingdom
1	489434	79323P	PINK CHERRY LIGHTS	12	12/1/2009 7:45	6.75	13085.00	United Kingdom
2	489434	79323W	WHITE CHERRY LIGHTS	12	12/1/2009 7:45	6.75	13085.00	United Kingdom
3	489434	22041	RECORD FRAME 7" SINGLE SIZE	48	12/1/2009 7:45	2.10	13085.00	United Kingdom
4	489434	21232	STRAWBERRY CERAMIC TRINKET BOX	24	12/1/2009 7:45	1.25	13085.00	United Kingdom

	sum_sq	df	F	PR(>F)
C(Country)	96753300.75	42.00	77.47	0.00
Residual	31740002622.33	1067328.00	NaN	NaN

	sum_sq	df	F	PR(>F)
C(Country)	3777818.17	42.00	5.89	0.00
Residual	16290007691.24	1067328.00	NaN	NaN

Programa de Pós-graduação em Computação Aplicada – PPCA (UnB)

Análise Estatística de Dados e Informações - Prova Final

Professor: João Gabriel de Moraes Souza

Aluno: Angelo Donizete Buso Júnior

Questão 3

Análise de Dados do Online Retail II Data Set from ML Repository

1. Carregar o Dataset

2. Análise Exploratória de Dados

3. Seleção de Variáveis para ANOVA

4. ANOVA: Comparação de Preços Médios

5. Validação dos Pressupostos

5.1 Normalidade

5.2 Homogeneidade das Variâncias

6. Conclusões

	Quantity	Price	Customer ID
count	1067371.00	1067371.00	824364.00
mean	9.94	4.65	15324.64
std	172.71	123.55	1697.46
min	-80995.00	-53594.36	12346.00
25%	1.00	1.25	13975.00
50%	3.00	2.10	15255.00
75%	10.00	4.15	16797.00
max	80995.00	38970.00	18287.00

	Teste de Normalidade (Shapiro-Wilk p-values)	Teste de Homogeneidade (Levene p-values)
Resíduos de Price	0.00	NaN
Resíduos de Quantity	0.00	NaN
Price por Country	NaN	0.00
Quantity por Country	NaN	0.00

	Country	Price	Quantity
0	United Kingdom	2.07	2.56
1	United Kingdom	2.05	2.56
2	United Kingdom	2.05	2.56
3	United Kingdom	1.13	3.89
4	United Kingdom	0.81	3.22
...	...	...	...
1067366	France	1.13	1.95
1067367	France	1.64	1.61
1067368	France	1.64	1.61
1067369	France	1.78	1.39
1067370	France	2.94	0.69