Study the data pool and form a customer segmentation to support business strategy formulation by various business functions, especially the Marketing and Finance departments
IIT Kanpur
Mohammad Asad 22358019
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from scipy.stats import boxcox
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA  # updated import path; statsmodels.tsa.arima_model has been removed
data = pd.read_csv("ecomdata.csv", encoding="ISO-8859-1")
data.shape
(541909, 8)
data.head(3)
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Description 540455 non-null object 3 Quantity 541909 non-null int64 4 InvoiceDate 541909 non-null object 5 UnitPrice 541909 non-null float64 6 CustomerID 406829 non-null float64 7 Country 541909 non-null object dtypes: float64(2), int64(1), object(5) memory usage: 33.1+ MB
We can see that the data file has about 542,000 entries (roughly 5.4 lakh) and 8 variables with the properties listed above, with one row per transaction line. Take a look at the InvoiceNo and the CustomerID of the first entries. Here we can see that one customer with ID 17850 from the United Kingdom placed a single order with InvoiceNo 536365. The customer ordered several products with different stock codes, descriptions, unit prices and quantities. In addition, we can see that the InvoiceDate is the same for these products.
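As a quick sanity check, here is a minimal sketch that pulls up the single invoice 536365 seen above and confirms that one InvoiceNo groups several product lines for one customer and one timestamp:
# Inspect one invoice to verify that a single InvoiceNo bundles several product
# lines for one customer and one InvoiceDate.
data.loc[data.InvoiceNo == "536365", ["StockCode", "Description", "Quantity", "UnitPrice", "CustomerID", "InvoiceDate"]]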
What percentage of values is missing for each feature?
data_size=data.shape
missing_percentage = data.isnull().sum() / data.shape[0] * 100
missing_percentage
InvoiceNo 0.000000 StockCode 0.000000 Description 0.268311 Quantity 0.000000 InvoiceDate 0.000000 UnitPrice 0.000000 CustomerID 24.926694 Country 0.000000 dtype: float64
sns.heatmap(data.isnull(),yticklabels=False,cbar=False)
<Axes: >
Almost 25% of the entries have an unknown customer (CustomerID). That's surprising. In addition, about 0.27% of the descriptions are missing. This looks dirty; let's gain a further impression by considering some examples.
Missing descriptions
data.loc[data.Description.isnull()].count()
InvoiceNo 1454 StockCode 1454 Description 0 Quantity 1454 InvoiceDate 1454 UnitPrice 1454 CustomerID 0 Country 1454 dtype: int64
How often is the customer ID missing as well?
data.loc[data.Description.isnull(),:].count()
InvoiceNo 1454 StockCode 1454 Description 0 Quantity 1454 InvoiceDate 1454 UnitPrice 1454 CustomerID 0 Country 1454 dtype: int64
And the unit price?
data.loc[(data.Description.isnull()) & (data.UnitPrice<=0),:]
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
622 | 536414 | 22139 | NaN | 56 | 12/1/2010 11:52 | 0.0 | NaN | United Kingdom |
1970 | 536545 | 21134 | NaN | 1 | 12/1/2010 14:32 | 0.0 | NaN | United Kingdom |
1971 | 536546 | 22145 | NaN | 1 | 12/1/2010 14:33 | 0.0 | NaN | United Kingdom |
1972 | 536547 | 37509 | NaN | 1 | 12/1/2010 14:33 | 0.0 | NaN | United Kingdom |
1987 | 536549 | 85226A | NaN | 1 | 12/1/2010 14:34 | 0.0 | NaN | United Kingdom |
... | ... | ... | ... | ... | ... | ... | ... | ... |
535322 | 581199 | 84581 | NaN | -2 | 12/7/2011 18:26 | 0.0 | NaN | United Kingdom |
535326 | 581203 | 23406 | NaN | 15 | 12/7/2011 18:31 | 0.0 | NaN | United Kingdom |
535332 | 581209 | 21620 | NaN | 6 | 12/7/2011 18:35 | 0.0 | NaN | United Kingdom |
536981 | 581234 | 72817 | NaN | 27 | 12/8/2011 10:33 | 0.0 | NaN | United Kingdom |
538554 | 581408 | 85175 | NaN | 20 | 12/8/2011 14:06 | 0.0 | NaN | United Kingdom |
1454 rows × 8 columns
In cases of missing descriptions we always miss the customer ID, and the unit price is zero as well. Why does the retailer record such entries without any further description? It seems that there is no sophisticated procedure for dealing with and recording these transactions. This is already a hint that we can expect strange entries in our data and that it may be difficult to detect them!
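A minimal check backing up this observation (a sketch of ours, not part of the original flow): among the rows with a missing description, count how many also miss the CustomerID and how many have a non-zero unit price.
# Rows with a missing Description: how many also lack a CustomerID, and how many
# have a non-zero UnitPrice? Per the tables above, the answers should be 1454 and 0.
missing_desc = data.loc[data.Description.isnull()]
print(missing_desc.CustomerID.isnull().sum(), (missing_desc.UnitPrice != 0).sum())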
Missing Customer IDs
data.loc[data.CustomerID.isnull(), ["UnitPrice", "Quantity"]].describe()
UnitPrice | Quantity | |
---|---|---|
count | 135080.000000 | 135080.000000 |
mean | 8.076577 | 1.995573 |
std | 151.900816 | 66.696153 |
min | -11062.060000 | -9600.000000 |
25% | 1.630000 | 1.000000 |
50% | 3.290000 | 1.000000 |
75% | 5.450000 | 3.000000 |
max | 17836.460000 | 5568.000000 |
That's bad as well. The prices and quantities of entries without a CustomerID can show extreme outliers. While preparing the customer segmentation model we need to drop such values, especially the negative values for price and quantity, or create a special Faulty/Outliers segment for these cases. Perhaps one can purchase as a guest, but then it would be cleaner to plug in a special ID indicating a guest. OK, next one: do we have hidden NaN values in the descriptions? To find out, let's create a new feature that holds the descriptions in lowercase:
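As a hedged sketch of the guest-purchase idea (the notion of a guest flag is our assumption, nothing the retailer records), we can at least measure how large that group is without modifying the data:
# Share of transactions that look like guest purchases, i.e. rows with no CustomerID.
guest_share = data.CustomerID.isnull().mean() * 100
print("Share of transactions without a CustomerID: {:.1f}%".format(guest_share))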
Hidden missing descriptions
Can we find "nan"-Strings?
data.loc[data.Description.isnull()==False, "lowercase_descriptions"] = data.loc[
data.Description.isnull()==False,"Description"
].apply(lambda l: l.lower())
data.lowercase_descriptions.dropna().apply(
lambda l: np.where("nan" in l, True, False)
).value_counts()
lowercase_descriptions False 539724 True 731 Name: count, dtype: int64
Can we find empty ""-strings?
data.lowercase_descriptions.dropna().apply(
lambda l: np.where("" == l, True, False)
).value_counts()
lowercase_descriptions False 540455 Name: count, dtype: int64
We found additional hidden missing values that contain the string "nan" instead of a proper NaN. Let's transform them to NaN:
data.loc[data.lowercase_descriptions.isnull()==False, "lowercase_descriptions"] = data.loc[
data.lowercase_descriptions.isnull()==False, "lowercase_descriptions"
].apply(lambda l: np.where("nan" in l, None, l))
data.loc[:,["Description","lowercase_descriptions"]].count()
Description 540455 lowercase_descriptions 539724 dtype: int64
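Note that the substring test above also matches legitimate descriptions that merely contain the letters "nan" inside a longer word. If that matters, a stricter exact-match rule could be used; here is a sketch of that alternative (our suggestion, not applied, so the counts above stay as they are):
# Stricter alternative to the substring rule: flag a description as a hidden
# missing value only if it is exactly the string "nan" after stripping whitespace.
strict_hidden_nan = data.Description.str.strip().str.lower().eq("nan")
strict_hidden_nan.sum()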
How long is the period in days?
data["InvoiceDate"] = pd.to_datetime(data.InvoiceDate, cache=True)
data.InvoiceDate.max() - data.InvoiceDate.min()
Timedelta('373 days 04:24:00')
print("Datafile starts with timepoint {}".format(data.InvoiceDate.min()))
print("Datafile ends with timepoint {}".format(data.InvoiceDate.max()))
Datafile starts with timepoint 2010-12-01 08:26:00 Datafile ends with timepoint 2011-12-09 12:50:00
How many different invoice numbers do we have?
data.InvoiceNo.nunique()
25900
In the data description we can find that a cancelled transaction has an InvoiceNo starting with "C". Let's create a feature to easily filter out these cases:
data["IsCancelled"]=np.where(data.InvoiceNo.apply(lambda l: l[0]=="C"), True, False)
data.IsCancelled.value_counts() / data.shape[0] * 100
IsCancelled False 98.286059 True 1.713941 Name: count, dtype: float64
1.7 % of all entries are cancellations.
data.loc[data.IsCancelled==True].describe()
Quantity | InvoiceDate | UnitPrice | CustomerID | |
---|---|---|---|---|
count | 9288.000000 | 9288 | 9288.000000 | 8905.000000 |
mean | -29.885228 | 2011-06-26 03:42:05.943152640 | 48.393661 | 14991.667266 |
min | -80995.000000 | 2010-12-01 09:41:00 | 0.010000 | 12346.000000 |
25% | -6.000000 | 2011-03-21 16:15:00 | 1.450000 | 13510.000000 |
50% | -2.000000 | 2011-07-07 17:33:30 | 2.950000 | 14895.000000 |
75% | -1.000000 | 2011-10-06 20:36:00 | 5.950000 | 16393.000000 |
max | -1.000000 | 2011-12-09 11:58:00 | 38970.000000 | 18282.000000 |
std | 1145.786965 | NaN | 666.600430 | 1706.772357 |
All cancellations have negative quantities but positive, non-zero unit prices. From this data alone we cannot easily understand why a customer made a return, and such cases are hard to predict since there can be several hidden reasons behind a cancellation. Let's drop them:
data = data.loc[data.IsCancelled==False].copy()
data = data.drop("IsCancelled", axis=1)
data.shape
(532621, 9)
We have 532,621 remaining entries, down from the initial 541,909, after removing the cancellations.
Let us handle all the issues identified above by dropping the offending values, or by creating a special category wherever our assumptions are not met.
We won't drop rows with a missing CustomerID, and we won't rely on the raw ID for the segmentation; the drop below is only shown as a preview (it is not assigned back, so the column stays in data and is later imputed with 0).
data.drop("CustomerID",axis=1)
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | Country | lowercase_descriptions | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | United Kingdom | white hanging heart t-light holder |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | United Kingdom | white metal lantern |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | United Kingdom | cream cupid hearts coat hanger |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | United Kingdom | knitted union flag hot water bottle |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | United Kingdom | red woolly hottie white heart. |
... | ... | ... | ... | ... | ... | ... | ... | ... |
541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 2011-12-09 12:50:00 | 0.85 | France | pack of 20 spaceboy napkins |
541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 2011-12-09 12:50:00 | 2.10 | France | children's apron dolly girl |
541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 2011-12-09 12:50:00 | 4.15 | France | childrens cutlery dolly girl |
541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 2011-12-09 12:50:00 | 4.15 | France | childrens cutlery circus parade |
541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 2011-12-09 12:50:00 | 4.95 | France | baking set 9 piece retrospot |
532621 rows × 8 columns
Negative Values?
Removing entries with non-positive quantity
data.loc[data.Quantity<=0,:].count()
InvoiceNo 1336 StockCode 1336 Description 474 Quantity 1336 InvoiceDate 1336 UnitPrice 1336 CustomerID 0 Country 1336 lowercase_descriptions 474 dtype: int64
data = data.loc[data.Quantity>0].copy()
data.shape
(531285, 9)
Removing entries with non-positive unit price
data.loc[data.UnitPrice<=0,:].count()
InvoiceNo 1181 StockCode 1181 Description 589 Quantity 1181 InvoiceDate 1181 UnitPrice 1181 CustomerID 40 Country 1181 lowercase_descriptions 589 dtype: int64
data = data.loc[data.UnitPrice>0].copy()
data.shape
(530104, 9)
Just to be sure: are there any missing values left? (CustomerID and lowercase_descriptions still contain NaNs; both are dealt with before clustering.)
data.info()
<class 'pandas.core.frame.DataFrame'> Index: 530104 entries, 0 to 541908 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 530104 non-null object 1 StockCode 530104 non-null object 2 Description 530104 non-null object 3 Quantity 530104 non-null int64 4 InvoiceDate 530104 non-null datetime64[ns] 5 UnitPrice 530104 non-null float64 6 CustomerID 397884 non-null float64 7 Country 530104 non-null object 8 lowercase_descriptions 529382 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(5) memory usage: 40.4+ MB
How many unique stockcodes do we have?
data.StockCode.nunique()
3922
Which codes are most common?
stockcode_counts = data.StockCode.value_counts().sort_values(ascending=False)
fig, ax = plt.subplots(2,1,figsize=(20,15))
sns.barplot(x=stockcode_counts.iloc[0:20].index,
y=stockcode_counts.iloc[0:20].values,
alpha=0.8,
ax = ax[0], palette="Oranges_r")
ax[0].set_ylabel("Counts")
ax[0].set_xlabel("Stockcode")
ax[0].set_title("Which stockcodes are most common?");
sns.distplot(np.round(stockcode_counts/data.shape[0]*100,2),
kde=False,
bins=20,
ax=ax[1], color="Orange")
ax[1].set_title("How seldom are stockcodes?")
ax[1].set_xlabel("% of data with this stockcode")
ax[1].set_ylabel("Frequency");
Even though the majority of entries have a stockcode consisting of 5 numeric characters, there are other patterns as well: the length varies between 1 and 12 characters, and some stockcodes contain no numeric characters at all.
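A small sketch backing up this observation about the stockcode format (our addition): look at the spread of stockcode lengths and at the codes that contain no digit at all.
# Stockcode lengths and the codes without any numeric character.
code_lengths = data.StockCode.apply(len)
print(code_lengths.min(), code_lengths.max())
print(data.loc[~data.StockCode.str.contains(r"\d"), "StockCode"].unique())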
How many unique descriptions do we have?
data.Description.nunique()
4026
And which are most common?
description_counts = data.Description.value_counts().sort_values(ascending=False).iloc[0:30]
plt.figure(figsize=(20,5))
sns.barplot(x=description_counts.index, y=description_counts.values, palette="Purples_r")
plt.ylabel("Counts")
plt.title("Which product descriptions are most common?");
plt.xticks(rotation=90);
OK, we can see that some descriptions correspond to a similar product type; note the multiple occurrences of lunch bags. We often have colour information in the description as well. Furthermore, the most common descriptions confirm that the retailer sells many different kinds of products, and all descriptions seem to consist of uppercase characters. Now let's do some additional analysis on the descriptions by counting the number of lowercase characters.
def count_lower_chars(l):
return sum(1 for c in l if c.islower())
#data["DescriptionLength"] = data.Description.apply(lambda l: len(l))
data["LowCharsInDescription"] = data.Description.apply(lambda l: count_lower_chars(l))
Almost all descriptions contain no lowercase characters, but there are a few exceptions:
lowchar_counts = data.loc[data.LowCharsInDescription > 0].Description.value_counts()
plt.figure(figsize=(15,3))
sns.barplot(x=lowchar_counts.index, y=lowchar_counts.values, palette="Purples_r")
plt.xticks(rotation=90);
We still have more unique descriptions (4,026) than stockcodes (3,922), so let's find out why they differ.
data.groupby("StockCode").Description.nunique().sort_values(ascending=False).iloc[0:10]
StockCode 23236 4 23196 4 17107D 3 23366 3 23203 3 23370 3 23126 3 23244 3 23413 3 23131 3 Name: Description, dtype: int64
Wow, we still have stockcodes with multiple descriptions. Let's look at an example:
data.loc[data.StockCode == "23244"].Description.value_counts()
Description ROUND STORAGE TIN VINTAGE LEAF 103 STORAGE TIN VINTAGE LEAF 7 CANNISTER VINTAGE LEAF DESIGN 2 Name: count, dtype: int64
OK, browsing through the cases we can see that the same stockcode is sometimes described slightly differently due to missing or changed words or typos. Nonetheless the entries look fine and we can continue.
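If a single canonical description per stockcode were needed later (for reporting, say), one option is to map every stockcode to its most frequent description. This is sketched below under the assumption that the modal description is the correct one; it is not applied to data, so the rest of the notebook is unaffected.
# Map each stockcode to its most frequent description (not written back to `data`).
canonical = data.groupby("StockCode").Description.agg(lambda s: s.value_counts().index[0])
canonical.loc["23244"]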
data.UnitPrice.describe()
count 530104.000000 mean 3.907625 std 35.915681 min 0.001000 25% 1.250000 50% 2.080000 75% 4.130000 max 13541.330000 Name: UnitPrice, dtype: float64
np.quantile(data.UnitPrice, 0.999)
165.0
99.9% of entries have a unit price below 165.
data = data.loc[(data.UnitPrice > 0.001) & (data.UnitPrice < 200)].copy()
We could keep the outlier prices as they are, but here we keep only entries with a unit price above 0.001 and below 200.
OK, now the most important variable, Quantity. Let's take a look at its distribution:
data.Quantity.describe()
count 529710.000000 mean 10.549021 std 155.581721 min 1.000000 25% 1.000000 50% 3.000000 75% 10.000000 max 80995.000000 Name: Quantity, dtype: float64
Ok, most products are sold in quantities from 1 to 30. But, we have extreme, unrealistic outliers again:
fig, ax = plt.subplots(1,2,figsize=(20,5))
sns.distplot(data.Quantity, ax=ax[0], kde=False, color="limegreen");
sns.distplot(np.log(data.Quantity), ax=ax[1], bins=20, kde=False, color="limegreen");
ax[0].set_title("Quantity distribution")
ax[0].set_yscale("log")
ax[1].set_title("Log-Quantity distribution")
ax[1].set_xlabel("Natural-Log Quantity");
As you can see by the log-transformed distribution it would make sense to make a cut at:
np.exp(4)
54.598150033144236
np.quantile(data.Quantity,0.95)
30.0
np.quantile(data.Quantity, 0.999)
451.74599999981
Cutting at 452 (the 99.9% quantile) we would still cover 99.9% of the data!
data = data.loc[data.Quantity <=452].copy()
plt.figure(figsize=(20,5))
plt.plot(data.groupby("InvoiceDate").Quantity.sum(), marker='+', c="darkorange", label='Original')
plt.plot(data.groupby("InvoiceDate").Quantity.sum().rolling(window=30, center=True).mean(),c="red", label='Rolling Mean')
plt.xticks(rotation=90)
plt.legend(loc='best')
plt.title("How many quantities are sold per day over the given time?");
# Perform the Augmented Dickey-Fuller (ADF) test on the quantity series:
print('Results of the ADF Test:')
dftest = adfuller(data.Quantity, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
dfoutput
Results of the ADF Test:
Test Statistic -58.114860 p-value 0.000000 #Lags Used 87.000000 Number of Observations Used 529092.000000 Critical Value (1%) -3.430362 Critical Value (5%) -2.861545 Critical Value (10%) -2.566773 dtype: float64
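A short reading of this output: the test statistic lies far below the 1% critical value and the p-value is effectively zero, so the null hypothesis of a unit root is rejected and the quantity series can be treated as stationary. A one-line decision rule as a sketch:
# Simple decision rule based on the ADF p-value computed above.
print("stationary" if dfoutput["p-value"] < 0.05 else "non-stationary")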
data["Revenue"] = data.Quantity * data.UnitPrice
data["Quarter"] = data.InvoiceDate.dt.quarter
data["Year"] = data.InvoiceDate.dt.year
data["Month"] = data.InvoiceDate.dt.month
data["Weekday"] = data.InvoiceDate.dt.weekday
data.Revenue.describe()
count 529180.000000 mean 18.196149 std 47.679269 min 0.060000 25% 3.750000 50% 9.900000 75% 17.700000 max 3285.000000 Name: Revenue, dtype: float64
np.quantile(data.Revenue, 0.99)
179.0
99% of the line-item revenues (Quantity × UnitPrice per stockcode per invoice) are below about 179.
grouped_features = ["Year", "Quarter"]
rdata = pd.DataFrame(data.groupby(grouped_features).Revenue.sum(),
columns=["Revenue"])
rdata["Quantity"] = data.groupby(grouped_features).Quantity.sum()
rdata.head(5)
Revenue | Quantity | ||
---|---|---|---|
Year | Quarter | ||
2010 | 4 | 748299.07 | 327134 |
2011 | 1 | 1734218.55 | 887696 |
2 | 1895424.63 | 993970 | |
3 | 2354141.32 | 1288495 | |
4 | 2896954.50 | 1496996 |
We see Revenue and Quantity sold increasing quarter over quarter from Q4 2010 through Q4 2011. Comparing those two quarters, Quantity grew roughly 4.6x and Revenue roughly 3.9x, implying a larger increase in low-cost items.
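A one-line sketch verifying these growth factors by dividing the Q4-2011 row of rdata by the Q4-2010 row:
# Growth factors of Revenue and Quantity between Q4 2010 and Q4 2011.
rdata.loc[(2011, 4)] / rdata.loc[(2010, 4)]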
fig, ax = plt.subplots(1,2,figsize=(20,5))
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
yearmonth = ["Dec-2010", "Jan-2011", "Feb-2011", "Mar-2011", "Apr-2011", "May-2011",
"Jun-2011", "Jul-1011", "Aug-2011", "Sep-2011", "Oct-2011", "Nov-2011", "Dec-2011"]
ax[0].plot(data.groupby("Weekday").Revenue.sum(),marker='o', label="Revenue", c="darkorange")
ax[0].legend();
ax[0].set_xticks(np.arange(0,7))
ax[0].set_xticklabels(weekdays);
ax[0].set_title("Revenue per weekday");
ax[1].plot(data.groupby(["Year", "Month"]).Revenue.sum().values, marker='o', label="Monthly Revenue", c="darkorange")
ax[1].set_xticks(np.arange(0, len(yearmonth)))
ax[1].set_xticklabels(yearmonth, rotation=90)
ax[1].legend();
ax[1].set_title("Revenue per month");
How many unique countries are delivered by the retailer?
data.Country.nunique()
38
And which ones are most common?
country_counts = data.Country.value_counts().sort_values(ascending=False).iloc[0:20]
plt.figure(figsize=(20,5))
sns.barplot(x=country_counts.index, y=country_counts.values, palette="Greens_r", alpha=0.8)
plt.ylabel("Counts")
plt.title("Which countries made the most transactions?");
plt.xticks(rotation=90);
plt.yscale("log")
We can see that the retailer sells the vast majority of products in the UK, followed by many European countries. What percentage of entries comes from inside the UK?
data.loc[data.Country=="United Kingdom"].shape[0] / data.shape[0] * 100
91.5197097395971
Let's create a feature to indicate inside or outside of the UK:
data["UK"] = np.where(data.Country == "United Kingdom", 1, 0)
And which countries have the highest average revenue per transaction line?
country_counts = data.groupby('Country')['Revenue'].mean().sort_values(ascending=False).iloc[0:20]
plt.figure(figsize=(20,5))
sns.barplot(x=country_counts.index, y=country_counts.values, palette="GnBu_d")
plt.ylabel("Counts")
plt.title("According to each Country, What is the Average Invoice Value");
plt.xticks(rotation=90);
The story is different for average revenue per transaction line: the Netherlands, followed by Australia and Japan, has the highest average.
country_counts = data[data.Revenue>=179].groupby('Country')['Revenue'].mean().sort_values(ascending=False).iloc[0:20]
plt.figure(figsize=(20,5))
sns.barplot(x=country_counts.index, y=country_counts.values, palette="GnBu_d")
plt.ylabel("Counts")
plt.title("Countries with top 1% Revenue Sales");
plt.xticks(rotation=90);
How are the order quantities (per stockcode per invoice) distributed?
data.Quantity.describe()
count 529180.000000 mean 9.437792 std 21.860481 min 1.000000 25% 1.000000 50% 3.000000 75% 10.000000 max 450.000000 Name: Quantity, dtype: float64
The average line quantity is about 9 units; 25% of lines contain exactly 1 unit and 75% contain 10 units or fewer.
How are the unit prices distributed?
data.UnitPrice.describe()
count 529180.000000 mean 3.412001 std 5.694932 min 0.040000 25% 1.250000 50% 2.080000 75% 4.130000 max 199.890000 Name: UnitPrice, dtype: float64
The average unit price is about 3.4; 25% of lines are priced below 1.25 and 75% below about 4.1.
How are the transactions distributed across countries?
country_counts = data.Country.value_counts().sort_values(ascending=False).iloc[0:10]/data.shape[0]*100
country_counts
Country United Kingdom 91.519710 Germany 1.706603 France 1.587740 EIRE 1.489285 Spain 0.469405 Netherlands 0.443327 Belgium 0.383801 Switzerland 0.371518 Portugal 0.282513 Australia 0.221097 Name: count, dtype: float64
Approximately 92% of transactions come from the UK, and the next ~5% come from Germany, France and EIRE.
How do countries rank by average revenue per transaction line?
country_counts_avg_rev = data.groupby('Country').Revenue.mean().sort_values(ascending=False).iloc[0:7]
country_counts_avg_rev
Country Netherlands 115.033836 Australia 112.955444 Japan 83.488951 Sweden 75.197258 Denmark 49.882474 Lithuania 47.458857 Singapore 42.976806 Name: Revenue, dtype: float64
If asked, from the given data, to introduce a new product to the market and describe its segment: I would choose the home utility and accessories category, a product similar to a bag or a fancy bulb holder, sold in bundles of fewer than 8 units at a unit price below 5, and sold in the UK. This assumes selling to the masses and maximising overall revenue rather than per-unit margin.
The segment to pick for premium goods would differ vastly: there we would probably recommend selling in the Netherlands and Australia.
The K-means clustering algorithm partitions data into K clusters (hence the name). K-means is a prototype-based clustering method. Prototype-based clustering algorithms build each cluster around one of the following:
Centroid-based clusters: each cluster is built around its centroid, the mean of the points with continuous features assigned to it; K-means produces centroid-based clusters.
Medoid-based clusters: each cluster is built around its medoid, the point that minimises the distance to all other points in the cluster, which is used in the case of categorical features.
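For intuition only, here is a minimal NumPy sketch of the centroid-based idea (Lloyd's algorithm). It is not the implementation used in this notebook; below we rely on scikit-learn's KMeans, which adds smarter initialisation and convergence checks.
def simple_kmeans(X, k, n_iter=50, seed=0):
    # Minimal Lloyd's algorithm: pick k random points as initial centroids, then
    # alternate between assigning each point to its nearest centroid and moving
    # each centroid to the mean of the points assigned to it.
    rng = np.random.default_rng(seed)
    centroids = X[rng.choice(len(X), size=k, replace=False)].astype(float)
    for _ in range(n_iter):
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        for j in range(k):
            if np.any(labels == j):
                centroids[j] = X[labels == j].mean(axis=0)
    return labels, centroids
Usage would look like labels, centers = simple_kmeans(features, k=3) for any 2-D NumPy feature array.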
from sklearn.preprocessing import LabelEncoder
# Encode the categorical Description and Country columns as integers so that they
# can be used as K-means features (note: label encoding imposes an arbitrary ordering).
label = LabelEncoder()
d = pd.DataFrame()
data['Description'] = label.fit_transform(data['Description'])
data['Country'] = label.fit_transform(data['Country'])
data_scaled = data.copy().drop(['lowercase_descriptions','InvoiceDate','LowCharsInDescription','UK','StockCode',
'InvoiceNo'],axis = 1)
data_scaled.loc[data_scaled.CustomerID.isnull(),'CustomerID']=0.0
data_scaled.describe()
Description | Quantity | UnitPrice | CustomerID | Country | Revenue | Quarter | Year | Month | Weekday | |
---|---|---|---|---|---|---|---|---|---|---|
count | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 | 529180.000000 |
mean | 2078.176917 | 9.437792 | 3.412001 | 11482.025226 | 34.277463 | 18.196149 | 2.837163 | 2010.921760 | 7.561055 | 2.437388 |
std | 1125.764552 | 21.860481 | 5.694932 | 6780.238419 | 6.192548 | 47.679269 | 1.136980 | 0.268549 | 3.508484 | 1.851140 |
min | 0.000000 | 1.000000 | 0.040000 | 0.000000 | 0.000000 | 0.060000 | 1.000000 | 2010.000000 | 1.000000 | 0.000000 |
25% | 1166.000000 | 1.000000 | 1.250000 | 12354.000000 | 36.000000 | 3.750000 | 2.000000 | 2011.000000 | 5.000000 | 1.000000 |
50% | 2037.000000 | 3.000000 | 2.080000 | 14388.000000 | 36.000000 | 9.900000 | 3.000000 | 2011.000000 | 8.000000 | 2.000000 |
75% | 3058.000000 | 10.000000 | 4.130000 | 16265.000000 | 36.000000 | 17.700000 | 4.000000 | 2011.000000 | 11.000000 | 4.000000 |
max | 4019.000000 | 450.000000 | 199.890000 | 18287.000000 | 37.000000 | 3285.000000 | 4.000000 | 2011.000000 | 12.000000 | 6.000000 |
data_scaled.info()
<class 'pandas.core.frame.DataFrame'> Index: 529180 entries, 0 to 541908 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Description 529180 non-null int64 1 Quantity 529180 non-null int64 2 UnitPrice 529180 non-null float64 3 CustomerID 529180 non-null float64 4 Country 529180 non-null int64 5 Revenue 529180 non-null float64 6 Quarter 529180 non-null int32 7 Year 529180 non-null int32 8 Month 529180 non-null int32 9 Weekday 529180 non-null int32 dtypes: float64(3), int32(4), int64(3) memory usage: 36.3 MB
scaler = StandardScaler()
dfs=pd.DataFrame(scaler.fit_transform(data_scaled))
dfs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 529180 entries, 0 to 529179 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 0 529180 non-null float64 1 1 529180 non-null float64 2 2 529180 non-null float64 3 3 529180 non-null float64 4 4 529180 non-null float64 5 5 529180 non-null float64 6 6 529180 non-null float64 7 7 529180 non-null float64 8 8 529180 non-null float64 9 9 529180 non-null float64 dtypes: float64(10) memory usage: 40.4 MB
# Quick illustration: run K-means on the classic iris dataset first,
# before applying it to the scaled retail data.
iris = datasets.load_iris()
X = iris.data
y = iris.target
#
# Do the scatter plot and see that clusters are evident
#
plt.scatter(X[:,1], X[:,3],color='white', marker='o', edgecolor='red', s=50)
plt.grid()
plt.tight_layout()
plt.show()
#
# Create an instance of K-Means
#
kmc = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300,tol=1e-04, random_state=0)
#
# Fit and make predictions
#
y_kmc = kmc.fit_predict(X)
#
# Create the K-means cluster plot
#
plt.scatter(X[y_kmc == 0, 1], X[y_kmc == 0, 3], s=50,
c='lightgreen', marker='s', edgecolor='black', label='Cluster 1')
plt.scatter(X[y_kmc == 1, 1], X[y_kmc == 1, 3],
s=50, c='orange', marker='o', edgecolor='black', label='Cluster 2')
plt.scatter(X[y_kmc == 2, 1], X[y_kmc == 2, 3], s=50,
c='blue', marker='P', edgecolor='black', label='Cluster 3')
plt.scatter(kmc.cluster_centers_[:, 1], kmc.cluster_centers_[:, 3],
s=250, marker='*', c='red', edgecolor='black', label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.show()
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
kmeans.fit(X)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
kmc = KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300,tol=1e-04)
y_kmc = kmc.fit_predict(X)
plt.scatter(X[y_kmc == 0, 1], X[y_kmc == 0, 3], c='lightgreen', marker='s', edgecolor='black', label='Cluster 1')
plt.scatter(X[y_kmc == 1, 1], X[y_kmc == 1, 3], c='orange', marker='o', edgecolor='black', label='Cluster 2')
plt.scatter(X[y_kmc == 2, 1], X[y_kmc == 2, 3], c='blue', marker='2', edgecolor='black', label='Cluster 3')
plt.scatter(X[y_kmc == 3, 1], X[y_kmc == 3, 3], c='purple', marker='P', edgecolor='black', label='Cluster 4')
plt.scatter(X[y_kmc == 4, 1], X[y_kmc == 4, 3], c='cyan', marker='+', edgecolor='black', label='Cluster 5')
plt.scatter(kmc.cluster_centers_[:, 1], kmc.cluster_centers_[:, 3],
s=250, marker='*', c='red', edgecolor='black', label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.show()
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', n_init=5)
kmeans.fit(dfs)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
n_clusters: the number of clusters must be defined upfront (we use n_clusters=7 below).
init (default 'k-means++'): the initialisation method. 'k-means++' selects the initial cluster centers (centroids) in a smart manner, placing them far apart to speed up convergence. The alternative value 'random' selects n_clusters observations at random from the data as the initial centroids.
n_init (default 10): the number of times the K-means algorithm is run independently with different random centroids; the final model is the run with the lowest SSE.
max_iter (default 300): the maximum number of iterations per run. A run stops once this number is reached even if the convergence criterion is not yet satisfied. In the paper "Scalable K-Means by Ranked Retrieval" the authors state that K-means converges after 20-50 iterations in all practical situations, even on high-dimensional datasets.
tol (default 1e-04): the convergence tolerance; a run stops early once the improvement between iterations falls below this value, indicating that the algorithm has converged.
A silhouette-based cross-check of the cluster count is sketched right after this list.
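As a cross-check of the cluster count (an addition of ours, not part of the original pipeline), the silhouette score can be compared across candidate values of k. It is computed on a random subsample because the full pairwise-distance calculation is too expensive for roughly 529k rows.
from sklearn.metrics import silhouette_score
# Compare a few candidate cluster counts on the scaled feature frame dfs,
# scoring each on a 10,000-row subsample for speed.
for k in range(3, 9):
    km = KMeans(n_clusters=k, init='k-means++', n_init=5, random_state=42)
    labels = km.fit_predict(dfs)
    print(k, silhouette_score(dfs, labels, sample_size=10000, random_state=42))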
#
# Create an instance of K-Means
#
kmc = KMeans(n_clusters=7, init='k-means++', n_init=10, max_iter=300,tol=1e-04)
#
# Fit and make predictions
#
y_kmc = kmc.fit_predict(dfs)
data_scaled['cluster']=y_kmc
dfs['cluster']=y_kmc
dfs
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.563226 | -0.157261 | -0.151363 | 0.939197 | 0.278163 | -0.060742 | 1.022742 | -3.432375 | 1.265204 | -0.236281 | 6 |
1 | 1.570333 | -0.157261 | -0.003863 | 0.939197 | 0.278163 | 0.044964 | 1.022742 | -3.432375 | 1.265204 | -0.236281 | 6 |
2 | -1.058994 | -0.065771 | -0.116244 | 0.939197 | 0.278163 | 0.079780 | 1.022742 | -3.432375 | 1.265204 | -0.236281 | 6 |
3 | -0.196468 | -0.157261 | -0.003863 | 0.939197 | 0.278163 | 0.044964 | 1.022742 | -3.432375 | 1.265204 | -0.236281 | 6 |
4 | 0.680270 | -0.157261 | -0.003863 | 0.939197 | 0.278163 | 0.044964 | 1.022742 | -3.432375 | 1.265204 | -0.236281 | 6 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
529175 | 0.213920 | 0.117207 | -0.449874 | 0.176686 | -3.435982 | -0.167707 | 1.022742 | 0.291343 | 1.265204 | 0.844136 | 5 |
529176 | -1.210003 | -0.157261 | -0.230381 | 0.176686 | -3.435982 | -0.117371 | 1.022742 | 0.291343 | 1.265204 | 0.844136 | 5 |
529177 | -1.204673 | -0.248750 | 0.129589 | 0.176686 | -3.435982 | -0.033477 | 1.022742 | 0.291343 | 1.265204 | 0.844136 | 5 |
529178 | -1.205561 | -0.248750 | 0.129589 | 0.176686 | -3.435982 | -0.033477 | 1.022742 | 0.291343 | 1.265204 | 0.844136 | 5 |
529179 | -1.597295 | -0.294495 | 0.270065 | 0.176686 | -3.435982 | -0.070180 | 1.022742 | 0.291343 | 1.265204 | 0.844136 | 5 |
529180 rows × 11 columns
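To make the seven segments easier to describe to the Marketing and Finance teams, a quick profile on the original (unscaled) feature scale can be computed. This is a sketch using the data_scaled frame, which already carries the cluster labels:
# Average feature values per cluster on the unscaled frame, for interpretation.
data_scaled.groupby('cluster')[['Quantity', 'UnitPrice', 'Revenue', 'Quarter', 'Month', 'Weekday']].mean()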
# Create the K-means cluster plot
#
plt.figure(figsize=(10,8))
sns.scatterplot(x = 0,y = 6, data = dfs, hue = 'cluster',palette='deep')
plt.title('Product (Description) vs Quarter')
plt.xlabel('Product')
plt.ylabel('Quarter')
plt.show()
plt.figure(figsize=(20,8))
sns.heatmap(dfs.corr(),annot=True)
plt.title('Heatmap of Correlations')
plt.show()
plt.figure(figsize=(15,10))
sns.scatterplot(y = 5,x='cluster', data = dfs, hue = 'cluster',palette='deep')
plt.title('Revenue vs Cluster')
plt.show()
plt.figure(figsize=(10,5))
sns.scatterplot(y=5,x=4, data = dfs, hue = 'cluster',palette='deep')
plt.title('Revenue vs Country')
plt.ylabel('Revenue')
plt.show()
plt.figure(figsize=(10,5))
sns.scatterplot(y=5,x=6, data = dfs, hue = 'cluster',palette='deep')
plt.title('Revenue vs Quarter')
plt.ylabel('Revenue')
plt.show()
products = pd.DataFrame(index=data.StockCode.unique(), columns = ["MedianPrice"])
products["MedianPrice"] = data.groupby("StockCode").UnitPrice.median()
products["MedianQuantities"] = data.groupby("StockCode").Quantity.median()
products["Customers"] = data.groupby("StockCode").CustomerID.nunique()
org_cols = np.copy(products.columns.values)
products.head()
MedianPrice | MedianQuantities | Customers | |
---|---|---|---|
85123A | 2.95 | 6.0 | 855 |
71053 | 3.75 | 4.0 | 149 |
84406B | 4.15 | 4.0 | 133 |
84029G | 4.25 | 4.0 | 211 |
84029E | 4.25 | 4.0 | 228 |
dsp=products.copy()
dsp2=pd.DataFrame(scaler.fit_transform(dsp))
dsp2[[2]]=scaler.fit_transform(dsp2[[2]])
dsp.info()
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
kmeans.fit(dsp)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
<class 'pandas.core.frame.DataFrame'> Index: 3917 entries, 85123A to 47591b Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MedianPrice 3917 non-null float64 1 MedianQuantities 3917 non-null float64 2 Customers 3917 non-null int64 dtypes: float64(2), int64(1) memory usage: 122.4+ KB
kmp = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300,tol=1e-04)
y_kmp = kmp.fit_predict(dsp2)
dsp2['cluster']=y_kmp
plt.scatter(np.array(dsp2.loc[dsp2.cluster == 0, 0]), np.array(dsp2.loc[dsp2.cluster == 0, 2]),
c='green', marker='+', edgecolor='black', label='Cluster 1')
plt.scatter(np.array(dsp2.loc[dsp2.cluster == 1, 0]), np.array(dsp2.loc[dsp2.cluster == 1, 2]),
c='orange', marker='o', edgecolor='black', label='Cluster 2')
plt.scatter(np.array(dsp2.loc[dsp2.cluster == 2, 0]), np.array(dsp2.loc[dsp2.cluster == 2, 2]),
c='blue', marker='s', edgecolor='black', label='Cluster 3')
plt.scatter(kmp.cluster_centers_[:, 0], kmp.cluster_centers_[:, 2],
s=250, marker='*', c='red', edgecolor='black', label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.xlabel('MedianPrice')
plt.ylabel('Unique Customers')
plt.title('Product Clusters')
plt.show()
dsp.describe()
MedianPrice | MedianQuantities | Customers | |
---|---|---|---|
count | 3917.000000 | 3917.000000 | 3917.000000 |
mean | 3.669709 | 4.570079 | 68.062293 |
std | 7.286837 | 7.103047 | 90.374401 |
min | 0.040000 | 1.000000 | 0.000000 |
25% | 1.250000 | 1.000000 | 7.000000 |
50% | 2.100000 | 2.000000 | 35.000000 |
75% | 4.130000 | 5.000000 | 94.000000 |
max | 165.000000 | 144.000000 | 881.000000 |
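To put names on the three product clusters, a sketch: attach the cluster assignments back to the unscaled product table and summarise each group (dsp is not used again afterwards, so this does not affect the remaining cells).
# Median price, median quantity and customer reach per product cluster.
dsp['cluster'] = y_kmp
dsp.groupby('cluster')[['MedianPrice', 'MedianQuantities', 'Customers']].median()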
dsn=data_scaled.copy().drop(['Quarter','Year','UnitPrice','Country','CustomerID','Weekday','cluster','Month'],axis=1)
ds2=pd.DataFrame(scaler.fit_transform(dsn))
dsn.info()
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
kmeans.fit(ds2)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
kmc2n = KMeans(n_clusters=4, init='k-means++', n_init=15, max_iter=300,tol=1e-04)
y_kmc2n = kmc2n.fit_predict(ds2)
ds2['cluster']=y_kmc2n
<class 'pandas.core.frame.DataFrame'> Index: 529180 entries, 0 to 541908 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Description 529180 non-null int64 1 Quantity 529180 non-null int64 2 Revenue 529180 non-null float64 dtypes: float64(1), int64(2) memory usage: 16.1 MB
plt.scatter(np.array(ds2.loc[ds2.cluster == 0, 2]), np.array(ds2.loc[ds2.cluster == 0, 1]),
c='blue', marker='s', edgecolor='black', label='Cluster 1')
plt.scatter(np.array(ds2.loc[ds2.cluster == 1, 2]), np.array(ds2.loc[ds2.cluster == 1, 1]),
c='orange', marker='o', edgecolor='black', label='Cluster 2')
plt.scatter(np.array(ds2.loc[ds2.cluster == 2, 2]), np.array(ds2.loc[ds2.cluster == 2, 1]),
c='lightgreen', marker='2', edgecolor='black', label='Cluster 3')
plt.scatter(np.array(ds2.loc[ds2.cluster == 3, 2]), np.array(ds2.loc[ds2.cluster == 3, 1]),
c='purple', marker='+', edgecolor='black', label='Cluster 4')
plt.scatter(kmc2n.cluster_centers_[:, 2], kmc2n.cluster_centers_[:, 1],
s=250, marker='*', c='red', edgecolor='black', label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.show()
dsn3=data_scaled.copy().drop(['Quarter','Year','UnitPrice','Country','Month','Weekday','CustomerID','Description','cluster'],axis=1)
ds3=pd.DataFrame(scaler.fit_transform(dsn3))
dsn3.info()
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
kmeans.fit(ds3)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
<class 'pandas.core.frame.DataFrame'> Index: 529180 entries, 0 to 541908 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Quantity 529180 non-null int64 1 Revenue 529180 non-null float64 dtypes: float64(1), int64(1) memory usage: 12.1 MB
km3 = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300,tol=1e-04)
y_km3 = km3.fit_predict(ds3)
ds3['cluster']=y_km3
plt.scatter(np.array(ds3.loc[ds3.cluster == 0, 1]), np.array(ds3.loc[ds3.cluster == 0, 0]),
c='lightgreen', marker='s', edgecolor='black', label='Cluster 1')
plt.scatter(np.array(ds3.loc[ds3.cluster == 1, 1]), np.array(ds3.loc[ds3.cluster == 1, 0]),
c='orange', marker='o', edgecolor='black', label='Cluster 2')
plt.scatter(np.array(ds3.loc[ds3.cluster == 2, 1]), np.array(ds3.loc[ds3.cluster == 2, 0]),
c='blue', marker='2', edgecolor='black', label='Cluster 3')
plt.scatter(km3.cluster_centers_[:, 1], km3.cluster_centers_[:, 0],
s=250, marker='*', c='red', edgecolor='black', label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.show()
Hence, to a large extent we addressed the following well-known challenges of applying K-means here -
> Clustering data of varying sizes and density.
> Clustering outliers.
> Scaling with number of dimensions.
the analysis is uploaded and available on this link - https://finance.txtviews.com/index.html
Author: Mohammad Asad