# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Read the session-level shopping data from CSV and peek at the first rows
shopping_data = pd.read_csv("online_shopping_session_data.csv")
shopping_data.head()

# Keep only the sessions whose Month is November or December
nov_dec_labels = ['Nov', 'Dec']
in_nov_dec = shopping_data['Month'].isin(nov_dec_labels)
shopping_Nov_Dec = shopping_data[in_nov_dec]

# Peek at the subset to confirm the filter behaved as expected
shopping_Nov_Dec.head()

# The only remaining Month values should be 'Nov' and 'Dec'
shopping_Nov_Dec['Month'].unique()

array(['Nov', 'Dec'], dtype=object)

# Count sessions for every (CustomerType, Purchase) combination
purchase_by_customer_type = shopping_Nov_Dec.groupby(['CustomerType'])['Purchase']
count_session = purchase_by_customer_type.value_counts()
count_session

CustomerType        Purchase
New_Customer        0.0          529
                    1.0          199
Returning_Customer  0.0         2994
                    1.0          728
Name: count, dtype: int64

# Sessions per customer type (purchase and non-purchase counts added together)
total_new_customer = count_session['New_Customer'].sum()
total_returning_customer = count_session['Returning_Customer'].sum()


# Sessions per customer type that ended in a purchase (Purchase == 1)
purchase_new_customer = count_session.loc[('New_Customer', 1)]
purchase_returning_customer = count_session.loc[('Returning_Customer', 1)]


# Purchase rate = purchasing sessions / all sessions, per customer type
purchase_rate_new = purchase_new_customer / total_new_customer
purchase_rate_returning = purchase_returning_customer / total_returning_customer


# Collect both rates in one dictionary and display it
purchase_rates = {
    "Returning_Customer": purchase_rate_returning,
    "New_Customer": purchase_rate_new,
}
purchase_rates

{'Returning_Customer': 0.1955937667920473, 'New_Customer': 0.2733516483516483}

# The question being answered concerns *returning* customers in November and
# December, so narrow the Nov/Dec sessions to that customer type before
# correlating (the earlier version correlated over all customer types).
returning_Nov_Dec = shopping_Nov_Dec[
    shopping_Nov_Dec['CustomerType'] == 'Returning_Customer'
]

# Pairwise correlation of time spent on each page type, via pandas Series.corr
# (Pearson is the pandas default method).
cor_admin_info = returning_Nov_Dec['Administrative_Duration'].corr(
    returning_Nov_Dec['Informational_Duration']
)
cor_admin_product = returning_Nov_Dec['Administrative_Duration'].corr(
    returning_Nov_Dec['ProductRelated_Duration']
)
cor_product_info = returning_Nov_Dec['ProductRelated_Duration'].corr(
    returning_Nov_Dec['Informational_Duration']
)

print(cor_admin_info)
print(cor_admin_product)
print(cor_product_info)

# Derive the strongest pair from the computed values instead of hard-coding it,
# so the answer stays correct if the data (or the filter) changes.
# "Strongest" is judged on absolute value so a strong negative correlation
# would also qualify.
pair_correlations = {
    ('Administrative_Duration', 'Informational_Duration'): cor_admin_info,
    ('Administrative_Duration', 'ProductRelated_Duration'): cor_admin_product,
    ('ProductRelated_Duration', 'Informational_Duration'): cor_product_info,
}
strongest_pair = max(pair_correlations, key=lambda pair: abs(pair_correlations[pair]))

# Store top correlation
top_correlation = {"pair": strongest_pair, "correlation": pair_correlations[strongest_pair]}
print(top_correlation)

# NOTE: the outputs previously recorded below these cells (0.2447, 0.3899,
# 0.3671, with Administrative/ProductRelated as the top pair) were computed
# over all Nov/Dec customers; re-run the cells to refresh them for returning
# customers only.

# Baseline for the campaign estimate: the November/December purchase rate of
# returning customers, computed above as purchasing sessions / total sessions.
print("Current purchase rate for the returning customer:", purchase_rate_returning)

Current purchase rate for the returning customer: 0.1955937667920473

# A 15% relative boost means multiplying the current rate by 1.15
campaign_uplift = 1.15
increased_purchase_rate_returning = purchase_rate_returning * campaign_uplift
print("Increased purchase rate for the returning customer:", increased_purchase_rate_returning)

Increased purchase rate for the returning customer: 0.22493283181085436

# Model the number of sales X in 500 returning-customer sessions as
# Binomial(n=500, p=increased_purchase_rate_returning).
n_sessions = 500
target_sales = 100

# Likelihood of having fewer than 100 sales: P(X < 100) = P(X <= 99).
# binom.cdf(k) is inclusive of k, so the cut-off must be target_sales - 1;
# passing k=100 would wrongly count the "exactly 100 sales" outcome as a miss.
prob_sales_100_less = stats.binom.cdf(k=target_sales - 1, n=n_sessions, p=increased_purchase_rate_returning)
print("probability of having <100 sales:", prob_sales_100_less)

# Probability of having 100 or more sales is the complement: P(X >= 100) = 1 - P(X <= 99)
prob_at_least_100_sales = 1 - prob_sales_100_less
print("probability of having at least 100 sales:", prob_at_least_100_sales)

# NOTE: the outputs previously recorded below these cells (0.0988 and 0.9012)
# came from cdf(k=100), i.e. P(X <= 100) and P(X >= 101); re-run the cells to
# refresh them.

# Plot the binomial probability distribution of the number of sales
n_sessions = 500
# The binomial support is 0..n inclusive, so start at k = 0 and derive the
# upper bound from n_sessions instead of repeating the literal 500.
k_values = np.arange(n_sessions + 1)
# binom.pmf accepts an array of k values, so one vectorised call replaces the
# per-k Python loop.
p_binom_values = stats.binom.pmf(k_values, n_sessions, increased_purchase_rate_returning)
plt.bar(k_values, p_binom_values)
# Dashed red marker at the 100-sales target
plt.vlines(100, 0, 0.08, color='r', linestyle='dashed', label="sales=100")
plt.xlabel("number of sales")
plt.ylabel("probability")
plt.legend()
plt.show()

Column	Description
`SessionID`	A unique ID for each shopping session
`Administrative`	Number of pages visited related to customer account settings or admin-related features
`Administrative_Duration`	Total time (in seconds) spent on administrative pages
`Informational`	Number of pages viewed that provide information about the website or company
`Informational_Duration`	Total time (in seconds) spent on informational pages
`ProductRelated`	Number of product-related pages visited
`ProductRelated_Duration`	Total time (in seconds) spent on product-related pages
`BounceRates`	The average bounce rate (when customers leave after viewing one page) during the session
`ExitRates`	The average rate at which users exit from the pages they visit
`PageValues`	A value score for the pages visited, based on previous user activity and conversions
`SpecialDay`	How close the session date is to a special shopping day (like Black Friday)
`Weekend`	Whether the session occurred on a weekend (True or False)
`Month`	The month in which the session took place
`CustomerType`	Indicates whether the customer is a returning visitor or a new visitor
`Purchase`	Indicates whether a purchase was made in the session (True or False)

	SessionID	Administrative	Administrative_Duration	Informational	Informational_Duration	ProductRelated	ProductRelated_Duration	BounceRates	ExitRates	PageValues	Weekend	Month	CustomerType	Purchase
5463	5464	1	39.2	2	120.8	7	80.500000	0.000000	0.010000	0.000000	True	Nov	New_Customer	0.0
5464	5465	3	89.6	0	0.0	57	1721.906667	0.000000	0.005932	204.007949	True	Nov	Returning_Customer	1.0
5467	5468	4	204.2	0	0.0	31	652.376667	0.012121	0.016162	0.000000	False	Nov	Returning_Customer	0.0
5479	5480	0	0.0	0	0.0	13	710.066667	0.000000	0.007692	72.522838	False	Nov	Returning_Customer	1.0
5494	5495	0	0.0	0	0.0	24	968.692424	0.000000	0.000000	106.252517	False	Nov	Returning_Customer	1.0

Project Description

Dataset Description

What are the purchase rates for online shopping sessions by customer type for November and December?

What is the strongest correlation in total time spent among page types by returning customers in November and December?

A new campaign for the returning customers will boost the purchase rate by 15%. What is the likelihood of achieving at least 100 sales out of 500 online shopping sessions for the returning customers?

	SessionID	ProductRelated	ProductRelated_Duration	BounceRates	ExitRates	Weekend	Month	CustomerType
0	1	1	0.000000	0.20	0.20	False	Feb	Returning_Customer
1	2	2	64.000000	0.00	0.10	False	Feb	Returning_Customer
2	3	1	0.000000	0.20	0.20	False	Feb	Returning_Customer
3	4	2	2.666667	0.05	0.14	False	Feb	Returning_Customer
4	5	10	627.500000	0.02	0.05	True	Feb	Returning_Customer