Project Description¶

In this project, we will explore customer churn in India's telecom sector. Using Python libraries like pandas and machine learning techniques, we will analyze datasets from leading telecom companies. Our goal is to find patterns in customer demographics and usage, and predict which customers are likely to stay or leave.

The telecom industry in India is growing fast, with many companies competing for customers. Churn occurs when customers stop using a company’s services and switch to another provider. Understanding why customers leave is critical for telecom companies: it helps them improve their services and retain customers longer.

As data scientists, we will study customer behavior and demographics to predict churn. We will work with two detailed datasets from four major telecom companies: Airtel, Reliance Jio, Vodafone, and BSNL.

The first dataset, telecom_demographics.csv, contains information about customers' demographics:

| Variable | Description |
| --- | --- |
| customer_id | Unique ID for each customer |
| telecom_partner | The telecom company the customer uses |
| gender | Customer’s gender |
| age | Customer’s age |
| state | Indian state where the customer lives |
| city | City where the customer lives |
| pincode | Postal code of the customer's area |
| registration_event | Date when the customer registered with the telecom company |
| num_dependents | Number of dependents (e.g., children) the customer has |
| estimated_salary | Estimated salary of the customer |

The second dataset, telecom_usage.csv, includes customers’ usage details:

| Variable | Description |
| --- | --- |
| customer_id | Unique ID for each customer |
| calls_made | Number of calls the customer has made |
| sms_sent | Number of SMS messages sent by the customer |
| data_used | Amount of data used by the customer |
| churn | Whether the customer has churned (1 = churned, 0 = not churned) |

By combining and analyzing these datasets, we will uncover useful insights about what influences customers to stay or leave. Then, using machine learning, we will build models that can predict customer churn. This will help telecom companies make better decisions to improve customer retention and overall business performance.

Calculate the proportion of customers who have churned.¶

In [2]:
# Import required libraries and methods/functions
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load data
telco_demog = pd.read_csv('telecom_demographics.csv')
telco_usage = pd.read_csv('telecom_usage.csv')

telco_demog.head()
Out[2]:
customer_id telecom_partner gender age state city pincode registration_event num_dependents estimated_salary
0 15169 Airtel F 26 Himachal Pradesh Delhi 667173 2020-03-16 4 85979
1 149207 Airtel F 74 Uttarakhand Hyderabad 313997 2022-01-16 0 69445
2 148119 Airtel F 54 Jharkhand Chennai 549925 2022-01-11 2 75949
3 187288 Reliance Jio M 29 Bihar Hyderabad 230636 2022-07-26 3 34272
4 14016 Vodafone M 45 Nagaland Bangalore 188036 2020-03-11 4 34157
In [3]:
telco_usage.head()
Out[3]:
customer_id calls_made sms_sent data_used churn
0 15169 75 21 4532 1
1 149207 35 38 723 1
2 148119 70 47 4688 1
3 187288 95 32 10241 1
4 14016 66 23 5246 1
In [4]:
# Join data
churn_df = telco_demog.merge(telco_usage, on='customer_id')
churn_df.head()
Out[4]:
customer_id telecom_partner gender age state city pincode registration_event num_dependents estimated_salary calls_made sms_sent data_used churn
0 15169 Airtel F 26 Himachal Pradesh Delhi 667173 2020-03-16 4 85979 75 21 4532 1
1 149207 Airtel F 74 Uttarakhand Hyderabad 313997 2022-01-16 0 69445 35 38 723 1
2 148119 Airtel F 54 Jharkhand Chennai 549925 2022-01-11 2 75949 70 47 4688 1
3 187288 Reliance Jio M 29 Bihar Hyderabad 230636 2022-07-26 3 34272 95 32 10241 1
4 14016 Vodafone M 45 Nagaland Bangalore 188036 2020-03-11 4 34157 66 23 5246 1
In [5]:
# Identify churn rate
churn_rate = churn_df['churn'].value_counts() / len(churn_df)
print(churn_rate)
churn
0    0.799538
1    0.200462
Name: count, dtype: float64
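Since churn is coded 0/1, the same proportion can be computed more directly. A minimal sketch on a toy Series (the values here are illustrative, not from the real data):

```python
import pandas as pd

# Toy churn column standing in for churn_df['churn']
churn = pd.Series([0, 0, 0, 0, 1])

# value_counts(normalize=True) divides by the length for us
rate = churn.value_counts(normalize=True)
print(rate)

# Because churn is 0/1, the mean is the churn proportion directly
print(churn.mean())  # 0.2
```

On the real data, `churn_df['churn'].mean()` would return the churn rate (about 0.20) in one step.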
In [6]:
# Identify categorical variables
print(churn_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         6500 non-null   int64 
 1   telecom_partner     6500 non-null   object
 2   gender              6500 non-null   object
 3   age                 6500 non-null   int64 
 4   state               6500 non-null   object
 5   city                6500 non-null   object
 6   pincode             6500 non-null   int64 
 7   registration_event  6500 non-null   object
 8   num_dependents      6500 non-null   int64 
 9   estimated_salary    6500 non-null   int64 
 10  calls_made          6500 non-null   int64 
 11  sms_sent            6500 non-null   int64 
 12  data_used           6500 non-null   int64 
 13  churn               6500 non-null   int64 
dtypes: int64(9), object(5)
memory usage: 711.1+ KB
None
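The object-dtype columns flagged above can also be collected programmatically with `select_dtypes`. A small sketch on a stand-in frame with the same dtype mix as `churn_df`:

```python
import pandas as pd

# Stand-in frame mixing int64 and object columns, like churn_df
df = pd.DataFrame({
    'customer_id': [1, 2],
    'telecom_partner': ['Airtel', 'BSNL'],
    'age': [26, 74],
    'registration_event': ['2020-03-16', '2022-01-16'],
})

# object-dtype columns are the categorical candidates
cat_cols = df.select_dtypes(include='object').columns.tolist()
print(cat_cols)  # ['telecom_partner', 'registration_event']
```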
In [7]:
# One Hot Encoding for categorical variables
churn_df = pd.get_dummies(churn_df, columns=['telecom_partner', 'gender', 'state', 'city', 'registration_event'])
churn_df.head()
Out[7]:
customer_id age pincode num_dependents estimated_salary calls_made sms_sent data_used churn telecom_partner_Airtel ... registration_event_2023-04-24 registration_event_2023-04-25 registration_event_2023-04-26 registration_event_2023-04-27 registration_event_2023-04-28 registration_event_2023-04-29 registration_event_2023-04-30 registration_event_2023-05-01 registration_event_2023-05-02 registration_event_2023-05-03
0 15169 26 667173 4 85979 75 21 4532 1 True ... False False False False False False False False False False
1 149207 74 313997 0 69445 35 38 723 1 True ... False False False False False False False False False False
2 148119 54 549925 2 75949 70 47 4688 1 True ... False False False False False False False False False False
3 187288 29 230636 3 34272 95 32 10241 1 False ... False False False False False False False False False False
4 14016 45 188036 4 34157 66 23 5246 1 False ... False False False False False False False False False False

5 rows × 1265 columns
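Note that dummy-encoding registration_event produces one column per unique date, which is why the frame balloons to 1265 columns. A sketched alternative (the feature names reg_year and days_since_reg and the reference date are illustrative choices, not part of the original analysis) is to parse the date and derive a few numeric features instead:

```python
import pandas as pd

# Toy registration dates standing in for churn_df['registration_event']
df = pd.DataFrame({'registration_event': ['2020-03-16', '2022-07-26']})

# Parse once, then derive numeric features instead of one dummy per date
reg = pd.to_datetime(df['registration_event'])
df['reg_year'] = reg.dt.year
df['reg_month'] = reg.dt.month
# Days since an arbitrary fixed reference date, as a tenure-like feature
df['days_since_reg'] = (pd.Timestamp('2023-05-03') - reg).dt.days

print(df[['reg_year', 'reg_month', 'days_since_reg']])
```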

Perform feature scaling: separate the appropriate features and scale them.¶

In [8]:
# Feature Scaling
scaler = StandardScaler()

# 'customer_id' is not a feature
features = churn_df.drop(['customer_id', 'churn'], axis=1)
features_scaled = scaler.fit_transform(features)
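One caveat: `fit_transform` is applied here before the train/test split, so the scaler's mean and variance are estimated from rows that later end up in the test set. A minimal sketch on synthetic data showing the leakage-free order, fitting the scaler on the training split only:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Synthetic feature matrix standing in for the churn features
rng = np.random.default_rng(42)
X = rng.normal(loc=5.0, scale=2.0, size=(100, 3))

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Fit on the training split only, then apply the same parameters to test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training features now have ~zero mean and unit variance
print(X_train_scaled.mean(axis=0).round(6))
```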

Train Logistic Regression and Random Forest Classifier models.¶

In [9]:
# Target variable
target = churn_df['churn']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Instantiate the Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

# Logistic Regression predictions
logreg_pred = logreg.predict(X_test)

# Logistic Regression evaluation
print(confusion_matrix(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))
[[911 116]
 [243  30]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.84      1027
           1       0.21      0.11      0.14       273

    accuracy                           0.72      1300
   macro avg       0.50      0.50      0.49      1300
weighted avg       0.67      0.72      0.69      1300
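Recall for the churn class is only 0.11, which reflects the roughly 80/20 class imbalance seen earlier. One common remedy is `class_weight='balanced'`, sketched here on synthetic imbalanced data rather than the real churn features:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

# Synthetic 80/20 imbalanced problem standing in for the churn split
X, y = make_classification(n_samples=2000, weights=[0.8, 0.2],
                           random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# class_weight='balanced' reweights each class inversely to its frequency,
# so misclassifying a minority-class example costs more during training
logreg_bal = LogisticRegression(class_weight='balanced', random_state=42)
logreg_bal.fit(X_train, y_train)

bal_recall = recall_score(y_test, logreg_bal.predict(X_test))
print(bal_recall)
```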

In [10]:
# Instantiate the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Random Forest predictions
rf_pred = rf.predict(X_test)

# Random Forest evaluation
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
[[1026    1]
 [ 273    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.39      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300
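The Random Forest predicts churn for almost no one (a single positive prediction), again a symptom of the class imbalance. Besides reweighting, another option is to keep the model but lower the decision threshold via `predict_proba`; the 0.3 cutoff below is an illustrative choice, sketched on synthetic data:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Synthetic imbalanced data standing in for the churn features
X, y = make_classification(n_samples=2000, weights=[0.8, 0.2],
                           random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Instead of the default 0.5 cutoff, flag churners at a lower threshold
proba = rf.predict_proba(X_test)[:, 1]
pred_default = (proba >= 0.5).astype(int)
pred_lowered = (proba >= 0.3).astype(int)

# A lower threshold can only keep or grow the set of predicted churners
print(pred_default.sum(), pred_lowered.sum())
```

This trades precision for recall; the right cutoff would be tuned against a validation set, not fixed at 0.3.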

Note that we only performed the basics of ML; we did not perform EDA. We could use other techniques and models (such as RidgeClassifier) to improve the model's performance and accuracy, and we could also utilize the registration_event column to improve our modelling. Thank you!¶
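As a pointer for the RidgeClassifier suggestion, a minimal sketch on synthetic data (the real churn features would be substituted in; the alpha value is just the default):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Synthetic binary classification data standing in for the churn features
X, y = make_classification(n_samples=500, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# RidgeClassifier: least-squares classification with L2 regularization
ridge = RidgeClassifier(alpha=1.0)
ridge.fit(X_train, y_train)

acc = accuracy_score(y_test, ridge.predict(X_test))
print(acc)
```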