# import the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline


# load the dataset
insurance = pd.read_csv('insurance.csv', )
insurance.head()

def clean_dataset(insurance):
    """
    Cleans the insurance dataset by performing several preprocessing tasks:
    - Corrects the 'sex' column values to a standard format ('male', 'female').
    - Removes the dollar sign from the 'charges' column and converts it to float.
    - Drops negative 'age' values.
    - Converts negative 'children' values to zero.
    - Converts 'region' values to lowercase.
    - Drops rows with any missing values.
    
    Parameters:
    - insurance: pandas DataFrame, the insurance dataset.
    
    Returns:
    - DataFrame after cleaning.
    """
    insurance['sex'] = insurance['sex'].replace({'M': 'male', 'man': 'male', 'F': 'female', 'woman': 'female'})
    insurance['charges'] = insurance['charges'].replace({'\$': ''}, regex=True).astype(float)
    insurance = insurance[insurance["age"] > 0]
    insurance.loc[insurance["children"] < 0, "children"] = 0
    insurance.loc[:, "region"] = insurance["region"].str.lower()

    return insurance.dropna()

def create_and_evaluate_regression_model(insurance):
    """
    Prepares the data, fits a linear regression model, and evaluates it using cross-validation.
    
    Parameters:
    - insurance: pandas DataFrame, the cleaned insurance dataset.
    
    Returns:
    - A tuple containing the fitted sklearn Pipeline object, mean MSE, and mean R2 scores.
    """
    # Preprocessing
    X = insurance.drop('charges', axis=1)
    y = insurance['charges']
    categorical_features = ['sex', 'smoker', 'region']
    numerical_features = ['age', 'bmi', 'children']
    
    # Convert categorical variables to dummy variables
    X_categorical = pd.get_dummies(X[categorical_features], drop_first=True)
    
    # Combine numerical features with dummy variables
    X_processed = pd.concat([X[numerical_features], X_categorical], axis=1)
    # Scaling numerical features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_processed), columns=X_processed.columns)
    # Linear regression model
    lin_reg = LinearRegression()
    
    # Pipeline
    steps = [("scaler", scaler), ("lin_reg", lin_reg)]
    insurance_model_pipeline = Pipeline(steps)
    
    # Fitting the model
    insurance_model_pipeline.fit(X_scaled, y)
    
    # Evaluating the model
    mse_scores = -cross_val_score(insurance_model_pipeline, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(insurance_model_pipeline, X_scaled, y, cv=5, scoring='r2')
    mean_mse = np.mean(mse_scores)
    mean_r2 = np.mean(r2_scores)
    
    return insurance_model_pipeline, mean_mse, mean_r2

# Usage example
cleaned_insurance = clean_dataset(insurance)
insurance_model, mean_mse, r2_score = create_and_evaluate_regression_model(cleaned_insurance)
print("Mean MSE:", mean_mse)
print("Mean R2:", r2_score)

Mean MSE: 37431001.52191916
Mean R2: 0.7450511466263761

# Predict on validation data
validation_data = pd.read_csv('validation_dataset.csv')

# Ensure categorical variables are properly transformed
validation_data_processed = pd.get_dummies(validation_data, columns=['sex', 'smoker', 'region'], drop_first=True)

# Make predictions using the trained model
validation_predictions = insurance_model.predict(validation_data_processed)

# Add predicted charges to the validation data
validation_data['predicted_charges'] = validation_predictions

# Adjust predictions to ensure minimum charge is $1000
validation_data.loc[validation_data['predicted_charges'] < 1000, 'predicted_charges'] = 1000

# Display the updated dataframe
validation_data.head()

Column	Description
`age`	Age of the person (main customer).
`sex`	Gender of the person (`male` or `female`).
`bmi`	Body Mass Index — a number that shows if someone is underweight, normal, or overweight.
`children`	How many dependents (kids) are covered by the insurance.
`smoker`	Whether the person smokes (`yes` or `no`).
`region`	The region where the person lives (Northeast, Northwest, Southeast, Southwest).
`charges`	Medical costs billed by the insurance company (our target column).

	age	sex	bmi	children	smoker	region	charges
0	19.0	female	27.900	0.0	yes	southwest	16884.924
1	18.0	male	33.770	1.0	no	Southeast	1725.5523
2	28.0	male	33.000	3.0	no	southeast	$4449.462
3	33.0	male	22.705	0.0	no	northwest	$21984.47061
4	32.0	male	28.880	0.0	no	northwest	$3866.8552

	age	sex	bmi	children	smoker	region	predicted_charges
0	18.0	female	24.090000	1.0	no	southeast	128624.195643
1	39.0	male	26.410000	0.0	yes	northeast	220740.537449
2	27.0	male	29.150000	0.0	yes	southeast	181357.588606
3	71.0	male	65.502135	13.0	yes	southeast	423490.687270
4	28.0	male	38.060000	0.0	no	southeast	193247.431989

Project Description¶

Goal¶

Dataset Summary¶

insurance.csv¶

Final Step¶