# Import necessary libraries
import pandas as pd

# Load the dataset and create a copy
ds_jobs = pd.read_csv("customer_train.csv")
ds_jobs_transformed = ds_jobs.copy()

ds_jobs.head()

# EDA to help identify ordinal, nominal, and two-factor categories
for col in ds_jobs.select_dtypes("object").columns:
    print(ds_jobs_transformed[col].value_counts(), '\n')

city
city_103    4355
city_21     2702
city_16     1533
city_114    1336
city_160     845
            ... 
city_129       3
city_111       3
city_121       3
city_140       1
city_171       1
Name: count, Length: 123, dtype: int64 

gender
Male      13221
Female     1238
Other       191
Name: count, dtype: int64 

relevant_experience
Has relevant experience    13792
No relevant experience      5366
Name: count, dtype: int64 

enrolled_university
no_enrollment       13817
Full time course     3757
Part time course     1198
Name: count, dtype: int64 

education_level
Graduate          11598
Masters            4361
High School        2017
Phd                 414
Primary School      308
Name: count, dtype: int64 

major_discipline
STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: count, dtype: int64 

experience
>20    3286
5      1430
4      1403
3      1354
6      1216
2      1127
7      1028
10      985
9       980
8       802
15      686
11      664
14      586
1       549
<1      522
16      508
12      494
13      399
17      342
19      304
18      280
20      148
Name: count, dtype: int64 

company_size
50-99        3083
100-499      2571
10000+       2019
10-49        1471
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
Name: count, dtype: int64 

company_type
Pvt Ltd                9817
Funded Startup         1001
Public Sector           955
Early Stage Startup     603
NGO                     521
Other                   121
Name: count, dtype: int64 

last_new_job
1        8040
>4       3290
2        2900
never    2452
4        1029
3        1024
Name: count, dtype: int64

# Create a dictionary of columns containing ordered categorical data
ordered_cats = {
    'enrolled_university': ['no_enrollment', 'Part time course', 'Full time course'],
    'education_level': ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd'],
    'experience': ['<1'] + list(map(str, range(1, 21))) + ['>20'],
    'company_size': ['<10', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999', '10000+'],
    'last_new_job': ['never', '1', '2', '3', '4', '>4']
}

# Create a mapping dictionary of columns containing two-factor categories to convert to Booleans
two_factor_cats = {
    'relevant_experience': {'No relevant experience': False, 'Has relevant experience': True},
    'job_change': {0.0: False, 1.0: True}
}

# Loop through DataFrame columns to efficiently change data types
for col in ds_jobs_transformed:
    
    # Convert two-factor categories to bool
    if col in ['relevant_experience', 'job_change']:
        ds_jobs_transformed[col] = ds_jobs_transformed[col].map(two_factor_cats[col])
    
    # Convert integer columns to int32
    elif col in ['student_id', 'training_hours']:
        ds_jobs_transformed[col] = ds_jobs_transformed[col].astype('int32')
    
    # Convert float columns to float16
    elif col == 'city_development_index':
        ds_jobs_transformed[col] = ds_jobs_transformed[col].astype('float16')
    
    # Convert columns containing ordered categorical data to ordered categories using dict
    elif col in ordered_cats.keys():
        category = pd.CategoricalDtype(ordered_cats[col], ordered=True)
        ds_jobs_transformed[col] = ds_jobs_transformed[col].astype(category)
    
    # Convert remaining columns to standard categories
    else:
        ds_jobs_transformed[col] = ds_jobs_transformed[col].astype('category')

# Filter students with 10 or more years experience at companies with at least 1000 employees
ds_jobs_transformed = ds_jobs_transformed[(ds_jobs_transformed['experience'] >= '10') & (ds_jobs_transformed['company_size'] >= '1000-4999')]
ds_jobs_transformed.head()

Column	Description
`student_id`	A unique identifier for each student.
`city`	A code representing the city where the student lives.
`city_development_index`	A scaled index that measures the development level of the city.
`gender`	The gender of the student.
`relevant_experience`	Indicates if the student has work experience relevant to their field.
`enrolled_university`	The type of university course the student is enrolled in, if any.
`education_level`	The highest education level attained by the student.
`major_discipline`	The main educational discipline or field of study of the student.
`experience`	The total number of years of work experience the student has.
`company_size`	The size of the company where the student is currently employed (number of employees).
`company_type`	The type of company the student works for.
`last_new_job`	The number of years between the student's current job and their previous job.
`training_hours`	The total number of training hours the student has completed.
`job_change`	A binary indicator where `1` means the student is looking for a new job, and `0` means they are not.

Project Description¶

Store the columns containing categories with only two factors as Booleans (bool)¶

Store the columns containing integers only as 32-bit integers (int32)¶

Store the columns containing floats as 16-bit floats (float16)¶

Store the columns containing nominal categorical data as the category data type¶

Store the columns containing ordinal categorical data ordered categories¶

Filter the final result to only contain students with 10 or more years of experience at companies with at least 1000 employees, as their recruiter base is suited to more experienced professionals at enterprise companies¶

	student_id	city	city_development_index	gender	relevant_experience	enrolled_university	education_level	major_discipline	experience	company_size	company_type	last_new_job	training_hours	job_change
0	8949	city_103	0.920	Male	Has relevant experience	no_enrollment	Graduate	STEM	>20	NaN	NaN	1	36	1.0
1	29725	city_40	0.776	Male	No relevant experience	no_enrollment	Graduate	STEM	15	50-99	Pvt Ltd	>4	47	0.0
2	11561	city_21	0.624	NaN	No relevant experience	Full time course	Graduate	STEM	5	NaN	NaN	never	83	0.0
3	33241	city_115	0.789	NaN	No relevant experience	NaN	Graduate	Business Degree	<1	NaN	Pvt Ltd	never	52	1.0
4	666	city_162	0.767	Male	Has relevant experience	no_enrollment	Masters	STEM	>20	50-99	Funded Startup	4	8	0.0

	student_id	city	city_development_index	gender	relevant_experience	enrolled_university	education_level	major_discipline	experience	company_size	company_type	last_new_job	training_hours	job_change
9	699	city_103	0.919922	NaN	True	no_enrollment	Graduate	STEM	17	10000+	Pvt Ltd	>4	123	False
12	25619	city_61	0.913086	Male	True	no_enrollment	Graduate	STEM	>20	1000-4999	Pvt Ltd	3	23	False
31	22293	city_103	0.919922	Male	True	Part time course	Graduate	STEM	19	5000-9999	Pvt Ltd	>4	141	False
34	26494	city_16	0.910156	Male	True	no_enrollment	Graduate	Business Degree	12	5000-9999	Pvt Ltd	3	145	False
40	2547	city_114	0.925781	Female	True	Full time course	Masters	STEM	16	1000-4999	Public Sector	2	14	False