
It takes a lot of manual effort to complete the evaluation process, as even a single college may contain thousands of students.
In this data science project, we will evaluate the performance of students using machine learning techniques and Python.
You can download the dataset needed for this project (StudentsPerformance.csv, the "Students Performance in Exams" dataset) from Kaggle.
Let's start by importing the libraries:
# for some basic operations import numpy as np import pandas as pd # for visualizations import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import dabl
To read the data set :
# Load the students-performance dataset into a DataFrame.
data = pd.read_csv('StudentsPerformance.csv')

# Report how many rows and columns we are working with.
print(data.shape)
#Output- (1000, 8)
To look at the first 5 records in the data set
# Preview the first five records of the dataset.
data.head()

Descriptive Statistics
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Let's check the number of unique values present in each categorical column:
# Count the distinct values in every categorical (object-dtype) column.
categorical_columns = data.select_dtypes('object')
categorical_columns.nunique()
#Output gender 2 race/ethnicity 5 parental level of education 6 lunch 2 test preparation course 2 dtype: int64
Let's check the percentage of missing data in each column of the dataset:
# Fraction of missing values per column: NaN count divided by the row count.
# NOTE: the original variable was named no_of_columns, but data.shape[0] is
# the number of ROWS — renamed to avoid the misleading name.
no_of_rows = data.shape[0]
percentage_of_missing_data = data.isnull().sum() / no_of_rows
print(percentage_of_missing_data)
#Output gender 0.0 race/ethnicity 0.0 parental level of education 0.0 lunch 0.0 test preparation course 0.0 math score 0.0 reading score 0.0 writing score 0.0 dtype: float64
To see comparison of all other attributes with respect to Math Marks
# Use dabl's automatic plotting to compare every other attribute
# against the math score.
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col='math score')



Comparison of all other attributes with respect to Reading Marks :
# Use dabl's automatic plotting to compare every other attribute
# against the reading score.
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col='reading score')



Let's check the effect of lunch on the students' performance:
# Median score per subject, broken down by lunch type and gender.
score_columns = ['math score', 'writing score', 'reading score']
data.groupby(['lunch', 'gender'])[score_columns].median()

Let's check the effect of the test preparation course on scores:
# Median score per subject, broken down by test-preparation status and gender.
score_columns = ['math score', 'writing score', 'reading score']
data.groupby(['test preparation course', 'gender'])[score_columns].median()

Data Visualizations
Visualizing the number of male and female in the data set
# Bar chart of how many male and female students appear in the dataset.
plt.rcParams['figure.figsize'] = (15, 5)
sns.countplot(data['gender'], palette='bone')
plt.title('Comparison of Males and Females', fontweight=30)
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

Visualizing the different groups in the data set
# Bar chart of how many students fall into each race/ethnicity group.
plt.rcParams['figure.figsize'] = (15, 9)
plt.style.use('ggplot')
sns.countplot(data['race/ethnicity'], palette='pink')
plt.title('Comparison of various groups', fontweight=30, fontsize=20)
plt.xlabel('Groups')
plt.ylabel('count')
plt.show()

Visualizing the different parental education levels
# Bar chart of the parental level of education distribution.
plt.rcParams['figure.figsize'] = (15, 9)
plt.style.use('fivethirtyeight')
sns.countplot(data['parental level of education'], palette='Blues')
plt.title('Comparison of Parental Education', fontweight=30, fontsize=20)
plt.xlabel('Degree')
plt.ylabel('count')
plt.show()

Visualizing Maths score
# Bar chart of how often each distinct math score occurs.
plt.rcParams['figure.figsize'] = (15, 9)
plt.style.use('tableau-colorblind10')
sns.countplot(data['math score'], palette='BuPu')
plt.title('Comparison of math scores', fontweight=30, fontsize=20)
plt.xlabel('score')
plt.ylabel('count')
plt.xticks(rotation=90)  # many distinct scores; rotate labels so they fit
plt.show()

Computing the total score for each student
# Silence seaborn/matplotlib deprecation chatter for distplot.
import warnings
warnings.filterwarnings('ignore')

# Total marks per student across the three subjects.
data['total_score'] = data['math score'] + data['reading score'] + data['writing score']

sns.distplot(data['total_score'], color='magenta')
plt.title('comparison of total score of all the students', fontweight=30, fontsize=20)
plt.xlabel('total score scored by the students')
plt.ylabel('count')
plt.show()

Computing percentage for each of the students
# Per-student percentage: the average of the three subject scores, rounded up.
import warnings
warnings.filterwarnings('ignore')

# Vectorized np.ceil replaces the original element-wise loop, which
# (a) hard-coded range(0, 1000) instead of the actual row count and
# (b) used chained indexing (data['percentage'][i] = ...), a pandas
# SettingWithCopy hazard. Values are identical: ceil of total/3.
data['percentage'] = np.ceil(data['total_score'] / 3)

plt.rcParams['figure.figsize'] = (15, 9)
sns.distplot(data['percentage'], color='orange')
plt.title('Comparison of percentage scored by all the students', fontweight=30, fontsize=20)
plt.xlabel('Percentage scored')
plt.ylabel('Count')
plt.show()

Assigning grades to the students according to the following criteria (a failing student always gets grade E): 0 - 39 marks : grade E 40 - 59 marks : grade D 60 - 69 marks : grade C 70 - 79 marks : grade B 80 - 89 marks : grade A 90 - 100 marks : grade O
def getgrade(percentage, status):
    """Map a percentage (0-100) and a pass/fail status to a letter grade.

    A student whose status is 'Fail' always receives grade 'E';
    otherwise the grade is determined by the percentage bands below.
    """
    if status == 'Fail':
        return 'E'
    if percentage >= 90:
        return 'O'
    if percentage >= 80:
        return 'A'
    if percentage >= 70:
        return 'B'
    if percentage >= 60:
        return 'C'
    if percentage >= 40:
        return 'D'
    return 'E'

# NOTE(review): the 'status' column is assumed to be created in an earlier
# step that is not shown here — confirm it exists before this runs.
data['grades'] = data.apply(lambda row: getgrade(row['percentage'], row['status']), axis=1)
data['grades'].value_counts()
#Output B 260 C 252 D 223 A 156 O 58 E 51 Name: grades, dtype: int64
Label Encoding
from sklearn.preprocessing import LabelEncoder

# One shared encoder instance; each fit_transform call re-fits it on the
# column at hand, so every column is encoded independently.
le = LabelEncoder()

# NOTE(review): pass_math / pass_reading / pass_writing / status are assumed
# to be created in an earlier step that is not shown here.
for column in ['test preparation course', 'lunch',
               'parental level of education', 'gender',
               'pass_math', 'pass_reading', 'pass_writing', 'status']:
    data[column] = le.fit_transform(data[column])

# race/ethnicity has a natural group ordering, so map it explicitly.
data['race/ethnicity'] = data['race/ethnicity'].replace(
    {'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5})
Data Preparation
Splitting the dependent and independent variables
# The first 14 columns are the features; the 15th (index 14) is the target.
x = data.iloc[:, :14]
y = data.iloc[:, 14]

print(x.shape)
print(y.shape)
#Output-
(1000, 14)
(1000,)
Splitting the data set into training and test sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=45)

for part in (x_train, y_train, x_test, y_test):
    print(part.shape)
#Output
(750, 14)
(750,)
(250, 14)
(250,)
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to [0, 1]; fit on the training split only,
# then apply the same transform to the test split.
mm = MinMaxScaler()
x_train = mm.fit_transform(x_train)
x_test = mm.transform(x_test)
Applying principal components analysis
from sklearn.decomposition import PCA

# First pass: keep every component, only to inspect the variance profile.
pca = PCA(n_components=None)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Second pass: re-fit keeping only the top two components.
# NOTE(review): this re-fits on the already-rotated data from the first
# pass — equivalent here since a full PCA transform is an orthogonal
# rotation, but worth confirming this was intentional.
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
Modelling
Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the PCA-reduced features.
model = LogisticRegression()
model.fit(x_train, y_train)

# Predict on the held-out split and report accuracy on both splits.
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))
print("Testing Accuracy :", model.score(x_test, y_test))
Output-
Training Accuracy : 0.3933333333333333
Testing Accuracy : 0.424
Printing the confusion matrix
from sklearn.metrics import confusion_matrix

# Heatmap of prediction errors for the logistic-regression model.
cm = confusion_matrix(y_test, y_pred)
plt.rcParams['figure.figsize'] = (8, 8)
sns.heatmap(cm, annot=True, cmap='Greens')
plt.title('Confusion Matrix for Logistic Regression', fontweight=30, fontsize=20)
plt.show()

Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier on the same PCA-reduced features.
model = RandomForestClassifier()
model.fit(x_train, y_train)

# Predict on the held-out split and report accuracy on both splits.
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))
print("Testing Accuracy :", model.score(x_test, y_test))
Output–
Training Accuracy : 0.9986666666666667
Testing Accuracy : 0.784
from sklearn.metrics import confusion_matrix

# Heatmap of prediction errors for the random-forest model.
cm = confusion_matrix(y_test, y_pred)
plt.rcParams['figure.figsize'] = (8, 8)
sns.heatmap(cm, annot=True, cmap='Reds')
plt.title('Confusion Matrix for Random Forest', fontweight=30, fontsize=20)
plt.show()

from pandas.plotting import radviz

# Radial (RadViz) projection of all features, colored by the target class.
fig, ax = plt.subplots(figsize=(12, 12))
new_df = x.copy()
new_df["status"] = y
radviz(new_df, "status", ax=ax, colormap="rocket")
plt.title('Radial Visualization for Target', fontsize=20)
plt.show()

This gives a clear idea that students getting very low grades are strongly associated with the lunch type and the parental level of education.
Amazing project Aman sir
[…] Data Science Project – Student Performance Analysis with Machine Learning […]
This is great 👍
It’s help me to grow my knowledge.thanks brother.
Thanks, Jeet😀, Keep Visiting us.