Here I will help you by providing template code that you can modify according to your needs. I am only going to do this for one algorithm; it is then your responsibility to adapt it for the others.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # used for the correlation heatmap below
# read the dataset
df = pd.read_csv("data.csv")
# shape of the dataset
df.shape
# column names of the dataset
df.columns
# descriptive statistics
df.describe()
# checking for missing values
df.isnull().sum()
# checking the data type of each column
df.dtypes
# visualizing the relationship between the features and the target variable
for column in df.columns:
    if column != 'target':
        plt.scatter(df[column], df['target'])
        plt.xlabel(column)
        plt.ylabel('target')
        plt.show()
# visualizing the distribution of the features
for column in df.columns:
    plt.hist(df[column])
    plt.xlabel(column)
    plt.ylabel('count')
    plt.show()
# visualizing the correlation between the features
plt.figure(figsize=(12, 10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
# Data Transformation including removing or imputing missing values, standardization, and normalization
# imputing missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit and transform in one step; wrap the result back into a DataFrame
# so that column access (df['target']) keeps working below
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
# standardization (applied to the feature columns only, so the class labels in 'target' stay intact)
from sklearn.preprocessing import StandardScaler
feature_cols = [c for c in df.columns if c != 'target']
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])
# normalization (note: standardization and normalization are usually alternatives, not applied together; keep whichever suits your data)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
df[feature_cols] = normalizer.fit_transform(df[feature_cols])
# Class balancing
# checking the number of classes
df['target'].value_counts()
# splitting the dataset into feature and target variables
X = df.drop(['target'], axis=1)
y = df['target']
# balancing the dataset using the SMOTE technique (requires the imbalanced-learn package)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
# checking the number of classes after balancing
pd.Series(y_res).value_counts()
# Principal Component Analysis
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_res)
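The choice of three components above is an assumption of this template, not a rule. A quick way to check whether it keeps enough information is to look at the explained variance:
# sanity check: how much of the variance the 3 components retain;
# raise n_components if this total is too low for your data
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())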
# Model training
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_pca, y_res)
# Model Validation including performance metrics
# (note: predicting on the training data, as done here, gives optimistic scores;
# a fairer estimate with a held-out split is sketched right after this block)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
y_pred = model.predict(X_pca)
print("Accuracy: ", accuracy_score(y_res, y_pred))
# precision/recall/F1 assume a binary target by default; pass average='weighted' for multi-class
print("Precision: ", precision_score(y_res, y_pred))
print("Recall: ", recall_score(y_res, y_pred))
print("F1 Score: ", f1_score(y_res, y_pred))
# Feature importance
# (after PCA these importances refer to the principal components, not the original columns)
feature_importances = model.feature_importances_
# plotting the components in descending order of their importance
indices = np.argsort(feature_importances)[::-1]
plt.figure(figsize=(15,10))
plt.title("Feature Importance")
plt.bar(range(X_pca.shape[1]), feature_importances[indices], color="r", align="center")
plt.xticks(range(X_pca.shape[1]), indices)
plt.xlim([-1, X_pca.shape[1]])
plt.show()
For the most part the code will remain the same for other algorithms; as the sketch below shows, swapping one in usually changes only a couple of lines. I suggest you look up any terms you find unfamiliar and refer to the documentation of the libraries used here; it should make the remaining work easier. Treat the other algorithms as homework. It's fun, give it a try.
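For illustration, here is a minimal sketch of swapping in a different classifier (RandomForestClassifier is just an example, not part of the original template); the rest of the pipeline is unchanged:
from sklearn.ensemble import RandomForestClassifier
# only the model definition changes; training and evaluation stay the same
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_pca, y_res)
y_pred = model.predict(X_pca)
print("Accuracy: ", accuracy_score(y_res, y_pred))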