Below is your code, tested and fully working:
# Column subsets kept from each CSV; the training subset additionally
# carries the prediction target 'Full_Time_Home_Goals'.
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']

data_train = pd.read_csv(r"train.csv")
data_test = pd.read_csv(r"test.csv")

# Keep only the selected columns and discard rows with any missing value.
data_test = data_test.loc[:, col].dropna()
data_train = data_train.loc[:, columns].dropna()

# Cast the target to int; this runs after dropna, so the column holds no NaN.
data_train = data_train.astype({'Full_Time_Home_Goals': int})
from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Label-encode the team columns of both frames with shared encoders.

    For each of 'HomeTeam' and 'AwayTeam', one LabelEncoder is fitted on the
    values of that column taken from BOTH frames, so a label that appears
    only in the test frame can still be transformed. Each column gets its
    own encoder, so the same team name may map to different integers in
    'HomeTeam' and in 'AwayTeam'.

    Args:
        df_train: DataFrame containing 'HomeTeam' and 'AwayTeam' columns.
        df_test: DataFrame containing 'HomeTeam' and 'AwayTeam' columns.

    Returns:
        A ``(df_train, df_test)`` tuple of encoded copies. The frames passed
        in are not modified.
    """
    features = ['HomeTeam', 'AwayTeam']

    # Work on copies so the caller's frames are not mutated as a side effect.
    df_train = df_train.copy()
    df_test = df_test.copy()

    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder().fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
# Replace both frames with their label-encoded versions, then show the first
# rows of each as a quick sanity check.
data_train, data_test = encode_features(data_train, data_test)
for _frame in (data_train, data_test):
    print(_frame.head())
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# y_all is the column to predict; X_all holds the model inputs.
# 'Id' is dropped together with the target: it is a row identifier, not a
# property of the match, so it must not be used as a feature.
y_all = data_train['Full_Time_Home_Goals']
X_all = data_train.drop(['Full_Time_Home_Goals', 'Id'], axis=1)

num_test = 0.20  # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=23
)

# Random forest tuned by an exhaustive grid search over the values below.
# The forest is seeded like the split so the reported accuracy is repeatable.
clf = RandomForestClassifier(random_state=23)
parameters = {
    'n_estimators': [4, 6, 9],
    # 'auto' is omitted: for classifiers it was an alias of 'sqrt', and
    # newer scikit-learn releases no longer accept it.
    'max_features': ['log2', 'sqrt'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has already been refitted on
# all of X_train, so no further fit() call is needed.
clf = grid_obj.best_estimator_

# Accuracy on the held-out 20 % of the training data.
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

# Predict for the real test set using exactly the columns the model was
# trained on; the ids are kept aside only to label the output rows.
ids = data_test['Id']
predictions = clf.predict(data_test[X_all.columns])
df_preds = pd.DataFrame({"id": ids, "predictions": predictions})
df_preds
Id HomeTeam AwayTeam Full_Time_Home_Goals
0 1 55 440 3
1 2 158 493 2
2 3 178 745 1
3 4 185 410 1
4 5 249 57 2
Id HomeTeam AwayTeam
0 190748 284 54
1 190749 124 441
2 190750 446 57
3 190751 185 637
4 190752 749 482
0.33213786556261704
id predictions
0 190748 1
1 190749 1
2 190750 1
3 190751 1
4 190752 1
... ... ...
375 191123 1
376 191124 1
377 191125 1
378 191126 1
379 191127 1
380 rows × 2 columns
6
Solved: Predicting numerical features based on string features using scikit-learn