I used random oversampling to handle the imbalance between positive and negative examples. Without oversampling I got 88% accuracy; after oversampling only the training data I got 87%; and after oversampling both the training and test data I got 84%. Here's the oversampling code:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by randomly duplicating minority-class samples.
# IMPORTANT: only the TRAINING data may be resampled. Oversampling the test
# set changes its class distribution and makes every metric computed on it
# unrepresentative of real-world performance — this is why accuracy appeared
# to drop when the test set was resampled.
ros = RandomOverSampler(random_state=0)
Train_X2_rose, Train_Y2_rose = ros.fit_resample(Train_X2_Tfidf, Train_Y2)

# Keep the *_rose names for the test split so downstream code is unchanged,
# but leave the test data itself untouched.
Test_X2_rose, Test_Y2_rose = Test_X2_Tfidf, Test_Y2
#classification model trial 10
def reset_seeds():
    """Seed NumPy, Python's random module, and TensorFlow for reproducible runs.

    Note: the body of this function was not indented in the original paste,
    which is a syntax error — restored here.
    """
    np.random.seed(0)
    python_random.seed(0)
    tf.random.set_seed(0)

reset_seeds()
# Shallow feed-forward classifier over the TF-IDF features:
# one 10-unit hidden layer and a single sigmoid output for binary labels.
# NOTE(review): sigmoid as the *hidden* activation is unusual (relu is the
# common choice) — kept as-is to preserve the original behavior.
model10 = Sequential([
    Dense(10, input_dim=Train_X2_rose.shape[1], activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

opt = Adam(learning_rate=0.01)
model10.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)
model10.summary()

# Stop early once validation loss has not improved for 10 consecutive epochs.
es = EarlyStopping(monitor="val_loss", mode='min', patience=10)
history10 = model10.fit(
    Train_X2_rose,
    Train_Y2_rose,
    epochs=1000,
    verbose=1,
    validation_split=0.2,
    batch_size=64,
    callbacks=[es],
)
#prediction and confusion matrix
# Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
# NOTE(review): metrics here are only meaningful if Test_X2_rose/Test_Y2_rose
# are the ORIGINAL (un-resampled) test data — confirm upstream.
y_pred = model10.predict(Test_X2_rose) > 0.5

# Compute the confusion matrix once and reuse it (the original recomputed it
# four times, once with the result silently discarded).
cm = confusion_matrix(Test_Y2_rose, y_pred)
print(cm)
print(classification_report(Test_Y2_rose, y_pred))
print('Confusion matrix:')

# Heatmap of the confusion matrix: rows = true labels, columns = predictions.
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(cm, annot=True, fmt=".0f", ax=ax)
plt.xlabel("y_head")
plt.ylabel("y_true")
plt.show()
Why does this decrease in accuracy occur? Is there something wrong with my code? I ask because I ran hyperparameter tuning across 18 models, and some of the oversampled models produced very strange loss/accuracy plots in which the training and validation curves diverged, splitting apart one above the other.