import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Train a 128-dimensional skip-gram (sg=1) Word2Vec model on the tokenised corpus.
dimension = 128
sentences = [st.split() for st in cleaned_words]
w2v_model = Word2Vec(sentences, min_count=1, vector_size=dimension, workers=6, sg=1, epochs=1000)

# Persist the model and reload it (gensim's native save/load format).
w2v_model.save('w2v_model.bin')
new_model = Word2Vec.load('w2v_model.bin')
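# Optional sanity check (an illustrative sketch, not part of the original
# pipeline): probe the embeddings with an arbitrary in-vocabulary word and
# inspect its nearest neighbours.
probe_word = next(iter(new_model.wv.key_to_index))
print(probe_word, new_model.wv.most_similar(probe_word, topn=5))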
# Build the embedding matrix: row i holds the Word2Vec vector for the word the
# tokenizer mapped to index i (index 0 is reserved for padding and stays zero).
embedding_matrix = np.zeros((vocab_size, dimension))
for word, i in train_word_tokenizer.word_index.items():
    if word in new_model.wv.key_to_index:
        embedding_matrix[i] = new_model.wv[word]
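# Coverage check (an added sketch): count how many tokenizer words actually
# received a pretrained vector; uncovered rows stay all-zero.
covered = int(embedding_matrix.any(axis=1).sum())
print(f'{covered}/{vocab_size - 1} vocabulary words covered by Word2Vec vectors')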
adam = Adam(learning_rate=0.0001)
def create_model(vocab_size, max_length):
    # Frozen pretrained embeddings feed a bidirectional GRU, followed by a
    # small fully connected head with dropout for regularisation.
    model = Sequential()
    model.add(Embedding(vocab_size, dimension, input_length=max_length,
                        weights=[embedding_matrix], trainable=False))
    model.add(Bidirectional(GRU(128, activation='relu')))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation='softmax'))
    return model
model = create_model(vocab_size, max_length)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()
# Checkpoint the weights that achieve the lowest validation loss.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

EPOCHS = 10
hist = model.fit(train_X, train_Y, epochs=EPOCHS, batch_size=BS,
                 validation_data=(val_X, val_Y), callbacks=[checkpoint])
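# Visualising the run (an added sketch): model.fit returns a History object
# whose .history dict holds the per-epoch metrics logged above.
import matplotlib.pyplot as plt

plt.plot(hist.history['loss'], label='train loss')
plt.plot(hist.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.ylabel('categorical cross-entropy')
plt.legend()
plt.show()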
# Reload the best checkpoint and evaluate it on the held-out validation set.
predict_model = load_model(filename)
predict_model.summary()

score = predict_model.evaluate(val_X, val_Y, verbose=0)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])
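# Inference sketch (illustrative: the sample text is a made-up placeholder, and
# the padding side is an assumption; mirror whatever preprocessing and padding
# produced train_X).
from tensorflow.keras.preprocessing.sequence import pad_sequences

sample = ['this is an example sentence']
seq = train_word_tokenizer.texts_to_sequences(sample)
padded = pad_sequences(seq, maxlen=max_length, padding='post')
probs = predict_model.predict(padded)
print('predicted class index:', probs.argmax(axis=-1)[0])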