import os

# Path to the raw IMDb "aclImdb" dataset; train/ holds the labeled reviews.
imdb_dir = 'D:/src/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []  # 0 = negative, 1 = positive, same order as texts
texts = []   # raw review strings

# Read all reviews under train/neg and train/pos
# (12,500 negative + 12,500 positive .txt files).
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            # 'with' guarantees the handle is closed even if read() raises.
            # (Original fused `f.close()` and the `if` on one line — a syntax error.)
            with open(os.path.join(dir_name, fname), encoding='utf8') as f:
                texts.append(f.read())
            # Label follows the folder the file came from.
            labels.append(0 if label_type == 'neg' else 1)

# Sanity-check the loaded data.
print('texts 0:', texts[0])
print('texts len:', len(texts))
print('labels 0:', labels[0])
print('labels len:', len(labels))
# Build the vocabulary: rank every word in the corpus by frequency.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import math

validation_ratio = math.floor(len(texts) * 0.3)  # validation sample count: 30% of the corpus
max_words = 10000  # keep only the 10,000 most frequent words
maxlen = 200       # fix every review to 200 tokens

# Tokenizer restricted to the top max_words words; fitting builds the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # dict mapping word -> integer rank

# Inspect the tokenizing result.
print('Àüü¿¡¼ %s°³ÀÇ °íÀ¯ÇÑ ÅäÅ«À» ã¾Ò½À´Ï´Ù.' % len(word_index))
print('word_index type: ', type(word_index))
print('word_index: ', word_index)
# Convert each review from text to a list of word indices; only the
# top max_words (10,000) vocabulary entries survive the conversion.
data = tokenizer.texts_to_sequences(texts)
print('data 0:', data[0])

# Padding fixes every sequence to one length: too-short sequences are
# filled with 0 and too-long ones are truncated. This is needed whenever
# the samples must share a tensor shape; if one-hot encoding already
# fixes the length, padding could be skipped. Selection keeps the LAST
# tokens, and the nested list becomes a 2-D numpy array.
from keras.preprocessing.sequence import pad_sequences

# Tiny demonstration of pad_sequences on a nested list.
sequences = [[1, 2, 3, 4, 5], [1, 2, 3, 4], [1]]
padded = pad_sequences(sequences, maxlen=3)  # 2-D tensor
print(padded)

# Pad the real data to maxlen tokens per review.
data = pad_sequences(data, maxlen=maxlen)
print('data:', data)
print('data 0:', data[0])
# One-hot (multi-hot) encoding turns index sequences into pure 0/1 vectors.
def to_one_hot(sequences, dimension):
    """Encode each index sequence as a float vector of length `dimension`.

    Row i gets 1.0 at every index present in sequences[i], 0.0 elsewhere.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, word_ids in enumerate(sequences):
        encoded[row, word_ids] = 1.
    return encoded
# One-hot encode the padded sequences into 0/1 vectors.
# Labels are already 0/1, so the list only needs converting to a numpy
# array; float32 is forced because the default would be an int dtype.
data = to_one_hot(data, dimension=max_words)
labels = np.asarray(labels).astype('float32')
print('data:', data)
len(data[0])  # each row has 10,000 entries since dimension=max_words
print('data [0][0:100]:', data[0][0:100])
## Prepare the training and validation sets
print('µ¥ÀÌÅÍ ÅÙ¼ÀÇ Å©±â:', data.shape)    # (25000, 10000)
print('·¹À̺í ÅÙ¼ÀÇ Å©±â:', labels.shape)  # (25000,) — both are numpy tensors now

# Shuffle data and labels with ONE shared permutation so each review
# stays aligned with its label.
indices = np.arange(data.shape[0])  # 0 .. N-1
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# First 30% becomes validation, remaining 70% becomes training.
x_train = data[validation_ratio:]
y_train = labels[validation_ratio:]
x_val = data[:validation_ratio]
y_val = labels[:validation_ratio]
## Define the model
from keras.models import Sequential
from keras.layers import Dense

# Two ReLU hidden layers feeding a single sigmoid unit (binary classifier).
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(max_words,)))  # first hidden layer
model.add(Dense(32, activation='relu'))                            # second hidden layer
model.add(Dense(1, activation='sigmoid'))                          # output layer
model.summary()

# Compile: RMSprop adapts the step size with a moving average of squared
# gradients. The network outputs a probability, so crossentropy — which
# compares the predicted and true distributions — is the natural loss,
# and with two classes that means binary_crossentropy.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# Train: mini-batches of 32 for 10 epochs, validating on the held-out set.
# (Batch size is typically tuned somewhere between 32 and 512.)
# fit() returns a History object whose .history dict records every
# per-epoch metric observed during training.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
history_dict = history.history

# Persist the trained model as HDF5 (.h5), which stores the
# multidimensional numpy weight arrays.
model.save('text_binary_model.h5')

# Save the fitted Tokenizer (top-10,000 word vocabulary) so that new
# input sentences are mapped to the same indices at inference time.
import pickle
with open('text_binary_tokenizer', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Pull the accuracy and loss curves out of the history dictionary.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
print('Accuracy of each epoch:', acc)  # e.g. [0.79, 0.90, ..., 0.99]
epochs = range(1, len(acc) + 1)  # 1-based epoch axis for plotting

import matplotlib.pyplot as plt

# Validation accuracy trails training accuracy: as epochs grow the model
# becomes overly sensitive to the training data and generalizes worse.
plt.plot(epochs, acc, 'bo', label='Training Acc')
plt.plot(epochs, val_acc, 'b', label='Validation Acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()  # start a fresh figure for the loss curves

# Training loss keeps falling while validation loss rises — overfitting.
# Loss measures the distance between prediction and ground truth.
plt.plot(epochs, loss, 'bo', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()