IMDB Movie Review Text Classification with Genetic-Algorithm Feature Selection and a Single-Layer Perceptron
2023-12-23 21:38:26
1. Data Loading and Preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

max_features = 10000
maxlen = 200
batch_size = 32

# Load the IMDB dataset
print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

# Truncate/pad each review to a fixed length and keep only the first 2000 samples;
# the labels must be sliced the same way so that samples and labels stay aligned
print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)[:2000]
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)[:2000]
y_train = y_train[:2000]
y_test = y_test[:2000]
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

# Map the integer sequences back to words (indices are offset by 3 for reserved tokens)
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in input_train[0]])

# Represent each review with a bag-of-words model
vectorizer = CountVectorizer(max_features=max_features)
X_train = vectorizer.fit_transform([' '.join([reverse_word_index.get(i - 3, '?') for i in seq]) for seq in input_train])
X_test = vectorizer.transform([' '.join([reverse_word_index.get(i - 3, '?') for i in seq]) for seq in input_test])

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

batch_size = 2000  # one full batch per epoch for the 2000 retained samples
train_iter = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size)
test_iter = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size)
2. Building the Perceptron Model
# Define the perceptron network
class Perceptron(nn.Module):
    def __init__(self, input_size):
        super(Perceptron, self).__init__()
        self.fc = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

# Train the perceptron for one epoch
def train(model, iterator, optimizer, criterion):
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, label = batch
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()

# Evaluate the perceptron: average loss per batch and overall accuracy
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    with torch.no_grad():
        for batch in iterator:
            text, label = batch
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)
            total_loss += loss.item()
            rounded_preds = torch.round(predictions)
            total_correct += (rounded_preds == label).sum().item()
    return total_loss / len(iterator), total_correct / len(iterator.dataset)

# Instantiate the perceptron
input_size = X_train_tensor.shape[1]
model = Perceptron(input_size)
3. Model Training
# Define the loss function and the optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

N_EPOCHS = 10
eval_acc_list = []
for epoch in range(N_EPOCHS):
    train(model, train_iter, optimizer, criterion)
    eval_loss, eval_acc = evaluate(model, test_iter, criterion)
    eval_acc_list.append(eval_acc)
    print(f'Epoch: {epoch+1}, Test Loss: {eval_loss:.3f}, Test Acc: {eval_acc*100:.2f}%')

plt.plot(range(N_EPOCHS), eval_acc_list)
plt.title('Test Accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()
4. Feature Selection with a Genetic Algorithm
# Randomly initialize the chromosomes (one 0/1 mask over the features per individual)
def initialize_population(population_size, num_genes):
    # # Option 1: biased initialization that keeps roughly 95% of the features
    # p = np.array([0.05, 0.95])
    # return np.random.choice([0, 1], size=(population_size, num_genes), p=p.ravel())
    # Option 2: uniform initialization, each feature kept with probability 0.5
    return np.random.choice([0, 1], size=(population_size, num_genes))

# Fitness = test accuracy of the classifier trained on the selected features
def calculate_fitness(population, model, criterion):
    fitness = []
    for chromosome in population:  # each chromosome is a 0/1 sequence over features
        selected_features = np.where(chromosome == 1)[0]
        # Rebuild the input layer to match the number of selected features
        input_dim = len(selected_features)
        model.fc = nn.Linear(input_dim, 1)
        # The optimizer must be re-created because model.parameters() has changed
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        idx = torch.tensor(selected_features)
        train_iter = DataLoader(TensorDataset(X_train_tensor[:, idx], y_train_tensor), batch_size)
        test_iter = DataLoader(TensorDataset(X_test_tensor[:, idx], y_test_tensor), batch_size)
        # Train, then record the test accuracy as this chromosome's fitness
        N_EPOCHS = 10
        for epoch in range(N_EPOCHS):
            train(model, train_iter, optimizer, criterion)
        test_loss, test_acc = evaluate(model, test_iter, criterion)
        model.train()
        fitness.append(test_acc)
    return np.array(fitness)
# Selection
def selection(population, fitness):  # input: the population and its fitness (accuracy) values
    probabilities = fitness / sum(fitness)  # selection probability proportional to accuracy
    # # Option 1: deterministic, always take the two best individuals as parents
    # probabilities_copy = probabilities.copy()
    # probabilities_copy.sort()
    # max_1 = probabilities_copy[-1]
    # max_2 = probabilities_copy[-2]
    # max_1_index = np.where(probabilities == max_1)
    # max_2_index = np.where(probabilities == max_2)
    # selected_indices = [max_1_index[0].tolist()[0], max_2_index[0].tolist()[0]] * 25
    # Option 2: fitness-proportional random selection (roulette wheel)
    selected_indices = np.random.choice(range(len(population)), size=len(population), p=probabilities)
    return population[selected_indices]
# Crossover
def crossover(parents, crossover_rate):
    children = []
    for i in range(0, len(parents), 2):  # adjacent individuals form a parent pair
        parent1, parent2 = parents[i], parents[i + 1]
        if np.random.rand() < crossover_rate:
            # Single-point crossover: swap the tails of the two parents
            crossover_point = np.random.randint(1, len(parent1))
            child1 = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
            child2 = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))
        else:
            child1, child2 = parent1, parent2
        children.extend([child1, child2])
    return np.array(children)

# Mutation
def mutation(children, mutation_rate):
    for i in range(len(children)):
        # Flip each gene independently with probability mutation_rate
        mutation_points = np.where(np.random.rand(len(children[i])) < mutation_rate)[0]
        children[i][mutation_points] = 1 - children[i][mutation_points]
    return children
# Main loop of the genetic algorithm
def genetic_algorithm(population_size, num_genes, generations, crossover_rate, mutation_rate, model, criterion):
    # Initialize the chromosomes
    population = initialize_population(population_size, num_genes)
    fitness_list = []
    for generation in range(generations):
        print('Generation', generation + 1, ":")
        fitness = calculate_fitness(population, model, criterion)  # array of shape (population_size,) with test accuracies
        # Record the best individual of the current generation before the population
        # is replaced, so that fitness values and individuals stay aligned
        best_individual = population[np.argmax(fitness)]
        fitness_list.append(fitness.max())
        print(f"Generation {generation + 1}, Best Individual: {best_individual}, Fitness: {fitness.max()}")
        # Selection (adjacent rows of the result form parent pairs)
        selected_population = selection(population, fitness)
        # Crossover
        children = crossover(selected_population, crossover_rate)
        # Mutation
        mutated_children = mutation(children, mutation_rate)
        # The mutated children form the new population
        population = mutated_children
    plt.plot(range(generations), fitness_list)
    plt.title('Test Accuracy with feature selection via genetic algorithm')
    plt.xlabel('generation')
    plt.ylabel('accuracy')
    plt.show()
    # Return the best individual of the last evaluated generation
    return best_individual
# Run the genetic algorithm
model = Perceptron(input_size)
best_solution = genetic_algorithm(population_size=50, num_genes=input_size, generations=10,
                                  crossover_rate=0.8, mutation_rate=0.1, model=model, criterion=criterion)
print(f"Final Best Solution: {best_solution}")

# Interpret the best solution
selected_features = np.where(best_solution == 1)[0]
print(f"Selected Features: {selected_features}")
print("Shape of Selected Features = ", selected_features.shape)
Notes
- In this task, Option 1 in the selection function (always taking the two best chromosomes as parents) is more effective than Option 2 (fitness-proportional random selection): after 10 generations, validation accuracy is 74% vs. 71%.
- In this task, Option 1 in initialize_population (initially keeping about 95% of the features) works better than Option 2 (randomly keeping about 50%).
- Every time the model structure is rebuilt to match the number of selected input features, the optimizer variable must be re-declared as well (see the sketch after this list), because the optimizer is constructed from model.parameters().
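A minimal sketch of that last pattern, assuming chromosome, model, and optim are already defined as above (illustrative only, it mirrors what calculate_fitness does for each chromosome):

# Illustrative sketch: after swapping the input layer, the optimizer must be rebuilt,
# otherwise it still tracks the parameters of the old nn.Linear.
selected = np.where(chromosome == 1)[0]                # chromosome: a 0/1 feature mask
model.fc = nn.Linear(len(selected), 1)                 # new layer -> new Parameter objects
optimizer = optim.Adam(model.parameters(), lr=0.001)   # re-create so it sees the new parameters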
5. Contact Us
Email: oceannedlg@outlook.com
Source: https://blog.csdn.net/AAAAshin/article/details/135092644