发布时间:2024-11-04 16:48:34
本内容由, 集智官方收集发布,仅供参考学习,不代表集智官方赞同其观点或证实其内容的真实性准确性,请勿用于商业用途。
以下是一个基于RNN + 注意力机制的中日文翻译模型训练的代码博客。这个示例将逐步引导您使用TensorFlow和Keras构建一个基于双向LSTM的神经机器翻译模型,并加入了注意力机制来增强模型的翻译能力。
首先,安装必要的库:
pip install tensorflow pandas scikit-learn
假设我们有一个 xlsx 文件(例如 translation_dataset.xlsx),包含如下字段:id、日文翻译、中文原句、小说名称、小说作者、分词、章节id、预置状态。我们将加载这个文件并提取中日文句子,进行数据的分词和处理。
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
# Load the parallel corpus from the Excel file.
# NOTE(review): assumes columns '日文翻译' (Japanese translation) and
# '中文原句' (Chinese source sentence) exist in the sheet — verify against the file.
data = pd.read_excel('translation_dataset.xlsx')
# Extract the Japanese / Chinese sentence arrays.
japanese_texts = data['日文翻译'].values
chinese_texts = data['中文原句'].values
# 80/20 train/validation split; fixed seed for reproducibility.
train_japanese, val_japanese, train_chinese, val_chinese = train_test_split(japanese_texts, chinese_texts, test_size=0.2, random_state=42)
为了让模型能够理解文本,需要先将句子转化为序列格式。我们使用 Tokenizer 为中日文本构建分词器,并将文本转化为整数序列。
# 使用 tf.keras.preprocessing.text.Tokenizer 构建分词器
def build_tokenizer(texts, max_vocab_size=10000, oov_token='<OOV>'):
    """Fit a Keras ``Tokenizer`` on *texts* and return it.

    Args:
        texts: iterable of raw sentences.
        max_vocab_size: keep only this many most-frequent words.
        oov_token: placeholder substituted for out-of-vocabulary words.

    Returns:
        A fitted ``tf.keras.preprocessing.text.Tokenizer``.
    """
    tok = tf.keras.preprocessing.text.Tokenizer(
        num_words=max_vocab_size,
        oov_token=oov_token,
    )
    tok.fit_on_texts(texts)
    return tok
# 构建中日文分词器
japanese_tokenizer = build_tokenizer(train_japanese)
chinese_tokenizer = build_tokenizer(train_chinese)
# 将文本转换为序列
def tokenize_texts(tokenizer, texts, max_len=50):
    """Turn *texts* into integer-id sequences, post-padded/truncated to *max_len*."""
    ids = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(
        ids, maxlen=max_len, padding='post'
    )
# 将训练集和验证集转换为序列
train_japanese_seq = tokenize_texts(japanese_tokenizer, train_japanese)
val_japanese_seq = tokenize_texts(japanese_tokenizer, val_japanese)
train_chinese_seq = tokenize_texts(chinese_tokenizer, train_chinese)
val_chinese_seq = tokenize_texts(chinese_tokenizer, val_chinese)
# 词汇表大小和最大序列长度
japanese_vocab_size = len(japanese_tokenizer.word_index) + 1
chinese_vocab_size = len(chinese_tokenizer.word_index) + 1
max_seq_len = 50
使用双向LSTM来构建编码器和解码器,并加入注意力机制来帮助模型更好地捕获句子的上下文信息。
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Bidirectional, Concatenate
from tensorflow.keras.models import Model
# 定义注意力机制
class Attention(tf.keras.layers.Layer):
    """Bahdanau-style additive attention.

    Scores every encoder timestep against the current decoder hidden state
    and returns the attention-weighted sum of encoder outputs (the context
    vector) together with the weights themselves.
    """

    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = Dense(units)  # projects encoder outputs
        self.W2 = Dense(units)  # projects the decoder hidden state
        self.V = Dense(1)       # collapses each timestep to a scalar score

    def call(self, encoder_output, decoder_hidden):
        # (batch, hidden) -> (batch, 1, hidden) so it broadcasts over time.
        hidden_expanded = tf.expand_dims(decoder_hidden, 1)
        # Additive score: V(tanh(W1*enc + W2*dec)) -> (batch, time, 1).
        score = self.V(tf.nn.tanh(self.W1(encoder_output) + self.W2(hidden_expanded)))
        # Normalise the scores along the time axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted sum of encoder outputs -> (batch, enc_hidden).
        context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)
        return context_vector, attention_weights
# 定义编码器
class Encoder(tf.keras.layers.Layer):
    """Bidirectional-LSTM encoder: token ids -> (outputs, state_h, state_c)."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = Bidirectional(LSTM(enc_units, return_sequences=True, return_state=True))

    def call(self, x):
        embedded = self.embedding(x)
        # A bidirectional LSTM returns the final state of each direction.
        output, fwd_h, fwd_c, bwd_h, bwd_c = self.lstm(embedded)
        # Concatenate the directions so the states have width 2 * enc_units,
        # which matches the decoder's LSTM size.
        state_h = Concatenate()([fwd_h, bwd_h])
        state_c = Concatenate()([fwd_c, bwd_c])
        return output, state_h, state_c
# 定义解码器
class Decoder(tf.keras.layers.Layer):
    """Single-step LSTM decoder with additive attention over encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        # Width 2*dec_units so the states line up with the bidirectional
        # encoder's concatenated states.
        self.lstm = LSTM(dec_units * 2, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)  # vocabulary logits (no softmax — from_logits loss)
        self.attention = Attention(dec_units)

    def call(self, x, enc_output, hidden):
        # Attend over the encoder outputs with the previous decoder state.
        context_vector, attention_weights = self.attention(enc_output, hidden)
        embedded = self.embedding(x)
        # Prepend the context vector to the embedded input token.
        lstm_in = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        output, state_h, state_c = self.lstm(lstm_in)
        # (batch, 1, units) -> (batch, units) before the output projection.
        flat = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat)
        return logits, state_h, state_c, attention_weights
# Hyperparameters and model instantiation.
embedding_dim = 256
units = 512
batch_size = 64
encoder = Encoder(japanese_vocab_size, embedding_dim, units, batch_size)
decoder = Decoder(chinese_vocab_size, embedding_dim, units, batch_size)
optimizer = tf.keras.optimizers.Adam()
# from_logits=True because Decoder.fc emits raw logits; reduction='none'
# so loss_function's padding mask can zero out individual positions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
# Loss: sparse cross-entropy with padding positions (token id 0) masked out.
def loss_function(real, pred):
    """Masked cross-entropy loss.

    Args:
        real: int tensor of target token ids for one timestep; 0 is padding.
        pred: float tensor of logits, shape (batch, vocab).

    Returns:
        Scalar mean loss over the non-padding positions only.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # Fix: average over real tokens rather than all positions — a plain
    # reduce_mean over the masked loss shrinks the loss in proportion to
    # the amount of padding. Guard against an all-padding batch.
    return tf.reduce_sum(loss_) / tf.maximum(tf.reduce_sum(mask), 1.0)
# One teacher-forced optimisation step over a full batch.
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one training step and return the per-timestep average batch loss.

    NOTE(review): *enc_hidden* is accepted but never used — the encoder
    produces its own states. Kept for interface compatibility with callers.
    """
    step_loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp)
        # Seed the decoder with the encoder's final (concatenated) h state.
        dec_hidden = enc_hidden_h
        # First decoder input: the <start> token, one per sequence in the batch.
        dec_input = tf.expand_dims([chinese_tokenizer.word_index['<start>']] * batch_size, 1)
        # Teacher forcing: feed the ground-truth token at every step.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _, _ = decoder(dec_input, enc_output, dec_hidden)
            step_loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = step_loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(step_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
将数据集构建为批处理数据集,并开始训练模型。
# Build a batched tf.data pipeline over the tokenized training pairs.
BUFFER_SIZE = len(train_japanese_seq)
steps_per_epoch = BUFFER_SIZE // batch_size
# Shuffle the full set; drop the remainder so every batch is exactly
# batch_size wide, which train_step's <start>-token row requires.
train_dataset = tf.data.Dataset.from_tensor_slices((train_japanese_seq, train_chinese_seq)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
# Training loop.
EPOCHS = 20
for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        # NOTE(review): the third argument is meant to be an initial hidden
        # state, but the encoder *object* is passed; harmless only because
        # train_step ignores enc_hidden — confirm and clean up.
        batch_loss = train_step(inp, targ, encoder)
        total_loss += batch_loss
    print(f'Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f}')
在训练结束后,我们可以测试模型的翻译效果。
def translate(sentence):
    """Greedily decode a Japanese *sentence* into a space-joined Chinese token string.

    Stops when the <end> token is produced or after max_seq_len steps.
    """
    source_ids = tokenize_texts(japanese_tokenizer, [sentence], max_len=max_seq_len)
    enc_output, enc_hidden_h, _ = encoder(source_ids)
    dec_hidden = enc_hidden_h
    # Decoding starts from the <start> token, batch of one.
    dec_input = tf.expand_dims([chinese_tokenizer.word_index['<start>']], 0)
    tokens = []
    for _ in range(max_seq_len):
        predictions, dec_hidden, _, _ = decoder(dec_input, enc_output, dec_hidden)
        predicted_id = tf.argmax(predictions[0]).numpy()
        tokens.append(chinese_tokenizer.index_word.get(predicted_id, '<unk>'))
        if chinese_tokenizer.index_word.get(predicted_id) == '<end>':
            break
        # Greedy search: feed the prediction back in as the next input.
        dec_input = tf.expand_dims([predicted_id], 0)
    return ' '.join(tokens)
# Smoke-test the trained model on a single Japanese sentence.
test_sentence = "これはテストの文章です。" # sample Japanese input
print("翻译结果:", translate(test_sentence))
模型使用 SparseCategoricalCrossentropy 作为损失函数,通过掩码忽略填充标记对损失的影响。通过上述步骤,您可以训练一个中日文本翻译模型。这个模型实现了实际应用中的中日文本翻译,可用于多语言NLP任务的开发与评估。
这类数据集包含成对或多对语言的文本样本,每一对文本表示相同内容的不同语言版本。目的是训练机器翻译模型,使其能够将一种语言的文本准确地翻译成另一种语言。用于开发和优化自动翻译系统,提高跨语言沟通的效率和准确性。