This post walks through a TensorFlow implementation of Attention Is All You Need (the Transformer). I found a TensorFlow implementation on GitHub and ran the data-processing part myself. My notebook could not train the full model, so I only ran a single step to look at the translations and the corresponding loss before training, and I go over the results that others obtained from complete training runs.

For the details of the model itself, see my other blog post.

Parameter definitions:

Let's first create a flags.py file to store the parameters needed later.

import tensorflow as tf

# app parameter
tf.app.flags.DEFINE_string('mode', 'train', 'mode to train/test')
tf.app.flags.DEFINE_string('dataset', 'dummy', 'dataset')

# model parameter
tf.app.flags.DEFINE_integer('stack_num', 6, 'stack num')
tf.app.flags.DEFINE_integer('d_model', 512, 'model dimension')
tf.app.flags.DEFINE_integer('d_k', 64, 'key dim')
tf.app.flags.DEFINE_integer('d_v', 64, 'value dim')
tf.app.flags.DEFINE_integer('h', 8, 'stack of multihead attention')

# train parameters
tf.app.flags.DEFINE_integer('num_epochs', 5, 'num epochs')
tf.app.flags.DEFINE_integer('batch_size', 32, 'batch size')

tf.app.flags.DEFINE_float('dropout_keep', 0.9, 'dropout keep rate')
tf.app.flags.DEFINE_integer('pad_length', 60, 'pad length')
tf.app.flags.DEFINE_float('learn_rate', 1e-4, 'learn rate')
tf.app.flags.DEFINE_boolean('use_pretrained_vec', False, 'flag for pretrained vector')

FLAGS = tf.app.flags.FLAGS
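
These TF 1.x flags can be overridden from the command line once they are parsed. As a minimal sanity check (my own addition, not part of the original repository; the file name check_flags.py is hypothetical), the following script prints the parsed hyper-parameters; tf.app.run() parses the flags before calling main, so running python check_flags.py --batch_size 64 changes the batch size:

import tensorflow as tf
from flags import FLAGS

def main(_):
  # Print a few of the hyper-parameters defined in flags.py
  print('d_model=%d, h=%d, d_k=%d, d_v=%d' % (FLAGS.d_model, FLAGS.h, FLAGS.d_k, FLAGS.d_v))
  print('batch_size=%d, pad_length=%d, learn_rate=%g'
      % (FLAGS.batch_size, FLAGS.pad_length, FLAGS.learn_rate))

if __name__ == '__main__':
  tf.app.run()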

Data preprocessing:

We use the IWSLT 2016 German–English parallel corpus, which provides parallel data for building and evaluating machine translation systems. Create preprocess.py to process the data:

  • The training set consists of "train.tags.de-en.en" (196884 English sentences) and "train.tags.de-en.de" (393768 German sentences).
  • We build a vocabulary of every word that appears: 58641 + 2 English words and 126797 + 2 German words (the extra two entries are the <EOS> and <SOS> tokens).
  • Using the vocabularies, we convert each sentence into a sequence of word indices and pad every sentence to the fixed length pad_length, which simplifies the subsequent embedding lookup (a quick sanity check of these steps follows the script below).
import re
import tensorflow as tf
import operator
from flags import FLAGS

word_dict_a = {'<EOS>': 0, '<SOS>': 1}
word_dict_b = {'<EOS>': 0, '<SOS>': 1}

def parser(data):
  # Tokenize each (lowercased) sentence into words
  TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", re.UNICODE)
  return TOKENIZER_RE.findall(data.lower())

def transform(arr, word_dict):
  # Convert words to indices, adding unseen words to the vocabulary, for the embedding layer
  result = []
  for i in range(len(arr)):
    if arr[i] not in word_dict:
      word_dict[arr[i]] = len(word_dict)
    result.append(word_dict[arr[i]])
  return result

def padding(arr):
  # Pad each sentence to the fixed length pad_length
  pad_length = FLAGS.pad_length
  result = tf.keras.preprocessing.sequence.pad_sequences([arr], pad_length, padding='post')
  return result[0]

# Select the input/output file paths for the chosen dataset and mode
FLAGS.dataset = 'IWSLT16'
if FLAGS.dataset == 'dummy':
  file_base = './data/dummy/'
  if FLAGS.mode == 'train':
    file_a = file_base+'train.a.txt'
    file_b = file_base+'train.b.txt'
    file_a_out = file_base+'train.a.ids.txt'
    file_b_out = file_base+'train.b.ids.txt'
    file_vocab_a = file_base+'vocab.a.txt'
    file_vocab_b = file_base+'vocab.b.txt'
elif FLAGS.dataset == 'IWSLT16':
  file_base = './data/IWSLT16/'
  if FLAGS.mode == 'train':
    file_a = file_base+'train.tags.de-en.en'
    file_b = file_base+'train.tags.de-en.de'
    file_a_out = file_base+'train.en.ids.txt'
    file_b_out = file_base+'train.de.ids.txt'
    file_vocab_a = file_base+'vocab.en.txt'
    file_vocab_b = file_base+'vocab.de.txt'

for fin, fout, word_dict in [(file_a, file_a_out, word_dict_a),
    (file_b, file_b_out, word_dict_b)]:
  # Data preprocessing: converting words in training set into indexes and padding
  with open(fout, 'w') as f_out:
    with open(fin,'r',encoding='gbk',errors='ignore') as f:
      for line in f:
        if len(line) > 0 and line[0] == '<':
          continue
        word_ids = map(lambda x: str(x), padding(transform(parser(line), word_dict)))
        f_out.write(' '.join(word_ids) + '\n')

for file_vocab, word_dict in [(file_vocab_a, word_dict_a),
    (file_vocab_b, word_dict_b)]:
  # Building a vocabulary from words in a dictionary
  with open(file_vocab, 'w') as f_out:
    keys = list(map(lambda x: x[0],
        sorted(word_dict.items(), key=operator.itemgetter(1))))
    for key in keys:
      f_out.write(key + '\n')
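
As a quick sanity check of the helper functions (my own addition, not in the original script), appending the following lines to preprocess.py shows what a single sentence looks like after tokenization, indexing, and padding:

toy_dict = {'<EOS>': 0, '<SOS>': 1}
ids = transform(parser('Attention is all you need'), toy_dict)
print(ids)           # [2, 3, 4, 5, 6] -- each new word gets the next free index
print(padding(ids))  # the same ids followed by zeros up to length FLAGS.pad_length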

Model

Model Building and Code Annotation

import tensorflow as tf
import numpy as np
from flags import FLAGS

'''
Transformer modules
'''

'''
residual connection
'''
def add_and_norm(x, sub_x):
  with tf.variable_scope('add_and_norm'):
    sub_x = tf.nn.dropout(sub_x, FLAGS.dropout_keep)
    # LayerNorm(x+Sublayer(x))
    return tf.contrib.layers.layer_norm(x + sub_x)

def feed_forward(x, d_ff=2048):
  output_dim = x.get_shape()[-1]
  with tf.variable_scope('feed_forward'):
    x = tf.layers.dense(x, d_ff, activation=tf.nn.relu)
    x = tf.layers.dense(x, output_dim)
    return x

'''
head_i=Attention(QW^Q,KW^K,VW^V)
'''
def multihead_attention_block(vk_input, q_input, 
    batch_size, pad_length, d_model, d_k, d_v, masked=False):
  with tf.variable_scope('multihead_attention'):
    K = tf.layers.dense(vk_input, d_k, name='K', activation=tf.nn.relu) # W^K
    V = tf.layers.dense(vk_input, d_v, name='V', activation=tf.nn.relu) # W^V
    Q = tf.layers.dense(q_input, d_k, name='Q', activation=tf.nn.relu) # W^Q

    '''
    Scaled Dot-Product Attention
    '''
    # Mask (pad_length x pad_length)
    mask = tf.ones([pad_length, pad_length])
    if masked:
      # Lower-triangular mask: position i may only attend to positions <= i
      mask = tf.linalg.LinearOperatorLowerTriangular(mask).to_dense()
      # On older TF versions: tf.contrib.linalg.LinearOperatorTriL(mask).to_dense()
    mask = tf.reshape(tf.tile(mask, [batch_size, 1]),
        [batch_size, pad_length, pad_length])

    # Attention(Q,K,V)=softmax[(QK^T)/d_k^(1/2)]V
    attn = tf.nn.softmax(
        mask * (Q @ tf.transpose(K, [0, 2, 1])) / tf.sqrt(tf.to_float(d_k))) @ V

    return attn

'''
MultiHead(Q,K,V)=Concat(head_1,...,head_h)W^O
'''
def multihead_attention(vk_input, q_input, masked=False):
  outputs = []

  pad_length = FLAGS.pad_length
  batch_size = tf.shape(vk_input)[0]
  d_model = FLAGS.d_model
  d_k = FLAGS.d_k
  d_v = FLAGS.d_v
  h = FLAGS.h

  for i in range(h):
    outputs.append(
        multihead_attention_block(vk_input, q_input,
          batch_size, pad_length, d_model, d_k, d_v, masked=masked))
  outputs = tf.concat(outputs, axis=2)
  outputs = tf.layers.dense(outputs, d_model)
  return outputs

'''
Transformer Encoder block
Two sub-layers: (1) multi-head self-attention; (2) a position-wise fully connected feed-forward network.
Each sub-layer uses a residual connection followed by layer normalization.
'''
def encoder_block(inputs):
  # load hyper parameters

  with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
    flow = multihead_attention(inputs, inputs) # encoder self-attention
    flow = add_and_norm(inputs, flow)
    flow = add_and_norm(flow, feed_forward(flow))  # Fully Connected Forward Network
    return flow

'''
Transformer Decoder block
Three sub-layers: (1) masked multi-head self-attention; (2) encoder-decoder attention; (3) a position-wise fully connected feed-forward network.
Each sub-layer uses a residual connection followed by layer normalization.
'''
def decoder_block(outputs, encoder_outputs):
  # load hyper parameters

  with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
    flow = multihead_attention(outputs, outputs, masked=True) # decoder self-attention
    flow = add_and_norm(outputs, flow)
    flow = add_and_norm(flow, multihead_attention(encoder_outputs, flow)) # encoder-decoder attention
    flow = add_and_norm(flow, feed_forward(flow))
    return flow

'''
Positional Encoding
PE(pos,2i)=sin[pos/10000^(2i/d_model)]
PE(pos,2i+1)=cos[pos/10000^(2i/d_model)]
where pos is the position and i is the dimension index
'''
def positional_encoding(x):
  pad_length = FLAGS.pad_length
  d_model = FLAGS.d_model

  def sincos(x, i):
    if i%2 == 0:
      return np.sin(x)
    return np.cos(x)

  with tf.variable_scope('positional_encoding'):
    pe = tf.convert_to_tensor([sincos(pos/(10000**(2*i/d_model)), i)
      for pos in range(1, pad_length+1) for i in range(1, d_model+1)])
    pe = tf.reshape(pe, [-1, pad_length, d_model])
    return tf.add(x, pe)

'''
Transformer class
'''
class Transformer(object):
  def __init__(self, inputs=None, outputs=None, sparse_outputs=None):
    pad_length = FLAGS.pad_length
    d_model = FLAGS.d_model

    # Placeholder for the encoder input (inputs)
    if inputs is None:
      self.inputs = tf.placeholder(tf.float32, shape=[None, pad_length, d_model])
    else:
      self.inputs = inputs

    # Placeholder for the decoder input (outputs)
    if outputs is None:
      self.outputs = tf.placeholder(tf.float32, shape=[None, pad_length, d_model])
    else:
      self.outputs = outputs

    if sparse_outputs is None:
      self.sparse_outputs = tf.placeholder(tf.int32, shape=[None, pad_length])
    else:
      self.sparse_outputs = sparse_outputs

  def build_graph(self, output_dim):
    pad_length = FLAGS.pad_length
    N = FLAGS.stack_num
    learn_rate = FLAGS.learn_rate

    with tf.variable_scope('transformer'):
      # Add positional encodings to the (already embedded) encoder and decoder inputs
      inputs = positional_encoding(self.inputs)
      outputs = positional_encoding(self.outputs)

      # A stack of N identical encoder layers
      for i in range(N):
        with tf.variable_scope('enc_b_' + str(i)):
          inputs = encoder_block(inputs)

      # A stack of N identical decoder layers
      for i in range(N):
        with tf.variable_scope('dec_b_' + str(i)):
          outputs = decoder_block(outputs, inputs)

      # A linear transformation converts the decoder output into next-token logits
      # (the softmax is folded into the cross-entropy loss below)
      with tf.variable_scope('projection'):
        self.logits = tf.layers.dense(outputs, output_dim)
        self.predict = tf.argmax(self.logits, axis=2)

      # Loss definition: mask out padding and apply label smoothing
      with tf.variable_scope('loss'):
        EOS_ID = 0
        target_lengths = tf.reduce_sum(
            tf.to_int32(tf.not_equal(self.sparse_outputs, EOS_ID)), 1) + 1
        seq_mask = tf.sequence_mask(lengths=target_lengths,
            maxlen=pad_length,
            dtype=tf.float32)
        y_ = tf.one_hot(self.sparse_outputs, depth=output_dim)
        self.debug = self.logits
        ys = y_.get_shape().as_list()[-1]
        y_ = ((1-0.1) * y_) + (0.1 / ys)

        self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=y_)*seq_mask
        self.loss = tf.reduce_sum(self.loss, axis=1) / tf.to_float(target_lengths)
        self.loss = tf.reduce_mean(self.loss)

      tf.summary.scalar('loss', self.loss)

      # Adam optimizer
      optimizer = tf.train.AdamOptimizer(learn_rate, beta1=0.9, beta2=0.98, epsilon=1e-8)
      self.optimize_op = optimizer.minimize(self.loss)

      # Merge summaries so that the loss can be monitored in TensorBoard
      self.summary_op = tf.summary.merge_all()
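
One detail worth noting: multihead_attention_block applies the decoder mask by multiplying the scaled scores with a 0/1 lower-triangular matrix, so masked positions still contribute exp(0) to the softmax denominator. The paper instead masks by setting illegal positions to -inf before the softmax. A minimal sketch of that variant (my own, not taken from the repository):

import tensorflow as tf

def masked_attention(Q, K, V, mask, d_k):
  # Q, K: [batch, pad_length, d_k]; V: [batch, pad_length, d_v]
  # mask: [batch, pad_length, pad_length], 1 = may attend, 0 = blocked
  logits = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(tf.to_float(d_k))
  logits += (1.0 - mask) * -1e9   # blocked positions get a large negative logit
  return tf.matmul(tf.nn.softmax(logits), V)  # ...and therefore ~zero attention weight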

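Also note that positional_encoding above enumerates i over every one of the d_model dimensions and starts pos at 1, whereas in the paper each pair of dimensions (2i, 2i+1) shares one frequency. For comparison, a NumPy sketch of the paper's formula (my own, assuming d_model is even):

import numpy as np

def positional_encoding_np(pad_length, d_model):
  # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
  pe = np.zeros([pad_length, d_model], dtype=np.float32)
  pos = np.arange(pad_length)[:, None]          # [pad_length, 1]
  dim = np.arange(0, d_model, 2)[None, :]       # even dimension indices 2i
  angles = pos / np.power(10000.0, dim / np.float32(d_model))
  pe[:, 0::2] = np.sin(angles)                  # even dimensions
  pe[:, 1::2] = np.cos(angles)                  # odd dimensions
  return pe
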
Model training

import tensorflow as tf
from flags import FLAGS
from model import Transformer
from reader import data, source_vocab, target_vocab

input_vocab_size = len(source_vocab) # size of the encoder (source) vocabulary
output_vocab_size = len(target_vocab) # size of the decoder (target) vocabulary

# Embedding matrices for the source and target vocabularies (Xavier initialization)
initializer = tf.contrib.layers.xavier_initializer()
embedding_i = tf.get_variable('embedding_i', shape=[input_vocab_size,
  FLAGS.d_model], initializer=initializer)
embedding_o = tf.get_variable('embedding_o', shape=[output_vocab_size,
  FLAGS.d_model], initializer=initializer)
# Get the next (source, target) batch from the dataset iterator and embed it
inputs_op, outputs_op = data.get_next()
embed_inputs_op = tf.nn.embedding_lookup(embedding_i, inputs_op)
embed_outputs_op = tf.nn.embedding_lookup(embedding_o, outputs_op)

# Load Transformer
if FLAGS.use_pretrained_vec:
  model = Transformer()
else:
  model = Transformer(inputs=embed_inputs_op, outputs=embed_outputs_op,
      sparse_outputs=outputs_op)

model.build_graph(output_vocab_size)

# Training process
with tf.Session() as sess:
  sess.run([tf.global_variables_initializer(), data.initializer])
  train_writer = tf.summary.FileWriter('./summary/train', sess.graph)

  step = 0
  feed_dict = {}

  while True:
    try:
      # Read a batch of training data and run one optimization step,
      # fetching the corresponding summary, loss, and predictions
      if FLAGS.use_pretrained_vec:
        inputs, outputs, embed_inputs, embed_outputs = sess.run(
            [inputs_op, outputs_op, embed_inputs_op, embed_outputs_op])
        feed_dict = {model.inputs: embed_inputs,
          model.outputs: embed_outputs, model.sparse_outputs: outputs}
        _, summary, loss, predict = sess.run([model.optimize_op,
          model.summary_op, model.loss, model.predict],
          feed_dict=feed_dict)
      else:
        _, summary, loss, predict, inputs, outputs = sess.run([model.optimize_op,
          model.summary_op, model.loss, model.predict, inputs_op, outputs_op],
          feed_dict=feed_dict)

      if step % 77 == 0:
        train_writer.add_summary(summary, step)

        predict = predict.tolist()
        original = []
        result = []

        for p_i in predict[0]:
          result.append(target_vocab[p_i])
        for p_i in outputs[0]:
          original.append(target_vocab[p_i])

        if '<EOS>' in result:
          result = result[:result.index('<EOS>')]
        if '<EOS>' in original:
          original = original[:original.index('<EOS>')]

        original = ' '.join(original)
        result = ' '.join(result)


        print('step:'+str(step)+', loss: ' + str(loss))
        print(original)
        print(result)
        print('---')

      step += 1
    except tf.errors.OutOfRangeError:
      print('train done')
      break
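
The loop above trains with a fixed learning rate of 1e-4 (see flags.py). The paper instead varies the learning rate with a warmup schedule, lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5). A small sketch of that schedule (my addition, not wired into the training loop above):

def transformer_lrate(step, d_model=512, warmup_steps=4000):
  # Learning-rate schedule from the paper: linear warmup for warmup_steps,
  # then decay proportional to the inverse square root of the step number.
  step = float(max(step, 1))  # avoid division by zero at step 0
  return (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)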

Result

Here are the translation outputs and the corresponding loss I got from running the untrained model: