1. Pure seq2seq
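Both variants below are TensorFlow 1.x graph-mode code and assume that the vocabulary sizes, sentence lengths and layer sizes are defined beforehand. A minimal setup sketch (the concrete values are illustrative assumptions, not taken from the original; the +1/+2 slots added to the vocabulary sizes in the code presumably reserve room for special tokens such as padding and start-of-sequence):

import tensorflow as tf
from tensorflow.contrib.rnn import EmbeddingWrapper, OutputProjectionWrapper

# Illustrative hyperparameters -- assumed values, adjust to your data
enc_vocab_size = 100       # encoder vocabulary size (code below adds +1)
dec_vocab_size = 100       # decoder vocabulary size (code below adds +2)
enc_sentence_length = 10   # number of encoder time steps
dec_sentence_length = 10   # code below uses dec_sentence_length+1 decoder steps
enc_emb_size = 30          # encoder embedding dimension
dec_emb_size = 30          # decoder embedding dimension
hidden_size = 50           # RNN hidden-state size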
enc_inputs = tf.placeholder(
    tf.int32,
    shape=[None, enc_sentence_length],
    name='input_sentences')   # enc_sentence_length: number of encoder time steps
sequence_lengths = tf.placeholder(
    tf.int32,
    shape=[None],
    name='sentences_length')  # true length of each sentence in the batch
dec_inputs = tf.placeholder(
    tf.int32,
    shape=[None, dec_sentence_length+1],
    name='output_sentences')  # dec_sentence_length+1: number of decoder time steps

# batch major => time major: move the time-step dimension to the front
enc_inputs_t = tf.transpose(enc_inputs, perm=[1, 0])
dec_inputs_t = tf.transpose(dec_inputs, perm=[1, 0])

with tf.device('/cpu:0'):
    # encoder embedding matrix
    enc_Wemb = tf.get_variable(
        'enc_word_emb',
        initializer=tf.random_uniform([enc_vocab_size+1, enc_emb_size]))
    # decoder embedding matrix
    dec_Wemb = tf.get_variable(
        'dec_word_emb',
        initializer=tf.random_uniform([dec_vocab_size+2, dec_emb_size]))
    # output projection: hidden state -> vocabulary logits
    dec_out_W = tf.get_variable(
        'dec_out_W',
        initializer=tf.random_uniform([hidden_size, dec_vocab_size+2]))
    dec_out_b = tf.get_variable(
        'dec_out_b',
        initializer=tf.random_uniform([dec_vocab_size+2]))

with tf.variable_scope('encoder'):
    enc_emb_inputs = tf.nn.embedding_lookup(enc_Wemb, enc_inputs_t)
    # enc_emb_inputs: list(enc_sentence_length) of [batch_size x embedding_size]
    # tensors, because `static_rnn` takes list inputs
    enc_emb_inputs = tf.unstack(enc_emb_inputs)

    enc_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
    enc_outputs, enc_last_state = tf.contrib.rnn.static_rnn(
        cell=enc_cell,
        inputs=enc_emb_inputs,
        sequence_length=sequence_lengths,
        dtype=tf.float32)

dec_outputs = []
dec_predictions = []
with tf.variable_scope('decoder') as scope:
    dec_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)

    for i in range(dec_sentence_length+1):
        if i == 0:
            # first step: feed the first decoder input and the encoder's final state
            input_ = tf.nn.embedding_lookup(dec_Wemb, dec_inputs_t[i])
            state = enc_last_state
        else:
            # later steps: feed back the previous greedy prediction (no teacher forcing)
            scope.reuse_variables()
            input_ = tf.nn.embedding_lookup(dec_Wemb, dec_prediction)

        # RNN step: dec_output and state are [batch_size x hidden_size]
        dec_output, state = dec_cell(input_, state)
        # project to vocabulary logits: [batch_size x dec_vocab_size+2]
        dec_output = tf.nn.xw_plus_b(dec_output, dec_out_W, dec_out_b)

        # dec_prediction: [batch_size]
        dec_prediction = tf.argmax(dec_output, axis=1)

        dec_outputs.append(dec_output)
        dec_predictions.append(dec_prediction)

# predictions: [batch_size x dec_sentence_length+1]
predictions = tf.transpose(tf.stack(dec_predictions), [1, 0])

# labels & logits: [dec_sentence_length+1 x batch_size x dec_vocab_size+2]
labels = tf.one_hot(dec_inputs_t, dec_vocab_size+2)
logits = tf.stack(dec_outputs)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    labels=labels, logits=logits))

# training_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
training_op = tf.train.RMSPropOptimizer(learning_rate=0.0001).minimize(loss)

2. Pure seq2seq with projection and embedding wrapping
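Compared with variant 1, this version builds the same computation but moves the bookkeeping into the cell: tf.contrib.rnn.EmbeddingWrapper creates the embedding matrix internally and embeds the integer ids fed to the cell, and tf.contrib.rnn.OutputProjectionWrapper appends a linear layer mapping the hidden state to dec_vocab_size+2 logits, so the hand-built enc_Wemb/dec_Wemb matrices and the dec_out_W/dec_out_b projection disappear from the code. Note that it reuses the same placeholder and scope names as variant 1, so build it in a fresh graph (e.g. after tf.reset_default_graph()) rather than on top of the first one.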
enc_inputs = tf.placeholder(
    tf.int32,
    shape=[None, enc_sentence_length],
    name='input_sentences')
sequence_lengths = tf.placeholder(
    tf.int32,
    shape=[None],
    name='sentences_length')
dec_inputs = tf.placeholder(
    tf.int32,
    shape=[None, dec_sentence_length+1],
    name='output_sentences')

# batch major => time major
enc_inputs_t = tf.transpose(enc_inputs, [1, 0])
dec_inputs_t = tf.transpose(dec_inputs, [1, 0])

with tf.variable_scope('encoder'):
    enc_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
    # the wrapper owns the embedding matrix and embeds the integer ids itself
    enc_cell = EmbeddingWrapper(enc_cell, enc_vocab_size+1, enc_emb_size)

    enc_outputs, enc_last_state = tf.contrib.rnn.static_rnn(
        cell=enc_cell,
        inputs=tf.unstack(enc_inputs_t),
        sequence_length=sequence_lengths,
        dtype=tf.float32)

dec_outputs = []
dec_predictions = []
with tf.variable_scope('decoder') as scope:
    dec_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
    dec_cell = EmbeddingWrapper(dec_cell, dec_vocab_size+2, dec_emb_size)
    # the wrapper projects the hidden state to dec_vocab_size+2 logits
    dec_cell = OutputProjectionWrapper(dec_cell, dec_vocab_size+2)

    for i in range(dec_sentence_length+1):
        if i == 0:
            # first step: raw integer ids go straight in; the wrapper embeds them
            input_ = dec_inputs_t[i]
            state = enc_last_state
        else:
            # later steps: feed back the previous greedy prediction (no teacher forcing)
            scope.reuse_variables()
            input_ = dec_prediction

        # dec_output: [batch_size x dec_vocab_size+2] (already projected)
        # state: [batch_size x hidden_size]
        dec_output, state = dec_cell(input_, state)

        # dec_prediction: [batch_size]
        dec_prediction = tf.argmax(dec_output, axis=1)

        dec_outputs.append(dec_output)
        dec_predictions.append(dec_prediction)

# predictions: [batch_size x dec_sentence_length+1]
predictions = tf.transpose(tf.stack(dec_predictions), [1, 0])

# labels & logits: [dec_sentence_length+1 x batch_size x dec_vocab_size+2]
labels = tf.one_hot(dec_inputs_t, dec_vocab_size+2)
logits = tf.stack(dec_outputs)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    labels=labels, logits=logits))

# training_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
training_op = tf.train.RMSPropOptimizer(learning_rate=0.0001).minimize(loss)
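Neither snippet shows how the graph is driven. Below is a minimal training-loop sketch under the setup assumptions above; enc_batch, dec_batch and seq_lens are hypothetical names, and the random ids only illustrate the expected shapes:

import numpy as np

batch_size = 4
# enc_batch: [batch_size, enc_sentence_length] padded integer ids (toy data)
enc_batch = np.random.randint(1, enc_vocab_size, size=(batch_size, enc_sentence_length))
# dec_batch: [batch_size, dec_sentence_length+1] padded integer ids (toy data)
dec_batch = np.random.randint(1, dec_vocab_size, size=(batch_size, dec_sentence_length + 1))
# seq_lens: true (unpadded) encoder lengths
seq_lens = np.full(batch_size, enc_sentence_length)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(100):
        _, loss_val, preds = sess.run(
            [training_op, loss, predictions],
            feed_dict={enc_inputs: enc_batch,
                       sequence_lengths: seq_lens,
                       dec_inputs: dec_batch})
        if epoch % 10 == 0:
            print('epoch {}: loss {:.4f}'.format(epoch, loss_val))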