Show and Tell: Lessons Learned from the 2015 MSCOCO
Image Captioning Challenge (Code)
The image captioning task is: given an image, generate a sentence that describes the information the image contains. It has two parts, image feature extraction and sentence (word sequence) generation, in which a CNN and an RNN, respectively, play the central role.
As shown in the following figure, the 4096-dimensional image feature extracted by the CNN is fed to the LSTM as its first input, and the words describing the image content are fed as the inputs at the subsequent time steps. The LSTM is mainly used to learn the dependencies between the words of the corresponding sentence.
The exact expression is shown in the following illustration:
As shown in the following illustration, the high-dimensional (4096-d) image feature and each one-hot word must both be encoded into the same fixed dimension; the encoded image feature and the encoded words of the description are then used as the inputs to the LSTM.
The specific code is as follows:
# -*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import cPickle

from tensorflow.models.rnn import rnn_cell
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter
from cnn_util import *


class Caption_generator():
    """Image-caption model: an encoded image feature followed by words is fed to an LSTM.

    The image feature is projected to ``dim_hidden`` and used as the first
    LSTM input; the embedded caption words are the inputs at the following
    time steps.  The LSTM output is projected back to the vocabulary to
    predict the next word.

    NOTE(review): this listing was recovered from a case-mangled copy.  The
    casing of the class name and of the weight attributes / variable names
    (``Wemb``, ``encode_img_W``, ``embed_word_W``) is a reconstruction --
    confirm against callers and any saved checkpoints.
    """

    def init_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        """Return a ``[dim_in, dim_out]`` variable drawn from a truncated normal.

        The standard deviation is ``stddev / sqrt(dim_in)``.
        """
        return tf.Variable(
            tf.truncated_normal(
                [dim_in, dim_out],
                stddev=stddev / math.sqrt(float(dim_in))),
            name=name)

    def init_bias(self, dim_out, name=None):
        """Return a zero-initialised bias variable of length ``dim_out``."""
        return tf.Variable(tf.zeros([dim_out]), name=name)

    def __init__(self, dim_image, dim_embed, dim_hidden, batch_size,
                 n_lstm_steps, n_words, bias_init_vector=None):
        """Create the model parameters.

        Args:
            dim_image: dimension of the input image feature.
            dim_embed: dimension of the word embedding.  The image encoding
                (``dim_hidden``) and the word embedding are fed to the same
                LSTM cell, so the two dimensions are expected to match.
            dim_hidden: number of units in the LSTM cell.
            batch_size: batch size used by ``build_model``.
            n_lstm_steps: number of unrolled LSTM steps (sequence length).
            n_words: vocabulary size.
            bias_init_vector: optional array used (cast to float32) as the
                initial value of the output word bias; zeros when ``None``.
        """
        # The builtin ``int`` is what ``np.int`` aliased; the alias no longer
        # exists in recent NumPy releases.
        self.dim_image = int(dim_image)
        self.dim_embed = int(dim_embed)
        self.dim_hidden = int(dim_hidden)
        self.batch_size = int(batch_size)
        self.n_lstm_steps = int(n_lstm_steps)  # sequence length
        self.n_words = int(n_words)

        # Word embedding: encodes each word into a fixed dimension.
        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(
                tf.random_uniform([n_words, dim_embed], -0.1, 0.1),
                name='Wemb')

        self.bemb = self.init_bias(dim_embed, name='bemb')

        # The LSTM cell shared by training and generation.
        self.lstm = rnn_cell.BasicLSTMCell(dim_hidden)

        # Image encoder: projects the image feature into a fixed dimension.
        self.encode_img_W = tf.Variable(
            tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1),
            name='encode_img_W')
        self.encode_img_b = self.init_bias(dim_hidden, name='encode_img_b')

        # Output projection: maps the LSTM output back to the vocabulary.
        self.embed_word_W = tf.Variable(
            tf.random_uniform([dim_hidden, n_words], -0.1, 0.1),
            name='embed_word_W')

        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(
                bias_init_vector.astype(np.float32), name='embed_word_b')
        else:
            self.embed_word_b = self.init_bias(n_words, name='embed_word_b')

    def build_model(self):
        """Build the training graph.

        Returns:
            A tuple ``(loss, image, sentence, mask)`` where ``loss`` is the
            masked cross-entropy summed over steps 1..n_lstm_steps-1 and
            divided by ``sum(mask[:, 1:])``, and the other three are the
            input placeholders:
            ``image`` float32 ``[batch_size, dim_image]``,
            ``sentence`` int32 ``[batch_size, n_lstm_steps]`` (word indices),
            ``mask`` float32 ``[batch_size, n_lstm_steps]``.
        """
        image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
        # Word indices of the caption that belongs to each image.
        sentence = tf.placeholder(
            tf.int32, [self.batch_size, self.n_lstm_steps])
        # Per-position weight applied to the loss
        # (presumably 1 for real words and 0 for padding -- confirm with caller).
        mask = tf.placeholder(
            tf.float32, [self.batch_size, self.n_lstm_steps])

        # Encode the image feature into a fixed dimension.
        image_emb = (tf.matmul(image, self.encode_img_W)
                     + self.encode_img_b)  # (batch_size, dim_hidden)

        # LSTM state (c and h), initialised to zero.
        state = tf.zeros([self.batch_size, self.lstm.state_size])

        loss = 0.0
        with tf.variable_scope("RNN"):
            # Per the original note, n_lstm_steps is maxlen + 1.
            for i in range(self.n_lstm_steps):
                if i == 0:
                    # t = -1: the LSTM input is the encoded image.
                    current_emb = image_emb
                else:
                    with tf.device("/cpu:0"):
                        # Embed the previous word, sentence[:, i-1].
                        current_emb = tf.nn.embedding_lookup(
                            self.Wemb, sentence[:, i - 1]) + self.bemb

                if i > 0:
                    # Share the LSTM weights created at step 0.
                    tf.get_variable_scope().reuse_variables()

                # LSTM output and new state.
                output, state = self.lstm(
                    current_emb, state)  # (batch_size, dim_hidden)

                if i > 0:
                    # labels: the i-th word of every caption in the batch.
                    labels = tf.expand_dims(
                        sentence[:, i], 1)  # (batch_size) -> (batch_size, 1)
                    # indices: the row index that goes with each label.
                    indices = tf.expand_dims(
                        tf.range(0, self.batch_size, 1),
                        1)  # (batch_size) -> (batch_size, 1)
                    # Pairs of (batch index, label).
                    concated = tf.concat(1, [indices, labels])
                    # One-hot matrix: 1.0 at each (row, label), 0.0 elsewhere.
                    onehot_labels = tf.sparse_to_dense(
                        concated,
                        tf.pack([self.batch_size, self.n_words]),
                        1.0, 0.0)  # (batch_size, n_words)

                    # Project the LSTM output onto the vocabulary.
                    logit_words = (
                        tf.matmul(output, self.embed_word_W)
                        + self.embed_word_b)  # (batch_size, n_words)

                    # Masked cross-entropy for this time step.
                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                        logit_words, onehot_labels)
                    cross_entropy = cross_entropy * mask[:, i]

                    current_loss = tf.reduce_sum(cross_entropy)
                    loss = loss + current_loss

            # Normalise by the total mask weight of the predicted positions.
            loss = loss / tf.reduce_sum(mask[:, 1:])

        return loss, image, sentence, mask

    def build_generator(self, maxlen):
        """Build the test-time graph that greedily decodes a caption.

        Args:
            maxlen: maximum number of words to generate.

        Returns:
            A tuple ``(image, generated_words)``: the float32
            ``[1, dim_image]`` image placeholder and a list of ``maxlen``
            tensors, each holding the arg-max word index of one step.
        """
        image = tf.placeholder(tf.float32, [1, self.dim_image])
        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b

        state = tf.zeros([1, self.lstm.state_size])
        generated_words = []

        with tf.variable_scope("RNN"):
            # t = -1: feed the encoded image feature.
            output, state = self.lstm(image_emb, state)
            # Wemb is the (n_words, dim_embed) word embedding; decoding
            # starts from word index 0
            # (presumably the start-of-sentence token -- confirm with vocab).
            last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb

            for i in range(maxlen):
                tf.get_variable_scope().reuse_variables()

                # t = 0, 1, ...: LSTM step on the previously chosen word.
                output, state = self.lstm(last_word, state)

                # Project the output onto the vocabulary.
                logit_words = (tf.matmul(output, self.embed_word_W)
                               + self.embed_word_b)
                # Index of the highest-scoring word given the image and
                # the words generated so far.
                max_prob_word = tf.argmax(logit_words, 1)

                with tf.device("/cpu:0"):
                    last_word = tf.nn.embedding_lookup(
                        self.Wemb, max_prob_word)

                last_word += self.bemb

                generated_words.append(max_prob_word)

        return image, generated_words