When training with multiple GPUs, the input fed at each step contains batch_size * num_gpus examples, so model training time can be greatly reduced.
In TensorFlow, a specific GPU can be selected with tf.device(). For example, to use GPU 0:
gpu_ind = 0
with tf.device("/gpu:{}".format(gpu_ind)):
    ...  # ops built inside this block are placed on GPU 0
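As a minimal, self-contained sketch of device placement (the matmul graph and session config below are illustrative, not from OpenSeq2Seq; TensorFlow 1.x is assumed):

import tensorflow as tf

gpu_ind = 0
with tf.device("/gpu:{}".format(gpu_ind)):
    a = tf.random_normal([1024, 1024])
    b = tf.random_normal([1024, 1024])
    c = tf.matmul(a, b)  # this matmul is pinned to GPU 0

# allow_soft_placement lets ops without a GPU kernel fall back to CPU;
# log_device_placement prints where each op actually runs
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    print(sess.run(tf.reduce_sum(c)))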
The rest of this post walks through multi-GPU model training; the code is adapted from OpenSeq2Seq: https://github.com/NVIDIA/OpenSeq2Seq. The multi-GPU model is defined in OpenSeq2Seq/model/model_base.py.
First, the input placeholders are defined and then split into per-GPU feeds:
# placeholders for feeding data
self.x = tf.placeholder(tf.int32, [self.global_batch_size, None])
self.x_length = tf.placeholder(tf.int32, [self.global_batch_size])
self.y = tf.placeholder(tf.int32, [self.global_batch_size, None])
self.y_length = tf.placeholder(tf.int32, [self.global_batch_size])

# below we follow data parallelism for multi-GPU training
# actual per GPU data feeds
xs = tf.split(value=self.x, num_or_size_splits=num_gpus, axis=0)
x_lengths = tf.split(value=self.x_length, num_or_size_splits=num_gpus, axis=0)
ys = tf.split(value=self.y, num_or_size_splits=num_gpus, axis=0)
y_lengths = tf.split(value=self.y_length, num_or_size_splits=num_gpus, axis=0)

Each placeholder is sized to the global batch, so every training step must be fed batch_size * num_gpus examples; tf.split then hands each GPU an equal slice along axis 0. A small standalone sketch of this split follows.
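This sketch is not from OpenSeq2Seq; the sizes are made up purely to show that each split receives its own contiguous slice of the global batch:

import numpy as np
import tensorflow as tf

num_gpus = 2
per_gpu_batch = 4
global_batch_size = per_gpu_batch * num_gpus  # 8 examples fed per step

x = tf.placeholder(tf.int32, [global_batch_size, None])
xs = tf.split(value=x, num_or_size_splits=num_gpus, axis=0)  # two [4, None] tensors

with tf.Session() as sess:
    feed = np.arange(global_batch_size * 5).reshape(global_batch_size, 5)
    x0, x1 = sess.run(xs, feed_dict={x: feed})
    print(x0.shape, x1.shape)  # (4, 5) (4, 5)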
For each of the num_gpus GPUs, tf.device() pins the ops to that device, and the per-GPU forward pass, the loss, and the update strategy are built as follows:

eval_ops = []
losses = []
for gpu_ind in range(0, num_gpus):
    with tf.device("/gpu:{}".format(gpu_ind)), tf.variable_scope(
            name_or_scope=tf.get_variable_scope(),
            # re-using variables across GPUs
            reuse=force_var_reuse or (gpu_ind > 0)):
        deco_print("Building graph on GPU:{}".format(gpu_ind))
        if self.mode == "train" or self.mode == "eval":
            sample_ops, loss_i = self._build_forward_pass_graph(
                source_sequence=xs[gpu_ind],
                src_length=x_lengths[gpu_ind],
                target_sequence=ys[gpu_ind],
                tgt_length=y_lengths[gpu_ind],
                gpu_id=gpu_ind)
            losses.append(loss_i)
            if self.mode == "eval":
                eval_ops.append(sample_ops)
        elif self.mode == "infer":
            self._build_forward_pass_graph(
                source_sequence=xs[gpu_ind],
                src_length=x_lengths[gpu_ind],
                gpu_id=gpu_ind)
        else:
            raise ValueError("Unknown mode")
# end of for gpu_ind loop

if self.mode != "infer":
    self._eval_ops = eval_ops
    self._eval_y = ys
    # the training objective is the mean of the per-GPU losses
    self.loss = tf.reduce_mean(losses)

    def exp_decay(learning_rate, var_global_step):
        # exponential decay that only kicks in after 'begin_decay_at' steps
        new_lr = tf.train.exponential_decay(
            learning_rate=learning_rate,
            global_step=var_global_step,
            decay_steps=self.model_params['decay_steps'],
            decay_rate=self.model_params['decay_rate'],
            staircase=self.model_params['use_staircase_decay'])
        boundaries = [self.model_params['begin_decay_at']]
        values = [learning_rate, new_lr]
        min_rate = self.model_params['min_learning_rate']
        final_lr = tf.maximum(
            tf.train.piecewise_constant(x=tf.to_int32(var_global_step),
                                        boundaries=boundaries,
                                        values=values),
            min_rate)
        self._lr = final_lr
        return final_lr

    lr_decay_fn = exp_decay if ('use_decay' in self.model_params and
                                self.model_params['use_decay']) else None

    if self.model_params['optimizer'].lower() == 'momentum':
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.model_params['learning_rate'],
            momentum=0.9 if 'opt_momentum' not in self.model_params
            else self.model_params['opt_momentum'])
    else:
        optimizer = self.model_params['optimizer']

    if self._mode == "train":
        self._lr = tf.Variable(
            initial_value=self.model_params['learning_rate'],
            trainable=False)
        # a single optimizer updates the shared variables;
        # colocate_gradients_with_ops keeps each tower's gradients on its own GPU
        self.train_op = tf.contrib.layers.optimize_loss(
            loss=self.loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=self.model_params['learning_rate'],
            optimizer=optimizer,
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=None if 'max_grad_norm' not in self.model_params
            else self.model_params['max_grad_norm'],
            learning_rate_decay_fn=lr_decay_fn,
            update_ops=None,
            variables=None,
            name="Loss_Optimization",
            summaries=["learning_rate", "loss", "gradients", "gradient_norm"],
            colocate_gradients_with_ops=True,
            increment_global_step=True)

        deco_print("Trainable variables:")
        total_params = 0
        for var in tf.trainable_variables():
            var_params = 1
            for dim in var.get_shape():
                var_params *= dim.value
            total_params += var_params
            print('Name: {} | Shape: {} | Dtype: {}'.format(
                var.name, var.get_shape(), var.dtype))
        deco_print('Total trainable parameters: %d' % total_params)
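Putting the pieces together, the data-parallel pattern is: one variable scope shared across GPUs (reuse turned on after the first tower), one loss per GPU, the mean of those losses as the training objective, and a single optimizer whose gradients are colocated with each tower's ops. Below is a condensed, self-contained sketch of that pattern with a toy linear model; toy_tower and the shapes are made up for illustration and are not part of OpenSeq2Seq (TensorFlow 1.x with tf.contrib is assumed):

import tensorflow as tf

num_gpus = 2
per_gpu_batch = 4

# global-batch placeholders, split into per-GPU slices
x = tf.placeholder(tf.float32, [per_gpu_batch * num_gpus, 8])
y = tf.placeholder(tf.float32, [per_gpu_batch * num_gpus, 1])
xs = tf.split(x, num_gpus, axis=0)
ys = tf.split(y, num_gpus, axis=0)

def toy_tower(inp, tgt):
    # variables are created by the first tower and reused by the rest
    w = tf.get_variable("w", [8, 1])
    b = tf.get_variable("b", [1])
    pred = tf.matmul(inp, w) + b
    return tf.reduce_mean(tf.square(pred - tgt))

losses = []
for gpu_ind in range(num_gpus):
    with tf.device("/gpu:{}".format(gpu_ind)), \
         tf.variable_scope(tf.get_variable_scope(), reuse=(gpu_ind > 0)):
        losses.append(toy_tower(xs[gpu_ind], ys[gpu_ind]))

loss = tf.reduce_mean(losses)  # average the per-GPU losses
train_op = tf.contrib.layers.optimize_loss(
    loss=loss,
    global_step=tf.train.get_or_create_global_step(),
    learning_rate=0.01,
    optimizer="SGD",
    # compute each tower's gradients on the GPU that built its forward pass
    colocate_gradients_with_ops=True)

Running train_op with a feed of per_gpu_batch * num_gpus examples then performs one synchronized update of the shared variables.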