PyTorch 多 GPU 训练(模型定义部分):
# -*- coding:utf-8 -*-
from __future__
import division
import datetime
import torch
import torch.nn
as nn
import torch.nn.functional
as F
from torch.autograd
import Variable
import numpy
as np
from PIL
import Image
from utils.parse_config
import *
from utils.utils
import build_targets
from collections
import defaultdict
def create_modules(module_defs):
    """Construct the module list of layer blocks described by ``module_defs``.

    The first entry of ``module_defs`` is removed with ``pop(0)`` and used as
    the hyper-parameter block, so the caller's list is mutated: afterwards it
    holds exactly one entry per element of the returned module list.

    Args:
        module_defs: list of dict-like layer descriptions.  Each remaining
            entry needs a ``'type'`` of ``'convolutional'``, ``'upsample'``,
            ``'route'``, ``'shortcut'`` or ``'yolo'`` plus the keys read below.

    Returns:
        Tuple ``(hyperparams, module_list)`` where ``module_list`` is an
        ``nn.ModuleList`` containing one ``nn.Sequential`` per layer entry.
    """
    # Build the YOLOv3 network structure from the parsed cfg file.
    hyperparams = module_defs.pop(0)
    # output_filters[0] is the network input channel count; the channel count
    # produced by layer k is stored at output_filters[k + 1].
    output_filters = [int(hyperparams['channels'])]
    module_list = nn.ModuleList()

    for i, module_def in enumerate(module_defs):
        modules = nn.Sequential()
        layer_type = module_def['type']
        # Layers that do not change the channel count (upsample, yolo) carry
        # the previous value forward; this also keeps `filters` defined when
        # such a layer comes first.
        filters = output_filters[-1]

        if layer_type == 'convolutional':
            bn = int(module_def['batch_normalize'])
            filters = int(module_def['filters'])
            kernel_size = int(module_def['size'])
            pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0
            modules.add_module(
                'conv_%d' % i,
                nn.Conv2d(
                    in_channels=output_filters[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def['stride']),
                    padding=pad,
                    # The conv bias is dropped when batch norm follows.
                    bias=not bn,
                ),
            )
            if bn:
                modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters))
            if module_def['activation'] == 'leaky':
                modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1))

        elif layer_type == 'upsample':
            upsample = nn.Upsample(
                scale_factor=int(module_def['stride']), mode='nearest')
            modules.add_module('upsample_%d' % i, upsample)

        elif layer_type == 'route':
            layers = [int(x) for x in module_def["layers"].split(',')]
            # Index per-layer outputs only (skip the input-channel entry) so
            # that a non-negative layer index refers to that layer's output
            # rather than to the layer before it.
            layer_filters = output_filters[1:]
            filters = sum(layer_filters[layer_i] for layer_i in layers)
            modules.add_module('route_%d' % i, EmptyLayer())

        elif layer_type == 'shortcut':
            # Same indexing rule as 'route'.
            filters = output_filters[1:][int(module_def['from'])]
            modules.add_module("shortcut_%d" % i, EmptyLayer())

        elif layer_type == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Extract the anchors selected by this layer's mask.
            all_anchors = module_def["anchors"]
            anchors = [all_anchors[idx] for idx in anchor_idxs]
            num_classes = int(module_def['classes'])
            img_height = int(hyperparams['height'])
            # Define detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_height)
            modules.add_module('yolo_%d' % i, yolo_layer)

        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list
class EmptyLayer(nn.Module):
    """Stand-in module registered for 'route' and 'shortcut' entries.

    It owns no parameters or sub-modules and defines no forward pass.
    """

    def __init__(self):
        super().__init__()
class YOLOLayer(nn.Module):
    """Detection layer.

    Decodes a raw feature map into box / confidence / class predictions and,
    when targets are supplied, returns the loss terms for this layer instead.
    """

    def __init__(self, anchors, num_classes, image_dim):
        super().__init__()
        self.anchors = anchors
        self.scaled_anchors = None
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        # Channels per anchor: 4 box terms + 1 confidence + one per class.
        self.bbox_attrs = 5 + num_classes
        self.image_dim = image_dim
        self.ignore_thres = 0.5
        # forward() reads coord_scale and class_scale only; the object and
        # no-object scales are stored but not used inside this class.
        self.coord_scale = 1
        self.noobject_scale = 1
        self.object_scale = 5
        self.class_scale = 1
        self.seen = 0
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()

    def forward(self, x, targets=None):
        """Decode ``x`` and return either detections or loss terms.

        Args:
            x: feature map that is viewed as
                ``(batch, num_anchors, bbox_attrs, grid, grid)``.
            targets: ground-truth tensor handed to ``build_targets``; ``None``
                selects the inference path.

        Returns:
            Without targets: tensor of shape
            ``(batch, num_anchors * grid * grid, 5 + num_classes)`` with box
            values multiplied by the stride.
            With targets: tuple ``(loss, loss_x, loss_y, loss_w, loss_h,
            loss_conf, loss_cls, recall)``; only ``loss`` is a tensor.
        """
        batch = x.size(0)
        grid = x.size(2)
        stride = self.image_dim / grid

        # Pick tensor constructors that match the input's device.
        on_gpu = x.is_cuda
        FloatTensor = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if on_gpu else torch.LongTensor

        prediction = x.view(batch, self.num_anchors, self.bbox_attrs, grid, grid)
        prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

        # Split the last axis into its components.
        cx = torch.sigmoid(prediction[..., 0])          # center x
        cy = torch.sigmoid(prediction[..., 1])          # center y
        raw_w = prediction[..., 2]                      # width
        raw_h = prediction[..., 3]                      # height
        conf = torch.sigmoid(prediction[..., 4])        # confidence
        pred_cls = torch.sigmoid(prediction[..., 5:])   # class scores

        # Per-cell offsets: grid_x varies along the last axis, grid_y along
        # the one before it.
        columns = torch.linspace(0, grid - 1, grid).repeat(grid, 1)
        tiles = batch * self.num_anchors
        grid_x = columns.repeat(tiles, 1, 1).view(cx.shape).type(FloatTensor)
        grid_y = columns.t().repeat(tiles, 1, 1).view(cy.shape).type(FloatTensor)

        # Anchors expressed in grid-cell units.
        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
        anchor_table = FloatTensor(scaled_anchors)
        anchor_w = anchor_table.index_select(1, LongTensor([0]))
        anchor_h = anchor_table.index_select(1, LongTensor([1]))
        cells = grid * grid
        anchor_w = anchor_w.repeat(batch, 1).repeat(1, 1, cells).view(raw_w.shape)
        anchor_h = anchor_h.repeat(batch, 1).repeat(1, 1, cells).view(raw_h.shape)

        # Add offsets and scale with anchors; `.data` keeps the decoded boxes
        # out of the autograd graph.
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = cx.data + grid_x
        pred_boxes[..., 1] = cy.data + grid_y
        pred_boxes[..., 2] = torch.exp(raw_w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(raw_h.data) * anchor_h

        self.seen += prediction.size(0)

        if targets is None:
            # Not training: return the flattened predictions.
            flat_boxes = pred_boxes.view(batch, -1, 4) * stride
            flat_conf = conf.view(batch, -1, 1)
            flat_cls = pred_cls.view(batch, -1, self.num_classes)
            return torch.cat((flat_boxes, flat_conf, flat_cls), -1).data

        # Training
        if on_gpu:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()

        # build_targets is a project helper; it is called on CPU copies.
        (nGT, nCorrect, coord_mask, conf_mask, cls_mask,
         tx, ty, tw, th, tconf, tcls) = build_targets(
            pred_boxes.cpu().data,
            targets.cpu().data,
            scaled_anchors,
            self.num_anchors,
            self.num_classes,
            grid,
            self.ignore_thres,
        )

        recall = float(nCorrect / nGT) if nGT else 1

        tx, ty, tw, th, tconf = [
            Variable(t.type(FloatTensor), requires_grad=False)
            for t in (tx, ty, tw, th, tconf)
        ]
        # tcls is filtered with cls_mask as returned by build_targets.
        tcls = Variable(tcls[cls_mask == 1].type(FloatTensor), requires_grad=False)
        coord_mask = Variable(coord_mask.type(FloatTensor), requires_grad=False)
        conf_mask = Variable(conf_mask.type(FloatTensor), requires_grad=False)

        box_sel = coord_mask == 1
        conf_sel = conf_mask == 1
        loss_x = self.coord_scale * self.mse_loss(cx[box_sel], tx[box_sel]) / 2
        loss_y = self.coord_scale * self.mse_loss(cy[box_sel], ty[box_sel]) / 2
        loss_w = self.coord_scale * self.mse_loss(raw_w[box_sel], tw[box_sel]) / 2
        loss_h = self.coord_scale * self.mse_loss(raw_h[box_sel], th[box_sel]) / 2
        loss_conf = self.bce_loss(conf[conf_sel], tconf[conf_sel])
        loss_cls = self.class_scale * self.bce_loss(pred_cls[cls_mask == 1], tcls)
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

        return (
            loss,
            loss_x.item(),
            loss_y.item(),
            loss_w.item(),
            loss_h.item(),
            loss_conf.item(),
            loss_cls.item(),
            recall,
        )
class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, module_defs, img_size=416):
        """Build the network from parsed configuration blocks.

        Args:
            module_defs: parsed configuration list; its first entry is popped
                off by ``create_modules`` and kept as ``self.hyperparams``.
            img_size: stored as ``self.img_size``.
        """
        super(Darknet, self).__init__()
        self.module_defs = module_defs
        # Build the network from the config; returns the hyper-parameters and
        # the torch module list.  After this call module_defs and module_list
        # line up one-to-one.
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        self.img_size = img_size
        self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']
        self.losses = defaultdict(float)
        # Defaults so save_weights() also works when load_weights() was never
        # called; load_weights() overwrites both.
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        """Run the network.

        Args:
            x: input batch.
            targets: ground truth passed on to every 'yolo' layer; ``None``
                selects inference.

        Returns:
            With targets: the summed 'yolo' layer losses flattened to 1-D.
            Without targets: the 'yolo' layer outputs concatenated on dim 1.
            Per-term loss values are accumulated in ``self.losses``.
        """
        is_training = targets is not None
        output = []
        for name in self.loss_names:
            self.losses[name] = 0
        layer_outputs = []
        num_yolo = 0

        for module_def, module in zip(self.module_defs, self.module_list):
            layer_type = module_def['type']
            if layer_type in ('convolutional', 'upsample'):
                x = module(x)
            elif layer_type == 'route':
                indices = [int(idx) for idx in module_def['layers'].split(',')]
                x = torch.cat([layer_outputs[idx] for idx in indices], 1)
            elif layer_type == 'shortcut':
                x = layer_outputs[-1] + layer_outputs[int(module_def['from'])]
            elif layer_type == 'yolo':
                num_yolo += 1
                if is_training:
                    # Train phase: first item is the loss, the rest are the
                    # per-term values in loss_names order.
                    x, *losses = module[0](x, targets)
                    for name, loss in zip(self.loss_names, losses):
                        self.losses[name] += loss
                else:
                    # Test phase: get detections
                    x = module(x)
                output.append(x)
            layer_outputs.append(x)

        # Average recall over the detection layers actually present instead
        # of assuming exactly three.
        if num_yolo:
            self.losses['recall'] /= num_yolo

        if is_training:
            return sum(output).view(-1, )
        return torch.cat(output, 1)

    @staticmethod
    def _fill(dest, weights, ptr):
        """Copy ``dest.numel()`` values from ``weights[ptr:]`` into ``dest``.

        Returns the offset just past the values consumed.
        """
        count = dest.numel()
        chunk = torch.from_numpy(weights[ptr:ptr + count]).view_as(dest)
        dest.data.copy_(chunk)
        return ptr + count

    def load_weights(self, weights_path, is_training=False):
        """Parses and loads the weights stored in 'weights_path'

        The file is read as five int32 header values followed by float32
        weights.  For each convolutional block the values are consumed in the
        order: batch-norm bias, weight, running mean, running variance (or the
        conv bias when the block has no batch norm), then the conv weights.

        Args:
            weights_path: path of the weights file.
            is_training: accepted for interface compatibility; not used.
        """
        with open(weights_path, "rb") as fp:
            # First five are header values; kept to write the header back
            # when saving weights.
            header = np.fromfile(fp, dtype=np.int32, count=5)
            # The rest are weights
            weights = np.fromfile(fp, dtype=np.float32)
        self.header_info = header
        self.seen = header[3]

        ptr = 0
        for module_def, module in zip(self.module_defs, self.module_list):
            if module_def['type'] != 'convolutional':
                continue
            conv_layer = module[0]
            # int() so that the string '0' counts as "no batch norm", matching
            # how create_modules interprets the same field.
            if int(module_def['batch_normalize']):
                bn_layer = module[1]
                for dest in (bn_layer.bias, bn_layer.weight,
                             bn_layer.running_mean, bn_layer.running_var):
                    ptr = self._fill(dest, weights, ptr)
            else:
                # Load conv. bias
                ptr = self._fill(conv_layer.bias, weights, ptr)
            # Load conv. weights
            ptr = self._fill(conv_layer.weight, weights, ptr)

    def save_weights(self, path, cutoff=-1):
        """Write the header and convolutional weights to ``path``.

        Values are written in the same order ``load_weights`` reads them.

        Args:
            path: path of the new weights file.
            cutoff: save layers between 0 and cutoff; ``-1`` saves all layers.
        """
        layers = list(zip(self.module_defs, self.module_list))
        # -1 means "everything"; a plain [:cutoff] slice would silently drop
        # the last layer in that case.
        if cutoff != -1:
            layers = layers[:cutoff]

        self.header_info[3] = self.seen
        with open(path, 'wb') as fp:
            self.header_info.tofile(fp)
            # Iterate through layers
            for module_def, module in layers:
                if module_def['type'] != 'convolutional':
                    continue
                conv_layer = module[0]
                # If batch norm, write bn first
                if int(module_def['batch_normalize']):
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                else:
                    # Write conv bias
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                # Write conv weights
                conv_layer.weight.data.cpu().numpy().tofile(fp)
训练代码(train)中与多 GPU 相关的关键语句:
optimizer.module.zero_grad()
model.module.save_weights
loss = model(imgs, targets)
torch.sum(loss).backward()
optimizer.module.step()
# Training-loop excerpt.  `model` and `optimizer` are both accessed through
# `.module`; `opt`, `dataloader`, `Tensor`, `last_total_loss` and
# `adjust_learning_rate` come from the surrounding script.
for epoch in range(opt.epochs):
    for batch_i, (_, imgs, targets) in enumerate(dataloader):
        imgs = Variable(imgs.type(Tensor))
        targets = Variable(targets.type(Tensor), requires_grad=False)

        optimizer.module.zero_grad()
        loss = model(imgs, targets)
        # The model output is reduced to a scalar before backward()
        # (presumably one entry per replica — TODO confirm).
        batch_loss = torch.sum(loss)
        batch_loss.backward()
        optimizer.module.step()
        # Keep a plain float for bookkeeping: accumulating the tensor itself
        # would keep every batch's autograd history alive.
        batch_loss_value = batch_loss.item()

        now = datetime.datetime.now()
        strftime = now.strftime("%H:%M:%S")
        print(strftime, epoch, opt.epochs, batch_i, len(dataloader), loss)

        # Every 40th batch compare the accumulated loss with the last accepted
        # total; decay the learning rate if it grew by more than 1%.
        if batch_i % 40 == 39:
            if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                print("total_loss", total_loss)
                adjust_learning_rate(optimizer)
            else:
                print("total_loss", total_loss, last_total_loss)
                last_total_loss = total_loss
            total_loss = batch_loss_value
        elif batch_i == 0:
            total_loss = batch_loss_value
        else:
            total_loss += batch_loss_value

        print(
            '%s [Epoch %d/%d, Batch %d/%d Losses: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
            (strftime, epoch, opt.epochs, batch_i, len(dataloader),
             model.module.losses['x'], model.module.losses['y'],
             model.module.losses['w'], model.module.losses['h'],
             model.module.losses['conf'], model.module.losses['cls'],
             batch_loss_value, model.module.losses['recall']))

    if epoch % opt.checkpoint_interval == 0:
        model.module.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))
# -*- coding:utf-8 -*-
from __future__
import division
from models
import *
from utils.utils
import *
from utils.datasets
import *
from utils.parse_config
import *
from logger
import Logger
import os
import sys
import time
import datetime
import argparse
import torch
from torch.utils.data
import DataLoader
from torch.autograd
import Variable
import torch.optim
as optim
# Command-line options for the training script.
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=2001,
                    help='number of epochs')
parser.add_argument('--image_folder', type=str, default='data/samples',
                    help='path to dataset')
parser.add_argument('--batch_size', type=int, default=4,
                    help='size of each image batch')
parser.add_argument('--learning_rate', type=float, default=0.01,
                    help='learning_rate')
parser.add_argument('--train_dir', type=str,
                    default=r'E:\team-CV\dataset\tiny_data\VOC2007/',
                    help='train_dir')
parser.add_argument('--model_config_path', type=str,
                    default='config/yolov3_2cls.cfg',
                    help='path to model config file')
parser.add_argument('--data_config_path', type=str, default='config/coco.data',
                    help='path to data config file')
parser.add_argument('--weights_path', type=str,
                    default='weights/yolov3.weights',
                    help='path to weights file')
parser.add_argument('--class_path', type=str, default='data/coco_2cls.names',
                    help='path to class label file')
parser.add_argument('--conf_thres', type=float, default=0.8,
                    help='object confidence threshold')
parser.add_argument('--nms_thres', type=float, default=0.4,
                    help='iou thresshold for non-maximum suppression')
parser.add_argument('--n_cpu', type=int, default=0,
                    help='number of cpu threads to use during batch generation')
parser.add_argument('--img_size', type=int, default=416,
                    help='size of each image dimension')
parser.add_argument('--checkpoint_interval', type=int, default=4,
                    help='interval between saving model weights')
parser.add_argument('--checkpoint_dir', type=str, default='checkpoints',
                    help='directory where model checkpoints are saved')
opt = parser.parse_args()
print(opt)

os.makedirs('output', exist_ok=True)
# Create the directory checkpoints are actually written to, so a custom
# --checkpoint_dir does not fail at the first save.
os.makedirs(opt.checkpoint_dir, exist_ok=True)
def adjust_learning_rate(optimizer, decay_rate=0.5):
    """Multiply the learning rate of every parameter group by ``decay_rate``.

    Groups whose learning rate is already at or below ``1e-8`` are left
    unchanged.  The optimizer is printed afterwards.

    Args:
        optimizer: either a wrapper exposing the real optimizer as ``.module``
            (as the training script passes it) or the optimizer itself.
        decay_rate: multiplicative factor applied to each learning rate.
    """
    # Accept both the wrapped form and a plain optimizer.
    inner = getattr(optimizer, 'module', optimizer)
    for param_group in inner.param_groups:
        if param_group['lr'] > 1e-8:
            param_group['lr'] = param_group['lr'] * decay_rate
    print(inner)
# Module-level configuration derived from the command-line options and the
# parsed model config.

# is_available must be *called*; the bare function object is always truthy.
cuda = torch.cuda.is_available()
classes = load_classes(opt.class_path)
module_defs = parse_model_config(opt.model_config_path)
hyperparams = module_defs[0]

# The anchor list is read from the first config block as a flat
# comma-separated string and regrouped into pairs.
anchors = hyperparams["anchors"]
anchors = [int(x) for x in anchors.split(",")]
anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]

# Attach the anchors to every 'yolo' block rather than to the fixed indices
# 83, 95 and 107, which are only valid for one particular config layout.
for _module_def in module_defs:
    if _module_def.get('type') == 'yolo':
        _module_def["anchors"] = anchors

batch_size = opt.batch_size
subdivisions = int(hyperparams['subdivisions'])
sub_batch = batch_size // subdivisions
learning_rate = opt.learning_rate
momentum = float(hyperparams['momentum'])
decay = float(hyperparams['decay'])
burn_in = int(hyperparams['burn_in'])
# Override the config's input size with the command-line image size.
hyperparams['height'] = hyperparams['width'] = opt.img_size
if __name__ == '__main__':
    # Script entry point: build the data loader and model, then train,
    # saving a checkpoint every opt.checkpoint_interval epochs.
    dataloader = torch.utils.data.DataLoader(
        ListDataset(opt.train_dir, img_size=opt.img_size, is_training=1,
                    data_size=10000),
        batch_size=batch_size,
        shuffle=True,
        num_workers=opt.n_cpu,
    )

    model = Darknet(module_defs, img_size=opt.img_size)
    model.load_weights(opt.weights_path, is_training=True)

    # Use the GPUs that are actually present instead of a hard-coded count.
    ngpus = torch.cuda.device_count()
    device = torch.device("cuda" if ngpus >= 1 else "cpu")
    if ngpus > 1:
        model = torch.nn.DataParallel(model).to(device)
    else:
        model = model.to(device)
    # The underlying network, whether or not it was wrapped above.
    net = model.module if isinstance(model, torch.nn.DataParallel) else model
    model.train()

    Tensor = torch.cuda.FloatTensor if device.type == "cuda" else torch.FloatTensor

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate / batch_size,
                           weight_decay=decay * batch_size)
    # NOTE(review): DataParallel is used here only as a holder that exposes
    # the optimizer as `.module`, which adjust_learning_rate() expects.  An
    # optimizer is not an nn.Module, so this may fail on a single-GPU host —
    # confirm before running there.
    optimizer = torch.nn.DataParallel(optimizer).to(device)

    print("subdivisions", subdivisions)
    logger = Logger('./logs')

    total_loss = 0
    last_total_loss = 0
    for epoch in range(opt.epochs):
        for batch_i, (_, imgs, targets) in enumerate(dataloader):
            imgs = Variable(imgs.type(Tensor))
            targets = Variable(targets.type(Tensor), requires_grad=False)

            optimizer.module.zero_grad()
            loss = model(imgs, targets)
            # The model output is reduced to a scalar before backward()
            # (presumably one entry per replica — TODO confirm).
            batch_loss = torch.sum(loss)
            batch_loss.backward()
            optimizer.module.step()
            # Track a plain float: accumulating the tensor itself would keep
            # every batch's autograd history alive.
            batch_loss_value = batch_loss.item()

            strftime = datetime.datetime.now().strftime("%H:%M:%S")

            # Every 40th batch compare the accumulated loss with the last
            # accepted total; decay the learning rate if it grew by over 1%.
            if batch_i % 40 == 39:
                if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                    print("total_loss", total_loss)
                    adjust_learning_rate(optimizer)
                else:
                    last_total_loss = total_loss
                total_loss = batch_loss_value
            elif batch_i == 0:
                total_loss = batch_loss_value
            else:
                total_loss += batch_loss_value

            print(
                '%s [Epoch %d/%d, Batch %d/%d Losses: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
                (strftime, epoch, opt.epochs, batch_i, len(dataloader),
                 net.losses['x'], net.losses['y'],
                 net.losses['w'], net.losses['h'],
                 net.losses['conf'], net.losses['cls'],
                 batch_loss_value, net.losses['recall']))

        if epoch % opt.checkpoint_interval == 0:
            net.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))