Caffe2 - (三十二) Detectron 之 roi_data - 模型 minibatch blobs

xiaoxiao2021-02-28 29

Caffe2 - (三十二) Detectron 之 roi_data - 模型 minibatch blobs

根据对应的 roi_data 模块可以处理对应模型的 minibatch blobs.

本文涉及的文件：fast_rcnn.py、mask_rcnn.py、keypoint_rcnn.py、rpn.py、retinanet.py

1. fast_rcnn.py

构建用于 Fast R-CNN 训练的 minibatches.

""" 处理 Fast R-CNN 所涉及的 minibatch blobs. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import logging import numpy as np import numpy.random as npr from core.config import cfg import modeling.FPN as fpn import roi_data.keypoint_rcnn import roi_data.mask_rcnn import utils.blob as blob_utils import utils.boxes as box_utils logger = logging.getLogger(__name__) def get_fast_rcnn_blob_names(is_training=True): """ Fast R-CNN blob names. """ """ rois blob: R 个 RoIs(regions of interest)，每个 blob 是 5-tuple：(batch_idx, x1, y1, x2, y2)， - batch_idx：图片 batch index - (x1, y1, x2, y2)：矩形框 """ blob_names = ['rois'] if is_training: # labels_int32 blob: # R categorical labels in [0, ..., K] for K foreground classes plus background # K 个前景类 + 1 个背景类. blob_names += ['labels_int32'] if is_training: # bbox_targets blob: # R bounding-box regression targets with 4 targets per class blob_names += ['bbox_targets'] # bbox_inside_weights blob: # 每个 roi 最多 4 个 targets 被激活，该二值向量表示了激活 targets 的subset. blob_names += ['bbox_inside_weights'] blob_names += ['bbox_outside_weights'] if is_training and cfg.MODEL.MASK_ON: # 'mask_rois': # 训练 mask 预测分支所采样的 RoIs # Shape is (#masks, 5) in format (batch_idx, x1, y1, x2, y2). blob_names += ['mask_rois'] # 'roi_has_mask': # rois 中指定的 RoIs 的二值标签(binart labels)，表示每个 RoI 是否有 mask. # 注：某些情况， *bg* RoI 会有一个值都为 -1(ignore) 的 mask，此时，没有 fg RoIs 可采样. # Shape is (batchsize). blob_names += ['roi_has_mask_int32'] # 'masks_int32': # 'mask_rois' 中指定的 RoIs的二值masks. # Shape is (#fg, M * M) where M is the ground truth mask size. blob_names += ['masks_int32'] if is_training and cfg.MODEL.KEYPOINTS_ON: # 'keypoint_rois': # 训练 keypoint 预测分支所采样的 RoIs # Shape is (#instances, 5) in format (batch_idx, x1, y1, x2, y2). blob_names += ['keypoint_rois'] # 'keypoint_locations_int32': # KRCNN.HEATMAP_SIZE**2 大小的 array 中 keypoint 的索引index. # Shape is (#instances). 
Used in SoftmaxWithLoss. blob_names += ['keypoint_locations_int32'] # 'keypoint_weights': # 'keypoint_locations_int32' 中每个 target 的权重weight # Shape is (#instances). Used in SoftmaxWithLoss. blob_names += ['keypoint_weights'] # 'keypoint_loss_normalizer': # 可选参数，如果 cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS = False， # 使用归一化因子. blob_names += ['keypoint_loss_normalizer'] if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: """ 支持 FPN multi-level rois without bbox reg isn't implemented (... and may never be implemented) """ k_max = cfg.FPN.ROI_MAX_LEVEL k_min = cfg.FPN.ROI_MIN_LEVEL # Same format as rois blob, but one per FPN level for lvl in range(k_min, k_max + 1): blob_names += ['rois_fpn' + str(lvl)] blob_names += ['rois_idx_restore_int32'] if is_training: if cfg.MODEL.MASK_ON: for lvl in range(k_min, k_max + 1): blob_names += ['mask_rois_fpn' + str(lvl)] blob_names += ['mask_rois_idx_restore_int32'] if cfg.MODEL.KEYPOINTS_ON: for lvl in range(k_min, k_max + 1): blob_names += ['keypoint_rois_fpn' + str(lvl)] blob_names += ['keypoint_rois_idx_restore_int32'] return blob_names def add_fast_rcnn_blobs(blobs, im_scales, roidb): """ 添加 blobs ，用于训练 Fast R-CNN style models. """ # 从每张图片采样训练 RoIs，并添加到 blob 列表lists for im_i, entry in enumerate(roidb): frcn_blobs = _sample_rois(entry, im_scales[im_i], im_i) for k, v in frcn_blobs.items(): blobs[k].append(v) # 将 blob lists 连接为 tensors for k, v in blobs.items(): if isinstance(v, list) and len(v) > 0: blobs[k] = np.concatenate(v) # 添加 FPN multilevel training RoIs, if configured if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: _add_multilevel_rois(blobs) # 在处理完所有的 minibatch 图片后，进行安全性检查. valid = True if cfg.MODEL.KEYPOINTS_ON: valid = roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, valid) return valid def _sample_rois(roidb, im_scale, batch_idx): """ 生成由 foreground 和 background 样本组成的 RoIs 的随机采样. 
""" rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM) fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)) max_overlaps = roidb['max_overlaps'] # 选择 foreground RoIs，overlap >= FG_THRESH 的 fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] # 避免出现的情况： # 图片中的 foreground RoIs 的数量小于 fg_rois_per_image fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) # 无替换地(without replacement)采样 foreground 区域 if fg_inds.size > 0: fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) # 选择 background RoIs， overlap 在 [BG_THRESH_LO, BG_THRESH_HI) 之间的 bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & (max_overlaps >= cfg.TRAIN.BG_THRESH_LO) )[0] # 计算从图片中选择的 background RoIs 数量 # (避免数量太少) bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size) # 无替换地(without replacement)采样 background 区域 if bg_inds.size > 0: bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) # 所选择的 indices (both fg and bg) keep_inds = np.append(fg_inds, bg_inds) # Label 是与每个 RoI 具有最大 overlap 的类别class sampled_labels = roidb['max_classes'][keep_inds] sampled_labels[fg_rois_per_this_image:] = 0 # Label bg RoIs with class 0 sampled_boxes = roidb['boxes'][keep_inds] if 'bbox_targets' not in roidb: gt_inds = np.where(roidb['gt_classes'] > 0)[0] gt_boxes = roidb['boxes'][gt_inds, :] gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]] bbox_targets = _compute_targets(sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels) bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets) else: bbox_targets, bbox_inside_weights = _expand_bbox_targets(roidb['bbox_targets'][keep_inds, :]) bbox_outside_weights = np.array(bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype) # 缩放Scale rois，并格式化为： (batch_idx, x1, y1, x2, y2) sampled_rois = sampled_boxes * im_scale repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 
1)) sampled_rois = np.hstack((repeated_batch_idx, sampled_rois)) # Base Fast R-CNN blobs blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False), rois=sampled_rois, bbox_targets=bbox_targets, bbox_inside_weights=bbox_inside_weights, bbox_outside_weights=bbox_outside_weights ) # Optionally add Mask R-CNN blobs if cfg.MODEL.MASK_ON: roi_data.mask_rcnn.add_mask_rcnn_blobs( blob_dict, sampled_boxes, roidb, im_scale, batch_idx ) # Optionally add Keypoint R-CNN blobs if cfg.MODEL.KEYPOINTS_ON: roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs( blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx) return blob_dict def _compute_targets(ex_rois, gt_rois, labels): """ 计算图片的边界框回归目标值bounding-box regression targets. """ assert ex_rois.shape[0] == gt_rois.shape[0] assert ex_rois.shape[1] == 4 assert gt_rois.shape[1] == 4 targets = box_utils.bbox_transform_inv(ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS) return np.hstack((labels[:, np.newaxis], targets)).astype(np.float32, copy=False ) def _expand_bbox_targets(bbox_target_data): """ 边界框回归目标值以紧凑形式存储在 roidb 中. 该函数将 targets 展开为网所使用的 4-of-4*K 表示. (i.e. 只有一个类别class 具有 non-zero targets). 类似地，loss weights 也进行展开. 返回值: bbox_target_data (ndarray): N x 4K blob of regression targets bbox_inside_weights (ndarray): N x 4K blob of loss weights """ num_bbox_reg_classes = cfg.MODEL.NUM_CLASSES if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG: num_bbox_reg_classes = 2 # bg and fg clss = bbox_target_data[:, 0] bbox_targets = blob_utils.zeros((clss.size, 4 * num_bbox_reg_classes)) bbox_inside_weights = blob_utils.zeros(bbox_targets.shape) inds = np.where(clss > 0)[0] for ind in inds: cls = int(clss[ind]) start = 4 * cls end = start + 4 bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] bbox_inside_weights[ind, start:end] = (1.0, 1.0, 1.0, 1.0) return bbox_targets, bbox_inside_weights def _add_multilevel_rois(blobs): """ 默认情况，只对单 feature map level 添加训练 RoIs. 当使用 FPN时，RoIs 必须根据 level 设置启发式来分配到不同的 FPN levels. 
(参见: modeling.FPN.map_rois_to_fpn_levels). """ lvl_min = cfg.FPN.ROI_MIN_LEVEL lvl_max = cfg.FPN.ROI_MAX_LEVEL def _distribute_rois_over_fpn_levels(rois_blob_name): """ 分配 rois 到不同的 FPN levels. """ # 获取每个 roi 的 target level # blob rois 格式为：(batch_idx, x1, y1, x2, y2), 因此，取1：5 列的 box 坐标 target_lvls = fpn.map_rois_to_fpn_levels(blobs[rois_blob_name][:, 1:5], lvl_min, lvl_max ) # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl> fpn.add_multilevel_roi_blobs(blobs, rois_blob_name, blobs[rois_blob_name], target_lvls, lvl_min, lvl_max) _distribute_rois_over_fpn_levels('rois') if cfg.MODEL.MASK_ON: _distribute_rois_over_fpn_levels('mask_rois') if cfg.MODEL.KEYPOINTS_ON: _distribute_rois_over_fpn_levels('keypoint_rois')

2. mask_rcnn.py

构建 Mask R-CNN 训练的 minibatches.

""" 处理 Mask R-CNN 的 minibatch blobs. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import logging import numpy as np from core.config import cfg import utils.blob as blob_utils import utils.boxes as box_utils import utils.segms as segm_utils logger = logging.getLogger(__name__) def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx): """ 添加 Mask R-CNN 特有的 blobs 到 input blob dictionary. """ """ 准备 mask targets：将一个 gt mask 关联到每个具有 fg 类别标签(non-bg class label)的训练 roi， """ M = cfg.MRCNN.RESOLUTION polys_gt_inds = np.where((roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0))[0] polys_gt = [roidb['segms'][i] for i in polys_gt_inds] boxes_from_polys = segm_utils.polys_to_boxes(polys_gt) fg_inds = np.where(blobs['labels_int32'] > 0)[0] roi_has_mask = blobs['labels_int32'].copy() roi_has_mask[roi_has_mask > 0] = 1 if fg_inds.shape[0] > 0: # foreground rois 的类别标签 mask_class_labels = blobs['labels_int32'][fg_inds] masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True) # 寻找所有的 foreground rois 与边界框之间的重叠区域，封闭区域. rois_fg = sampled_boxes[fg_inds] overlaps_bbfg_bbpolys = box_utils.bbox_overlaps( rois_fg.astype(np.float32, copy=False), boxes_from_polys.astype(np.float32, copy=False) ) # 将每个 fg rois 映射到 highest overlap 的mask. # (衡量标准： bbox overlap) fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) # 添加 fg targets for i in range(rois_fg.shape[0]): fg_polys_ind = fg_polys_inds[i] poly_gt = polys_gt[fg_polys_ind] roi_fg = rois_fg[i] # 将给定 fg roi 中的多边形 mask 转换为 MxM 的二值图像. mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M) # 确保 mask 是二值的binary mask = np.array(mask > 0, dtype=np.int32) masks[i, :] = np.reshape(mask, M**2) else: # 如果没有 fg masks # 网络不能处理空 blobs，因此，需要提供一个 mask. # 简单采用第一个 bg roi，并给定其一个都是 -1(ignore label) 值的 mask, # 且其类别标签为 0 (bg). 
bg_inds = np.where(blobs['labels_int32'] == 0)[0] # rois_fg 实际上是一个 background roi, but that's ok because ... rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)) # 设定一个 -1's blob (ignore label) masks = -blob_utils.ones((1, M**2), int32=True) # 设定其类别标签 class = 0 (background) mask_class_labels = blob_utils.zeros((1, )) # 确保第一个 roi 有一个 mask roi_has_mask[0] = 1 if cfg.MRCNN.CLS_SPECIFIC_MASK: masks = _expand_to_class_specific_mask_targets(masks, mask_class_labels) # 缩放Scale rois_fg，并格式化为： (batch_idx, x1, y1, x2, y2) rois_fg *= im_scale repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1)) rois_fg = np.hstack((repeated_batch_idx, rois_fg)) # Update blobs dict with Mask R-CNN blobs blobs['mask_rois'] = rois_fg blobs['roi_has_mask_int32'] = roi_has_mask blobs['masks_int32'] = masks def _expand_to_class_specific_mask_targets(masks, mask_class_labels): """ 将 masks 由 shape (#masks, M ** 2) 展开到 (#masks, #classes * M ** 2)，以表示类别已知的 mask targets. """ assert masks.shape[0] == mask_class_labels.shape[0] M = cfg.MRCNN.RESOLUTION # Target values of -1 are "don't care" / ignore labels mask_targets = -blob_utils.ones((masks.shape[0], cfg.MODEL.NUM_CLASSES * M**2), int32=True ) for i in range(masks.shape[0]): cls = int(mask_class_labels[i]) start = M**2 * cls end = start + M**2 # 忽略 background 实例instance # (只有图片中没有 fg 样本是才会发生) if cls > 0: mask_targets[i, start:end] = masks[i, :] return mask_targets

3. keypoint_rcnn.py

构建 Mask R-CNN 关于 keypoints 训练的 minibatches.

""" 处理 Mask R-CNN 中关于 keypoint 检测分支训练的 minibatch blobs. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import logging import numpy as np from core.config import cfg import utils.blob as blob_utils import utils.keypoints as keypoint_utils logger = logging.getLogger(__name__) def add_keypoint_rcnn_blobs(blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx): """ 添加 Mask R-CNN keypoint 相关的 blobs 到给定的 blobs dictionary. """ """ 注： gt_inds 必须与 datasets.json_dataset._merge_proposal_boxes_into_roidb 中的计算一致. """ gt_inds = np.where(roidb['gt_classes'] > 0)[0] max_overlaps = roidb['max_overlaps'] gt_keypoints = roidb['gt_keypoints'] ind_kp = gt_inds[roidb['box_to_gt_ind_map']] within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes']) vis_kp = gt_keypoints[ind_kp, 2, :] > 0 is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0 kp_fg_inds = np.where(np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible) )[0] kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size) if kp_fg_inds.size > kp_fg_rois_per_this_image: kp_fg_inds = np.random.choice(kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False ) sampled_fg_rois = roidb['boxes'][kp_fg_inds] box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds] num_keypoints = gt_keypoints.shape[2] sampled_keypoints = -np.ones((len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints), dtype=gt_keypoints.dtype ) for ii in range(len(sampled_fg_rois)): ind = box_to_gt_ind_map[ii] if ind >= 0: sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :] assert np.sum(sampled_keypoints[ii, 2, :]) > 0 heats, weights = keypoint_utils.keypoints_to_heatmap_labels( sampled_keypoints, sampled_fg_rois ) shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1) heats = heats.reshape(shape) weights = weights.reshape(shape) sampled_fg_rois *= im_scale repeated_batch_idx = 
batch_idx * blob_utils.ones( (sampled_fg_rois.shape[0], 1) ) sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois)) blobs['keypoint_rois'] = sampled_fg_rois blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False) blobs['keypoint_weights'] = weights def finalize_keypoint_minibatch(blobs, valid): """ 当所有的 minibatch 图片 blobs 处理完以后，定型 minibatch. """ min_count = cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH num_visible_keypoints = np.sum(blobs['keypoint_weights']) valid = (valid and len(blobs['keypoint_weights']) > 0 and num_visible_keypoints > min_count ) # Normalizer to use if cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False. # See modeling.model_builder.add_keypoint_losses norm = num_visible_keypoints / ( cfg.TRAIN.IMS_PER_BATCH * cfg.TRAIN.BATCH_SIZE_PER_IM * cfg.TRAIN.FG_FRACTION * cfg.KRCNN.NUM_KEYPOINTS ) blobs['keypoint_loss_normalizer'] = np.array(norm, dtype=np.float32) return valid def _within_box(points, boxes): """ 确认在给定 box 中的 keypoints. points: Nx2xK boxes: Nx4 output: NxK """ x_within = np.logical_and( points[:, 0, :] >= np.expand_dims(boxes[:, 0], axis=1), points[:, 0, :] <= np.expand_dims(boxes[:, 2], axis=1) ) y_within = np.logical_and( points[:, 1, :] >= np.expand_dims(boxes[:, 1], axis=1), points[:, 1, :] <= np.expand_dims(boxes[:, 3], axis=1) ) return np.logical_and(x_within, y_within)

4. rpn.py

""" RPN - Region Proposal Networks 构建 minibatch. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import logging import numpy as np import numpy.random as npr from core.config import cfg import roi_data.data_utils as data_utils import utils.blob as blob_utils import utils.boxes as box_utils logger = logging.getLogger(__name__) def get_rpn_blob_names(is_training=True): """ RPN 使用的 Blob names. """ # im_info: (height, width, image scale) blob_names = ['im_info'] if is_training: # gt boxes: (batch_idx, x1, y1, x2, y2, cls) blob_names += ['roidb'] if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN: # 与 RPN blobs 格式一致, but one per FPN level for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1): blob_names += ['rpn_labels_int32_wide_fpn' + str(lvl), 'rpn_bbox_targets_wide_fpn' + str(lvl), 'rpn_bbox_inside_weights_wide_fpn' + str(lvl), 'rpn_bbox_outside_weights_wide_fpn' + str(lvl) ] else: # Single level RPN blobs blob_names += ['rpn_labels_int32_wide', 'rpn_bbox_targets_wide', 'rpn_bbox_inside_weights_wide', 'rpn_bbox_outside_weights_wide' ] return blob_names def add_rpn_blobs(blobs, im_scales, roidb): """ 添加 RPN-only 和 end-to-end Faster R-CNN 模型训练所需的 blobs. 
""" if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN: # RPN applied to many feature levels, as in the FPN paper k_max = cfg.FPN.RPN_MAX_LEVEL k_min = cfg.FPN.RPN_MIN_LEVEL foas = [] for lvl in range(k_min, k_max + 1): field_stride = 2.**lvl anchor_sizes = (cfg.FPN.RPN_ANCHOR_START_SIZE * 2.**(lvl - k_min), ) anchor_aspect_ratios = cfg.FPN.RPN_ASPECT_RATIOS foa = data_utils.get_field_of_anchors(field_stride, anchor_sizes, anchor_aspect_ratios) foas.append(foa) all_anchors = np.concatenate([f.field_of_anchors for f in foas]) else: foa = data_utils.get_field_of_anchors(cfg.RPN.STRIDE, cfg.RPN.SIZES, cfg.RPN.ASPECT_RATIOS ) all_anchors = foa.field_of_anchors for im_i, entry in enumerate(roidb): scale = im_scales[im_i] im_height = np.round(entry['height'] * scale) im_width = np.round(entry['width'] * scale) gt_inds = np.where( (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0) )[0] gt_rois = entry['boxes'][gt_inds, :] * scale # 待办事项(rbg): gt_boxes is poorly named; # should be something like 'gt_rois_info' gt_boxes = blob_utils.zeros((len(gt_inds), 6)) gt_boxes[:, 0] = im_i # batch inds gt_boxes[:, 1:5] = gt_rois gt_boxes[:, 5] = entry['gt_classes'][gt_inds] im_info = np.array([[im_height, im_width, scale]], dtype=np.float32) blobs['im_info'].append(im_info) # 添加 RPN targets if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN: # RPN applied to many feature levels, as in the FPN paper rpn_blobs = _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_rois) for i, lvl in enumerate(range(k_min, k_max + 1)): for k, v in rpn_blobs[i].items(): blobs[k + '_fpn' + str(lvl)].append(v) else: # 经典 RPN, 对单 feature level 应用. 
rpn_blobs = _get_rpn_blobs(im_height, im_width, [foa], all_anchors, gt_rois) for k, v in rpn_blobs.items(): blobs[k].append(v) for k, v in blobs.items(): if isinstance(v, list) and len(v) > 0: blobs[k] = np.concatenate(v) valid_keys = ['has_visible_keypoints', 'boxes', 'segms', 'seg_areas', 'gt_classes', 'gt_overlaps', 'is_crowd', 'box_to_gt_ind_map', 'gt_keypoints' ] minimal_roidb = [{} for _ in range(len(roidb))] for i, e in enumerate(roidb): for k in valid_keys: if k in e: minimal_roidb[i][k] = e[k] blobs['roidb'] = blob_utils.serialize(minimal_roidb) # Always return valid=True, since RPN minibatches are valid by design return True def _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_boxes): total_anchors = all_anchors.shape[0] straddle_thresh = cfg.TRAIN.RPN_STRADDLE_THRESH if straddle_thresh >= 0: # 只保留在图片内的 anchors，根据阈值 straddle_thresh # 设置 TRAIN.RPN_STRADDLE_THRESH = -1 (或一个很大的值) 以保留所有的 anchors. inds_inside = np.where((all_anchors[:, 0] >= -straddle_thresh) & (all_anchors[:, 1] >= -straddle_thresh) & (all_anchors[:, 2] < im_width + straddle_thresh) & (all_anchors[:, 3] < im_height + straddle_thresh) )[0] # keep only inside anchors anchors = all_anchors[inds_inside, :] else: inds_inside = np.arange(all_anchors.shape[0]) anchors = all_anchors num_inside = len(inds_inside) logger.debug('total_anchors: {}'.format(total_anchors)) logger.debug('inds_inside: {}'.format(num_inside)) logger.debug('anchors.shape: {}'.format(anchors.shape)) # 计算 anchor labels: # label=1 is positive, 0 is negative, -1 is don't care (ignore) labels = np.empty((num_inside, ), dtype=np.int32) labels.fill(-1) if len(gt_boxes) > 0: # 计算 anchors 与 gt boxes 重叠区域间的 overlaps anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes) # 映射 anchor 到具有 highest overlap 的 gt box anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) # 对于每个 anchor, 与最重叠的 gt box 的 overlap 数量 anchor_to_gt_max = anchor_by_gt_overlap[np.arange(num_inside), anchor_to_gt_argmax] # 将 gt box映射到具有 highest 
overlap 的 anchor gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) #对于每个 gt box, 与最重叠的 anchor 的 overlap 数量 gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1]) ] # 寻找共享 max overlap 数量的所有 anchors # (this includes many ties) anchors_with_max_overlap = np.where(anchor_by_gt_overlap == gt_to_anchor_max)[0] # Fg label: for each gt use anchors with highest overlap # (including ties) labels[anchors_with_max_overlap] = 1 # Fg label: 大于 IOU 阈值 labels[anchor_to_gt_max >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 # 如果有很多 positive labels，则随机采样 num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCH_SIZE_PER_IM) fg_inds = np.where(labels == 1)[0] if len(fg_inds) > num_fg: disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False) labels[disable_inds] = -1 fg_inds = np.where(labels == 1)[0] # 如果有很多 negative labels，则随机采样 # (samples with replacement, but since the set of bg inds is large most # samples will not have repeats) num_bg = cfg.TRAIN.RPN_BATCH_SIZE_PER_IM - np.sum(labels == 1) bg_inds = np.where(anchor_to_gt_max < cfg.TRAIN.RPN_NEGATIVE_OVERLAP)[0] if len(bg_inds) > num_bg: enable_inds = bg_inds[npr.randint(len(bg_inds), size=num_bg)] labels[enable_inds] = 0 bg_inds = np.where(labels == 0)[0] bbox_targets = np.zeros((num_inside, 4), dtype=np.float32) bbox_targets[fg_inds, :] = data_utils.compute_targets(anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :] ) """ Bbox regression loss 的形式: loss(x) = weight_outside * L(weight_inside * x) Inside weights 可以在 element-wist basis 上设为 0. bbox regression 只对 positive 样本进行训练，因此可以设置其权重为 1.0，否则设为 0.0 Inside weights 相当于 "开关". """ bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32) bbox_inside_weights[labels == 1, :] = (1.0, 1.0, 1.0, 1.0) """ bbox regression loss 只根据 minibatch 内的图片数进行取平均. 根据所选取的 anchors 样本总数进行取平均. Outside weights 用于对每个 loss 逐元素缩放(scale each element-wise loss), 因此，最终的对 minibatch 求平均是正确的. Outside weights 相当于 "权重". 
""" bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32) # uniform weighting of examples (given non-uniform sampling) num_examples = np.sum(labels >= 0) bbox_outside_weights[labels == 1, :] = 1.0 / num_examples bbox_outside_weights[labels == 0, :] = 1.0 / num_examples # Map up to original set of anchors labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1) bbox_targets = data_utils.unmap(bbox_targets, total_anchors, inds_inside, fill=0) bbox_inside_weights = data_utils.unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) bbox_outside_weights = data_utils.unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) # 对生成的 labels, etc. 分割为 labels per each field of anchors blobs_out = [] start_idx = 0 for foa in foas: H = foa.field_size W = foa.field_size A = foa.num_cell_anchors end_idx = start_idx + H * W * A _labels = labels[start_idx:end_idx] _bbox_targets = bbox_targets[start_idx:end_idx, :] _bbox_inside_weights = bbox_inside_weights[start_idx:end_idx, :] _bbox_outside_weights = bbox_outside_weights[start_idx:end_idx, :] start_idx = end_idx # 输出 labels 的 shape (1, A, height, width) _labels = _labels.reshape((1, H, W, A)).transpose(0, 3, 1, 2) # bbox_targets 输出的 shape (1, 4 * A, height, width) _bbox_targets = _bbox_targets.reshape( (1, H, W, A * 4)).transpose(0, 3, 1, 2) # bbox_inside_weights 输出的 shape (1, 4 * A, height, width) _bbox_inside_weights = _bbox_inside_weights.reshape((1, H, W, A * 4)).transpose(0, 3, 1, 2) # bbox_outside_weights 输出的 shape (1, 4 * A, height, width) _bbox_outside_weights = _bbox_outside_weights.reshape( (1, H, W, A * 4)).transpose(0, 3, 1, 2) blobs_out.append(dict(rpn_labels_int32_wide=_labels, rpn_bbox_targets_wide=_bbox_targets, rpn_bbox_inside_weights_wide=_bbox_inside_weights, rpn_bbox_outside_weights_wide=_bbox_outside_weights) ) return blobs_out[0] if len(blobs_out) == 1 else blobs_out

5. retinanet.py

""" 计算训练 RetinaNet 网络的 minibatch blobs. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import numpy as np import logging import utils.boxes as box_utils import roi_data.data_utils as data_utils from core.config import cfg logger = logging.getLogger(__name__) def get_retinanet_blob_names(is_training=True): """ 返回 blob names，以 data loader 读取的顺序. N = number of images per minibatch A = number of anchors = num_scales * num_aspect_ratios (for example 9 used in RetinaNet paper) H, W = spatial dimensions (different for each FPN level) M = Out of all the anchors generated, 取决于 positive/negative IoU overlap thresholds, 会得到 M 个 positive anchors. 这些是 bounding box 网络分支来回归的 anchors. retnet_cls_labels -> labels for the cls branch for each FPN level Shape: N x A x H x W retnet_roi_bbox_targets -> targets for the bbox regression branch Shape: M x 4 retnet_roi_fg_bbox_locs -> bbox 回归时，由于只对 fg bboxes 进行回归，且，网络的预测输出的shape 是 N x (A * 4) x H x W , 因此，将 positive boxes 的位置存储在 retnet_roi_fg_bbox_locs blobs, 其shape 为 M x 4，每一行的元素为：[img_id, anchor_id, x_loc, y_loc] """ # im_info: (height, width, image scale) blob_names = ['im_info'] assert cfg.FPN.FPN_ON, "RetinaNet uses FPN for dense detection" # Same format as RPN blobs, but one per FPN level if is_training: blob_names += ['retnet_fg_num', 'retnet_bg_num'] for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1): suffix = 'fpn{}'.format(lvl) blob_names += ['retnet_cls_labels_' + suffix, 'retnet_roi_bbox_targets_' + suffix, 'retnet_roi_fg_bbox_locs_' + suffix, ] return blob_names def add_retinanet_blobs(blobs, im_scales, roidb, image_width, image_height): """ 添加 RetinaNet blobs. 
""" # RetinaNet is applied to many feature levels, as in the FPN paper k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE num_aspect_ratios = len(cfg.RETINANET.ASPECT_RATIOS) aspect_ratios = cfg.RETINANET.ASPECT_RATIOS anchor_scale = cfg.RETINANET.ANCHOR_SCALE # get anchors from all levels for all scales/aspect ratios foas = [] for lvl in range(k_min, k_max + 1): stride = 2. ** lvl for octave in range(scales_per_octave): octave_scale = 2 ** (octave / float(scales_per_octave)) for idx in range(num_aspect_ratios): anchor_sizes = (stride * octave_scale * anchor_scale, ) anchor_aspect_ratios = (aspect_ratios[idx], ) foa = data_utils.get_field_of_anchors( stride, anchor_sizes, anchor_aspect_ratios, octave, idx) foas.append(foa) all_anchors = np.concatenate([f.field_of_anchors for f in foas]) blobs['retnet_fg_num'], blobs['retnet_bg_num'] = 0.0, 0.0 for im_i, entry in enumerate(roidb): scale = im_scales[im_i] im_height = np.round(entry['height'] * scale) im_width = np.round(entry['width'] * scale) gt_inds = np.where((entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] assert len(gt_inds) > 0, 'Empty ground truth empty for image is not allowed. Please check.' 
gt_rois = entry['boxes'][gt_inds, :] * scale gt_classes = entry['gt_classes'][gt_inds] im_info = np.array([[im_height, im_width, scale]], dtype=np.float32) blobs['im_info'].append(im_info) retinanet_blobs, fg_num, bg_num = _get_retinanet_blobs( foas, all_anchors, gt_rois, gt_classes, image_width, image_height) for i, foa in enumerate(foas): for k, v in retinanet_blobs[i].items(): # the way it stacks is: # [[anchors for image1] + [anchors for images 2]] level = int(np.log2(foa.stride)) key = '{}_fpn{}'.format(k, level) if k == 'retnet_roi_fg_bbox_locs': v[:, 0] = im_i # loc_stride: 80 * 4 if cls_specific else 4 loc_stride = 4 # 4 coordinate corresponding to bbox prediction if cfg.RETINANET.CLASS_SPECIFIC_BBOX: loc_stride *= (cfg.MODEL.NUM_CLASSES - 1) anchor_ind = foa.octave * num_aspect_ratios + foa.aspect # v[:, 1] is the class label [range 0-80] if we do # class-specfic bbox otherwise it is 0. In case of class # specific, based on the label, the location of current # anchor is class_label * 4 and then we take into account # the anchor_ind if the anchors v[:, 1] *= 4 v[:, 1] += loc_stride * anchor_ind blobs[key].append(v) blobs['retnet_fg_num'] += fg_num blobs['retnet_bg_num'] += bg_num blobs['retnet_fg_num'] = blobs['retnet_fg_num'].astype(np.float32) blobs['retnet_bg_num'] = blobs['retnet_bg_num'].astype(np.float32) N = len(roidb) for k, v in blobs.items(): if isinstance(v, list) and len(v) > 0: # compute number of anchors A = int(len(v) / N) # for the cls branch labels [per fpn level], # we have blobs['retnet_cls_labels_fpn{}'] as a list until this step # and length of this list is N x A where # N = num_images, A = num_anchors for example, N = 2, A = 9 # Each element of the list has the shape 1 x 1 x H x W where H, W are # spatial dimension of curret fpn lvl. Let a{i} denote the element # corresponding to anchor i [9 anchors total] in the list. 
# The elements in the list are in order [[a0, ..., a9], [a0, ..., a9]] # however the network will make predictions like 2 x (9 * 80) x H x W # so we first concatenate the elements of each image to a numpy array # and then concatenate the two images to get the 2 x 9 x H x W if k.find('retnet_cls_labels') >= 0: tmp = [] # concat anchors within an image for i in range(0, len(v), A): tmp.append(np.concatenate(v[i: i + A], axis=1)) # concat images blobs[k] = np.concatenate(tmp, axis=0) else: # for the bbox branch elements [per FPN level], # we have the targets and the fg boxes locations # in the shape: M x 4 where M is the number of fg locations in a # given image at the current FPN level. For the given level, # the bbox predictions will be. The elements in the list are in # order [[a0, ..., a9], [a0, ..., a9]] # Concatenate them to form M x 4 blobs[k] = np.concatenate(v, axis=0) return True def _get_retinanet_blobs( foas, all_anchors, gt_boxes, gt_classes, im_width, im_height): total_anchors = all_anchors.shape[0] logger.debug('Getting mad blobs: im_height {} im_width: {}'.format( im_height, im_width)) inds_inside = np.arange(all_anchors.shape[0]) anchors = all_anchors num_inside = len(inds_inside) logger.debug('total_anchors: {}'.format(total_anchors)) logger.debug('inds_inside: {}'.format(num_inside)) logger.debug('anchors.shape: {}'.format(anchors.shape)) # Compute anchor labels: # label=1 is positive, 0 is negative, -1 is don't care (ignore) labels = np.empty((num_inside, ), dtype=np.float32) labels.fill(-1) if len(gt_boxes) > 0: # Compute overlaps between the anchors and the gt boxes overlaps anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes) # Map from anchor to gt box that has highest overlap anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) # For each anchor, amount of overlap with most overlapping gt box anchor_to_gt_max = anchor_by_gt_overlap[ np.arange(num_inside), anchor_to_gt_argmax] # Map from gt box to an anchor that has highest 
overlap gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) # For each gt box, amount of overlap with most overlapping anchor gt_to_anchor_max = anchor_by_gt_overlap[ gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1])] # Find all anchors that share the max overlap amount # (this includes many ties) anchors_with_max_overlap = np.where( anchor_by_gt_overlap == gt_to_anchor_max)[0] # Fg label: for each gt use anchors with highest overlap # (including ties) gt_inds = anchor_to_gt_argmax[anchors_with_max_overlap] labels[anchors_with_max_overlap] = gt_classes[gt_inds] # Fg label: above threshold IOU inds = anchor_to_gt_max >= cfg.RETINANET.POSITIVE_OVERLAP gt_inds = anchor_to_gt_argmax[inds] labels[inds] = gt_classes[gt_inds] fg_inds = np.where(labels >= 1)[0] bg_inds = np.where(anchor_to_gt_max < cfg.RETINANET.NEGATIVE_OVERLAP)[0] labels[bg_inds] = 0 num_fg, num_bg = len(fg_inds), len(bg_inds) bbox_targets = np.zeros((num_inside, 4), dtype=np.float32) bbox_targets[fg_inds, :] = data_utils.compute_targets( anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :]) # Map up to original set of anchors labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1) bbox_targets = data_utils.unmap(bbox_targets, total_anchors, inds_inside, fill=0) # Split the generated labels, etc. 
into labels per each field of anchors blobs_out = [] start_idx = 0 for foa in foas: H = foa.field_size W = foa.field_size end_idx = start_idx + H * W _labels = labels[start_idx:end_idx] _bbox_targets = bbox_targets[start_idx:end_idx, :] start_idx = end_idx # labels output with shape (1, height, width) _labels = _labels.reshape((1, 1, H, W)) # bbox_targets output with shape (1, 4 * A, height, width) _bbox_targets = _bbox_targets.reshape((1, H, W, 4)).transpose(0, 3, 1, 2) stride = foa.stride w = int(im_width / stride) h = int(im_height / stride) # data for select_smooth_l1 loss num_classes = cfg.MODEL.NUM_CLASSES - 1 inds_4d = np.where(_labels > 0) M = len(inds_4d) _roi_bbox_targets = np.zeros((0, 4)) _roi_fg_bbox_locs = np.zeros((0, 4)) if M > 0: im_inds, y, x = inds_4d[0], inds_4d[2], inds_4d[3] _roi_bbox_targets = np.zeros((len(im_inds), 4)) _roi_fg_bbox_locs = np.zeros((len(im_inds), 4)) lbls = _labels[im_inds, :, y, x] for i, lbl in enumerate(lbls): l = lbl[0] - 1 if not cfg.RETINANET.CLASS_SPECIFIC_BBOX: l = 0 assert l >= 0 and l < num_classes, 'label out of the range' _roi_bbox_targets[i, :] = _bbox_targets[:, :, y[i], x[i]] _roi_fg_bbox_locs[i, :] = np.array([[0, l, y[i], x[i]]]) blobs_out.append(dict(retnet_cls_labels=_labels[:, :, 0:h, 0:w].astype(np.int32), retnet_roi_bbox_targets=_roi_bbox_targets.astype(np.float32), retnet_roi_fg_bbox_locs=_roi_fg_bbox_locs.astype(np.float32), ) ) out_num_fg = np.array([num_fg + 1.0], dtype=np.float32) out_num_bg = (np.array([num_bg + 1.0]) * (cfg.MODEL.NUM_CLASSES - 1) + out_num_fg * (cfg.MODEL.NUM_CLASSES - 2)) return blobs_out, out_num_fg, out_num_bg

转载请注明原文地址: https://www.6miu.com/read-2619659.html

技术

最新回复(0)