概述
由于CPU版本速度太慢,真正应用到实际环境中只能通过CUDA或者OpenCL实现,所以我仅介绍CUDA版本。
detection_output_layer层的输入可以参考Caffe框架下SSD算法源码综述。它由hpp、cpp和cu实现。参看DetectionOutputLayer::Forward_gpu():前向传播先通过DecodeBBoxesGPU函数将预测得到的检测框进行解码操作,再通过PermuteDataGPU函数重新reshape一下类别的预测值。从deploy.prototxt可以看出,conf在作为detection_output_layer的输入之前已经做过softmax,所以我们不需要在detection_output_layer中再做softmax。然后对上述两个结果进行处理:对不同的类别分别应用非极大值抑制算法(各类别间的非极大值抑制是相互独立的),最终将处理后的数据放入输出层中。源码中还有存储结果的操作,不是必要项,所以不解析。
源码解析
和常规的layer层一样,detection_output_layer主要由Forward和Backward组成,但没有实现Backward。
前向传播使用到的函数有:
DecodeBBoxesGPU函数、PermuteDataGPU函数、ApplyNMSFast函数
Forward_gpu
// GPU forward pass of the SSD detection output layer.
//
// Inputs (per the accesses below):
//   bottom[0] - location predictions (loc_data), decoded against the priors.
//   bottom[1] - per-class confidence predictions (per the surrounding
//               write-up, softmax has already been applied upstream).
//   bottom[2] - prior boxes and their variances (prior_data).
// Output:
//   top[0]    - one 7-value row per kept detection:
//               [image_id, label, score, xmin, ymin, xmax, ymax].
//
// Pipeline: decode boxes on the GPU, permute boxes/confidences into a
// class-major layout, run per-class NMS on the CPU, optionally keep only
// the keep_top_k_ highest-scoring detections per image, then write the
// final rows into top[0].  No Backward is implemented for this layer.
template <typename Dtype>
void DetectionOutputLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const Dtype* loc_data = bottom[0]->gpu_data();
  const Dtype* prior_data = bottom[2]->gpu_data();
  const int num = bottom[0]->num();  // batch size
  // Decode the raw location predictions into actual box coordinates.
  Dtype* bbox_data = bbox_preds_.mutable_gpu_data();
  const int loc_count = bbox_preds_.count();
  const bool clip_bbox = false;  // clipping to [0,1] is deferred (done later, if at all)
  DecodeBBoxesGPU<Dtype>(loc_count, loc_data, prior_data, code_type_,
      variance_encoded_in_target_, num_priors_, share_location_,
      num_loc_classes_, background_label_id_, clip_bbox, bbox_data);
  // Bring the decoded boxes to the CPU; when locations are per-class,
  // first permute them into class-major order.
  const Dtype* bbox_cpu_data;
  if (!share_location_) {
    Dtype* bbox_permute_data = bbox_permute_.mutable_gpu_data();
    PermuteDataGPU<Dtype>(loc_count, bbox_data, num_loc_classes_, num_priors_,
        4, bbox_permute_data);
    bbox_cpu_data = bbox_permute_.cpu_data();
  } else {
    bbox_cpu_data = bbox_preds_.cpu_data();
  }
  // Permute confidences into class-major order and bring them to the CPU.
  Dtype* conf_permute_data = conf_permute_.mutable_gpu_data();
  PermuteDataGPU<Dtype>(bottom[1]->count(), bottom[1]->gpu_data(),
      num_classes_, num_priors_, 1, conf_permute_data);
  const Dtype* conf_cpu_data = conf_permute_.cpu_data();
  int num_kept = 0;  // total detections kept across the batch
  // Per image: map from class label -> indices of kept priors.
  vector<map<int, vector<int> > > all_indices;
  for (int i = 0; i < num; ++i) {
    map<int, vector<int> > indices;
    int num_det = 0;
    // Start of image i's confidences in the class-major layout.
    const int conf_idx = i * num_classes_ * num_priors_;
    int bbox_idx;
    if (share_location_) {
      bbox_idx = i * num_priors_ * 4;
    } else {
      bbox_idx = conf_idx * 4;
    }
    // Run NMS independently for each foreground class.
    for (int c = 0; c < num_classes_; ++c) {
      if (c == background_label_id_) {
        continue;  // background produces no detections
      }
      const Dtype* cur_conf_data = conf_cpu_data + conf_idx + c * num_priors_;
      const Dtype* cur_bbox_data = bbox_cpu_data + bbox_idx;
      if (!share_location_) {
        cur_bbox_data += c * num_priors_ * 4;  // jump to class c's boxes
      }
      ApplyNMSFast(cur_bbox_data, cur_conf_data, num_priors_,
          confidence_threshold_, nms_threshold_, eta_, top_k_, &(indices[c]));
      num_det += indices[c].size();
    }
    if (keep_top_k_ > -1 && num_det > keep_top_k_) {
      // Too many detections survived NMS: keep only the keep_top_k_
      // highest-scoring ones across all classes of this image.
      vector<pair<float, pair<int, int> > > score_index_pairs;
      for (map<int, vector<int> >::iterator it = indices.begin();
           it != indices.end(); ++it) {
        int label = it->first;
        const vector<int>& label_indices = it->second;
        for (int j = 0; j < label_indices.size(); ++j) {
          int idx = label_indices[j];
          float score = conf_cpu_data[conf_idx + label * num_priors_ + idx];
          score_index_pairs.push_back(std::make_pair(
              score, std::make_pair(label, idx)));
        }
      }
      std::sort(score_index_pairs.begin(), score_index_pairs.end(),
                SortScorePairDescend<pair<int, int> >);
      score_index_pairs.resize(keep_top_k_);
      // Rebuild the label -> indices map from the truncated list.
      map<int, vector<int> > new_indices;
      for (int j = 0; j < score_index_pairs.size(); ++j) {
        int label = score_index_pairs[j].second.first;
        int idx = score_index_pairs[j].second.second;
        new_indices[label].push_back(idx);
      }
      all_indices.push_back(new_indices);
      num_kept += keep_top_k_;
    } else {
      all_indices.push_back(indices);
      num_kept += num_det;
    }
  }
  // Output shape: [1, 1, num_kept, 7].
  vector<int> top_shape(2, 1);
  top_shape.push_back(num_kept);
  top_shape.push_back(7);
  Dtype* top_data;
  if (num_kept == 0) {
    LOG(INFO) << "Couldn't find any detections";
    // Emit one all -1 placeholder row per image (only the image id is set)
    // so downstream consumers still see a well-formed blob.
    top_shape[2] = num;
    top[0]->Reshape(top_shape);
    top_data = top[0]->mutable_cpu_data();
    caffe_set<Dtype>(top[0]->count(), -1, top_data);
    for (int i = 0; i < num; ++i) {
      top_data[0] = i;
      top_data += 7;
    }
  } else {
    top[0]->Reshape(top_shape);
    top_data = top[0]->mutable_cpu_data();
  }
  // Second pass: write the kept detections into the output blob.
  int count = 0;  // running row index into top_data
  for (int i = 0; i < num; ++i) {
    const int conf_idx = i * num_classes_ * num_priors_;
    int bbox_idx;
    if (share_location_) {
      bbox_idx = i * num_priors_ * 4;
    } else {
      bbox_idx = conf_idx * 4;
    }
    for (map<int, vector<int> >::iterator it = all_indices[i].begin();
         it != all_indices[i].end(); ++it) {
      int label = it->first;
      vector<int>& indices = it->second;
      const Dtype* cur_conf_data =
          conf_cpu_data + conf_idx + label * num_priors_;
      const Dtype* cur_bbox_data = bbox_cpu_data + bbox_idx;
      if (!share_location_) {
        cur_bbox_data += label * num_priors_ * 4;
      }
      for (int j = 0; j < indices.size(); ++j) {
        int idx = indices[j];
        // Row layout: [image_id, label, score, xmin, ymin, xmax, ymax].
        top_data[count * 7] = i;
        top_data[count * 7 + 1] = label;
        top_data[count * 7 + 2] = cur_conf_data[idx];
        for (int k = 0; k < 4; ++k) {
          top_data[count * 7 + 3 + k] = cur_bbox_data[idx * 4 + k];
        }
        ++count;
      }
    }
  }
}
DecodeBBoxesGPU实现:
DecodeBBoxesGPU使用到了DecodeBBoxesKernel核函数
// Host-side launcher: decodes nthreads predicted coordinates on the GPU by
// dispatching DecodeBBoxesKernel with Caffe's standard grid/block sizing
// (CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS), then checks for launch errors.
// All parameters are forwarded unchanged to the kernel; bbox_data receives
// the decoded boxes.
template <typename Dtype>
void DecodeBBoxesGPU(const int nthreads,
    const Dtype* loc_data, const Dtype* prior_data,
    const CodeType code_type, const bool variance_encoded_in_target,
    const int num_priors, const bool share_location,
    const int num_loc_classes, const int background_label_id,
    const bool clip_bbox, Dtype* bbox_data) {
  DecodeBBoxesKernel<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
      CAFFE_CUDA_NUM_THREADS>>>(nthreads, loc_data, prior_data, code_type,
      variance_encoded_in_target, num_priors, share_location, num_loc_classes,
      background_label_id, clip_bbox, bbox_data);
  CUDA_POST_KERNEL_CHECK;
}
DecodeBBoxesKernel核函数实现:
// CUDA kernel: decode one predicted box coordinate per thread.
//
// Index layout (from the modular decomposition below):
//   index = ((n * num_priors + d) * num_loc_classes + c) * 4 + i
// where i is the coordinate (0..3 = xmin, ymin, xmax, ymax), c the loc
// class, and d the prior-box id.  prior_data stores all prior coordinates
// first, with the matching variances starting num_priors * 4 entries later.
template <typename Dtype>
__global__ void DecodeBBoxesKernel(const int nthreads,
    const Dtype* loc_data, const Dtype* prior_data,
    const CodeType code_type, const bool variance_encoded_in_target,
    const int num_priors, const bool share_location,
    const int num_loc_classes, const int background_label_id,
    const bool clip_bbox, Dtype* bbox_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int i = index % 4;                                   // coordinate id
    const int c = (index / 4) % num_loc_classes;               // loc class
    const int d = (index / 4 / num_loc_classes) % num_priors;  // prior id
    // With per-class locations, the background class has no box to decode.
    if (!share_location && c == background_label_id) {
      return;
    }
    const int pi = d * 4;                // start of prior d's coordinates
    const int vi = pi + num_priors * 4;  // start of prior d's variances
    if (code_type == PriorBoxParameter_CodeType_CORNER) {
      // CORNER: prediction is an additive offset on each corner coordinate,
      // scaled by the prior variance unless the variance was already
      // folded into the target during encoding.
      if (variance_encoded_in_target) {
        bbox_data[index] = prior_data[pi + i] + loc_data[index];
      } else {
        bbox_data[index] =
            prior_data[pi + i] + loc_data[index] * prior_data[vi + i];
      }
    } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) {
      // CENTER_SIZE: prediction encodes center offsets relative to the
      // prior's size plus log-space width/height scale factors.
      const Dtype p_xmin = prior_data[pi];
      const Dtype p_ymin = prior_data[pi + 1];
      const Dtype p_xmax = prior_data[pi + 2];
      const Dtype p_ymax = prior_data[pi + 3];
      const Dtype prior_width = p_xmax - p_xmin;
      const Dtype prior_height = p_ymax - p_ymin;
      const Dtype prior_center_x = (p_xmin + p_xmax) / 2.;
      const Dtype prior_center_y = (p_ymin + p_ymax) / 2.;
      // index - i points at this box's first prediction, so all four
      // encoded values can be read regardless of which coordinate this
      // thread is producing.  (Despite the names, these hold the encoded
      // cx, cy, w, h values, not corners.)
      const Dtype xmin = loc_data[index - i];
      const Dtype ymin = loc_data[index - i + 1];
      const Dtype xmax = loc_data[index - i + 2];
      const Dtype ymax = loc_data[index - i + 3];
      Dtype decode_bbox_center_x, decode_bbox_center_y;
      Dtype decode_bbox_width, decode_bbox_height;
      if (variance_encoded_in_target) {
        // Variance already baked into the targets: decode directly.
        decode_bbox_center_x = xmin * prior_width + prior_center_x;
        decode_bbox_center_y = ymin * prior_height + prior_center_y;
        decode_bbox_width = exp(xmax) * prior_width;
        decode_bbox_height = exp(ymax) * prior_height;
      } else {
        // Undo the per-coordinate variance scaling before decoding.
        decode_bbox_center_x =
            prior_data[vi] * xmin * prior_width + prior_center_x;
        decode_bbox_center_y =
            prior_data[vi + 1] * ymin * prior_height + prior_center_y;
        decode_bbox_width =
            exp(prior_data[vi + 2] * xmax) * prior_width;
        decode_bbox_height =
            exp(prior_data[vi + 3] * ymax) * prior_height;
      }
      // Convert (center, size) back to the corner this thread owns.
      switch (i) {
        case 0:
          bbox_data[index] = decode_bbox_center_x - decode_bbox_width / 2.;
          break;
        case 1:
          bbox_data[index] = decode_bbox_center_y - decode_bbox_height / 2.;
          break;
        case 2:
          bbox_data[index] = decode_bbox_center_x + decode_bbox_width / 2.;
          break;
        case 3:
          bbox_data[index] = decode_bbox_center_y + decode_bbox_height / 2.;
          break;
      }
    } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) {
      // CORNER_SIZE: like CORNER, but the offset is additionally scaled by
      // the prior's width (for x coordinates) or height (for y coordinates).
      const Dtype p_xmin = prior_data[pi];
      const Dtype p_ymin = prior_data[pi + 1];
      const Dtype p_xmax = prior_data[pi + 2];
      const Dtype p_ymax = prior_data[pi + 3];
      const Dtype prior_width = p_xmax - p_xmin;
      const Dtype prior_height = p_ymax - p_ymin;
      Dtype p_size;
      if (i == 0 || i == 2) {
        p_size = prior_width;   // xmin / xmax scale with width
      } else {
        p_size = prior_height;  // ymin / ymax scale with height
      }
      if (variance_encoded_in_target) {
        bbox_data[index] = prior_data[pi + i] + loc_data[index] * p_size;
      } else {
        bbox_data[index] =
            prior_data[pi + i] + loc_data[index] * prior_data[vi + i] * p_size;
      }
    } else {
      // Unknown code type: leave bbox_data[index] untouched.
    }
    if (clip_bbox) {
      // Clamp the decoded coordinate into [0, 1].
      bbox_data[index] = max(min(bbox_data[index], Dtype(1.)), Dtype(0.));
    }
  }
}
PermuteDataGPU实现
PermuteDataGPU使用到了PermuteDataKernel核函数
// Host-side launcher: permutes nthreads elements on the GPU via
// PermuteDataKernel, using Caffe's standard grid/block sizing, then checks
// for launch errors.  Parameters are forwarded unchanged; new_data receives
// the permuted copy of data.
template <typename Dtype>
void PermuteDataGPU(const int nthreads,
    const Dtype* data, const int num_classes, const int num_data,
    const int num_dim, Dtype* new_data) {
  PermuteDataKernel<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
      CAFFE_CUDA_NUM_THREADS>>>(nthreads, data, num_classes, num_data,
      num_dim, new_data);
  CUDA_POST_KERNEL_CHECK;
}
PermuteDataKernel实现
// CUDA kernel: one thread per element, copying from a data-major layout to
// a class-major layout.
//
// Input layout (from the modular decomposition below):
//   index     = ((n * num_data + d) * num_classes + c) * num_dim + i
// Output layout:
//   new_index = ((n * num_classes + c) * num_data + d) * num_dim + i
// i.e. the num_data and num_classes axes are swapped.
template <typename Dtype>
__global__ void PermuteDataKernel(const int nthreads,
    const Dtype* data, const int num_classes, const int num_data,
    const int num_dim, Dtype* new_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int i = index % num_dim;                             // inner dim
    const int c = (index / num_dim) % num_classes;             // class
    const int d = (index / num_dim / num_classes) % num_data;  // data item
    const int n = index / num_dim / num_classes / num_data;    // batch item
    const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i;
    new_data[new_index] = data[index];
  }
}
ApplyNMSFast实现
ApplyNMSFast使用到了GetMaxScoreIndex函数
template
<typename Dtype
>
void ApplyNMSFast(const Dtype
* bboxes
, const Dtype
* scores
, const int num
,
const float score_threshold
, const float nms_threshold
,
const float eta
, const int top_k
, vector
<int>* indices
) {
vector
<pair
<Dtype
, int> > score_index_vec
;
GetMaxScoreIndex(scores
, num
, score_threshold
, top_k
, &score_index_vec
);
float adaptive_threshold
= nms_threshold
;
indices
->clear();
while (score_index_vec
.size() != 0) {
const int idx
= score_index_vec
.front().second
;
bool keep
= true
;
for (int k
= 0; k
< indices
->size(); ++k
) {
if (keep
) {
const int kept_idx
= (*indices
)[k
];
float overlap
= JaccardOverlap(bboxes
+ idx
* 4, bboxes
+ kept_idx
* 4);
keep
= overlap
<= adaptive_threshold
;
} else {
break;
}
}
if (keep
) {
indices
->push_back(idx
);
}
score_index_vec
.erase(score_index_vec
.begin());
if (keep
&& eta
< 1 && adaptive_threshold
> 0.5) {
adaptive_threshold
*= eta
;
}
}
}
GetMaxScoreIndex实现
// Collect the indices of all scores strictly above threshold, paired with
// their score, sorted by descending score, and truncated to at most top_k
// entries (top_k == -1 means "keep all").
//
// scores          - array of num confidence values.
// threshold       - strict lower bound for inclusion.
// top_k           - maximum number of pairs to keep, or -1 for unlimited.
// score_index_vec - output vector of (score, index) pairs (appended to).
template <typename Dtype>
void GetMaxScoreIndex(const Dtype* scores, const int num, const float threshold,
    const int top_k, vector<pair<Dtype, int> >* score_index_vec) {
  // Gather every (score, index) pair that clears the threshold.
  for (int idx = 0; idx < num; ++idx) {
    const Dtype score = scores[idx];
    if (score > threshold) {
      score_index_vec->push_back(std::make_pair(score, idx));
    }
  }
  // Highest scores first.
  std::sort(score_index_vec->begin(), score_index_vec->end(),
            SortScorePairDescend<int>);
  // Truncate to the top_k best entries when a cap was requested.
  const bool capped = (top_k > -1) &&
      (static_cast<std::size_t>(top_k) < score_index_vec->size());
  if (capped) {
    score_index_vec->resize(top_k);
  }
}
后记
竟然使用了一下午的时间注释源码。