Abstract
The first step in an object detection competition is a thorough data analysis: aspect ratios, the proportion of each class, and so on. There are two ways to parse the annotation JSON file: one is to use the cocoAPI, which is quite convenient (a minimal sketch is shown right after the first code block below); the other is to read the JSON file directly with the standard json module. All of the code below is run in a Jupyter notebook, so variables from one cell remain available in the next cell; if you use PyCharm instead, re-copy the code segments you need into a single script.
Data Analysis
Count the categories and the id corresponding to each category.
import json
with open('instances_val2017.json') as f:
    a = json.load(f)
print('category information:')
print('number of categories:', len(a['categories']))
a['categories']
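For reference, here is a minimal sketch of the cocoAPI route mentioned above, assuming pycocotools is installed and the same instances_val2017.json file is used; the rest of this post sticks to the plain json module.
from pycocotools.coco import COCO
coco = COCO('instances_val2017.json')               # parse the annotation file
print('number of categories:', len(coco.getCatIds()))
print('number of images:', len(coco.getImgIds()))
print('number of annotations:', len(coco.getAnnIds()))
# annotations of the first image, as an example
first_img_id = coco.getImgIds()[0]
anns = coco.loadAnns(coco.getAnnIds(imgIds=first_img_id))
print('bboxes in the first image:', len(anns))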
Count the images and the annotations.
print('number of images:', len(a['images']))
print('number of annotations:', len(a['annotations']))
Count how many images there are of each size; in my dataset the image sizes are not all the same.
total = []
for img in a['images']:
    wh = (img['width'], img['height'])
    total.append(wh)
unique = set(total)
for k in unique:
    print('number of images with size (%d, %d):' % k, total.count(k))
Check whether any images are duplicated: if the counts do not match the number of unique ids, there are duplicates.
ids = []
images_id = []
for i in a['annotations']:
    ids.append(i['id'])
    images_id.append(i['image_id'])
print('number of annotation ids: ', len(ids))
print('number of unique ids: ', len(set(ids)))
print('number of unique image_ids: ', len(set(images_id)))
Count how many annotations each category has.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['figure.figsize'] = (10.0, 10.0)
category_dic = dict([(i['id'], i['name']) for i in a['categories']])
print(category_dic)
counts_label_all = dict([(i['name'], 0) for i in a['categories']])
for i in a['annotations']:
    counts_label_all[category_dic[i['category_id']]] += 1
print("annotation count per category in the whole set: {}".format(counts_label_all))
Visualize the proportion of each category; because the class names are Chinese, the corresponding ids are used instead of the names.
plt.style.use({'figure.figsize': (10, 10)})
indexs = category_dic.keys()
values = counts_label_all.values()
Count_df = pd.DataFrame(list(values), index=indexs)
Count_df.plot(kind='pie', y=Count_df.columns[0], legend=False)
Analyze how many bboxes each image contains; some images may contain many. For example, the bar at 1 shows that about 1500 images contain exactly one bbox, while the bars greater than 1 correspond to images with multiple boxes.
plt.style.use({'figure.figsize': (15, 8)})  # figure width 15, height 8
annoto_count = {}
for i in a['annotations']:
    annoto_count[i['image_id']] = annoto_count.setdefault(i['image_id'], 0) + 1
indes_list = set(annoto_count.values())
values_count = [list(annoto_count.values()).count(i) for i in indes_list]
pd.DataFrame(values_count, index=indes_list, columns=['number of annotations']).plot(kind='bar')
Analyze the bbox aspect ratios, which is useful when adjusting the anchor ratios for mmdetection Cascade-series configs (a config sketch follows the plot below).
# Collect the width/height statistics of every annotation
anntotations = []
for i in a['annotations']:
    an = i
    anntotations.append(an)
print(anntotations[1])
data = []
per_sample = {}
for img in a['images']:
    sample_img = img
    annota_list = []  # annotations belonging to this image
    for per in anntotations:
        if per['image_id'] == img['id']:  # gather all anns of this image into annota_list
            annota_list.append(per)
    for k in annota_list:
        anntotations.remove(k)  # remove the anns already assigned from the global list
    sample_img['annotations'] = annota_list
    data.append(sample_img)
total_size = []
total_height = []
total_wh = []
for im in data:  # per-image information
    for b in im['annotations']:  # every annotation of this image
        # total_width += [b['bbox'][2]]
        # total_height += [b['bbox'][3]]
        wh = round(b['bbox'][2] / b['bbox'][3], 0)
        if wh < 1:
            wh = round(b['bbox'][3] / b['bbox'][2], 0)  # always take long side / short side
        total_wh += [wh]
# aspect ratio of every annotation
box_wh_unique = list(set(total_wh))
box_wh_count = [total_wh.count(i) for i in box_wh_unique]
bbox_wh_dict = {}
for i, key in enumerate(box_wh_unique):
    print('aspect ratio {}: count: {}'.format(key, box_wh_count[i]))
# plot
wh_df = pd.DataFrame(box_wh_count, index=box_wh_unique, columns=['aspect ratio count'])
wh_df.plot(kind='bar', color="#55aacc")
plt.show()
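As referenced above, here is a sketch of where these statistics typically feed into an mmdetection-style config, assuming an mmdetection v2.x Cascade R-CNN config; the ratio values below are placeholders and should be replaced according to the plot above.
# Hypothetical fragment of an mmdetection v2.x-style config; only the anchor part is shown.
model = dict(
    rpn_head=dict(
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            # default ratios are [0.5, 1.0, 2.0]; extend them if the analysis above
            # shows many very wide or very tall boxes (placeholder values)
            ratios=[0.2, 0.5, 1.0, 2.0, 5.0],
            strides=[4, 8, 16, 32, 64])))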
Draw the ground-truth boxes on every image and save a copy, so that problems can later be compared and analyzed against the predictions.
import tqdm
import cv2
import json
for ann_img in tqdm.tqdm(a['images']):
    img = cv2.imread('val2017/' + ann_img['file_name'])
    img_id = ann_img['id']
    for ann_ann in a['annotations']:
        if ann_ann['image_id'] == img_id:
            # COCO bboxes are [x, y, w, h] floats; cv2.rectangle needs integer corners
            x1 = int(ann_ann['bbox'][0])
            y1 = int(ann_ann['bbox'][1])
            x2 = int(ann_ann['bbox'][0] + ann_ann['bbox'][2])
            y2 = int(ann_ann['bbox'][1] + ann_ann['bbox'][3])
            img = cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 8)
    cv2.imwrite('./nn/' + ann_img['file_name'], img)
Compare the class distribution between my own training set and validation set.
import json
with open('instances_train2017.json') as f:
    train = json.load(f)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['figure.figsize'] = (10.0, 10.0)
category_dic = dict([(i['id'], i['name']) for i in train['categories']])
counts_label_trainall = dict([(i['name'], 0) for i in train['categories']])
for i in train['annotations']:
    counts_label_trainall[category_dic[i['category_id']]] += 1
print("annotation count per category in the training set: {}".format(counts_label_trainall))
This step loads the training-set annotations; everything above was based on the validation set.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
labels = []
for i in category_dic:
    labels.append(i)
print(counts_label_all)
val_means = []
for i in counts_label_all:
    nn = counts_label_all[i]
    val_means.append(nn)
aug_train_means = []
for i in counts_label_trainall:
    nn = counts_label_trainall[i]
    aug_train_means.append(nn)
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, val_means, width, label='val')
rects2 = ax.bar(x + width/2, aug_train_means, width, label='train')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Annotation Number')
ax.set_title('Annotation number per class: validation vs. training set')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)
fig.tight_layout()
plt.show()
The proportions look roughly reasonable; if the class distribution of the training and validation sets is badly imbalanced, you can re-split the data (a sketch of a random re-split is given below).
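For completeness, here is a minimal sketch of one way to re-split a COCO-style annotation file into new train/val subsets by randomly splitting image ids; the merged input file name instances_all.json, the 0.9 split ratio, and the output file names are assumptions for illustration only.
import json
import random
with open('instances_all.json') as f:        # hypothetical merged annotation file
    all_data = json.load(f)
random.seed(0)
img_ids = [img['id'] for img in all_data['images']]
random.shuffle(img_ids)
split = int(0.9 * len(img_ids))              # assumed 90/10 split
train_ids = set(img_ids[:split])
def subset(keep_ids):
    # keep only the images whose id is in keep_ids, plus their annotations
    return {
        'categories': all_data['categories'],
        'images': [i for i in all_data['images'] if i['id'] in keep_ids],
        'annotations': [ann for ann in all_data['annotations'] if ann['image_id'] in keep_ids],
    }
with open('instances_train_new.json', 'w') as f:
    json.dump(subset(train_ids), f)
with open('instances_val_new.json', 'w') as f:
    json.dump(subset(set(img_ids[split:])), f)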
Summary
This post covered the common techniques for analyzing a detection dataset; the code is also fairly easy to write.