"""python 分別讀取train和valid的圖片和xml信息,創(chuàng)建用于訓(xùn)練和測試的json文件 """ defcreate_data_lists(voc07_path, voc12_path, output_folder): """ Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file. :param voc07_path: path to the 'VOC2007' folder :param voc12_path: path to the 'VOC2012' folder :param output_folder: folder where the JSONs must be saved """
    # Training data
    for path in [voc07_path, voc12_path]:
        # Find IDs of images in training data
        # Get the image ids of the train and val data used for training
        with open(os.path.join(path, 'ImageSets/Main/trainval.txt')) as f:
            ids = f.read().splitlines()
        # Parse each image's XML annotation file by its image id to get the annotation info
        for id in ids:
            # Parse annotation's XML file
            objects = parse_annotation(os.path.join(path, 'Annotations', id + '.xml'))
            if len(objects['boxes']) == 0:  # skip images without any objects
                continue
            n_objects += len(objects['boxes'])  # count the total number of objects
            train_objects.append(objects)  # store each image's annotations in the train_objects list
            train_images.append(os.path.join(path, 'JPEGImages', id + '.jpg'))  # store each image's path in train_images, used to read the image
    # Save to file
    # Save the training images' paths, annotations, and the class mapping as separate JSON files
    with open(os.path.join(output_folder, 'TRAIN_images.json'), 'w') as j:
        json.dump(train_images, j)
    with open(os.path.join(output_folder, 'TRAIN_objects.json'), 'w') as j:
        json.dump(train_objects, j)
    with open(os.path.join(output_folder, 'label_map.json'), 'w') as j:
        json.dump(label_map, j)  # save label map too
    print('\nThere are %d training images containing a total of %d objects. Files have been saved to %s.' % (
        len(train_images), n_objects, os.path.abspath(output_folder)))
    # Test data
    # Same as for the training data: save the test images' paths and annotations
    # as separate JSON files; refer to the comments above to understand this part
    test_images = list()
    test_objects = list()
    n_objects = 0
    # Find IDs of images in the test data
    with open(os.path.join(voc07_path, 'ImageSets/Main/test.txt')) as f:
        ids = f.read().splitlines()
    for id in ids:
        # Parse annotation's XML file
        objects = parse_annotation(os.path.join(voc07_path, 'Annotations', id + '.xml'))
        if len(objects['boxes']) == 0:
            continue
        test_objects.append(objects)
        n_objects += len(objects['boxes'])
        test_images.append(os.path.join(voc07_path, 'JPEGImages', id + '.jpg'))
    assert len(test_objects) == len(test_images)
    # Save to file
    with open(os.path.join(output_folder, 'TEST_images.json'), 'w') as j:
        json.dump(test_images, j)
    with open(os.path.join(output_folder, 'TEST_objects.json'), 'w') as j:
        json.dump(test_objects, j)
    print('\nThere are %d test images containing a total of %d objects. Files have been saved to %s.' % (
        len(test_images), n_objects, os.path.abspath(output_folder)))
```
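Having defined create_data_lists, invoking it is a one-liner. A minimal call sketch follows; the directory paths here are assumptions and should point at wherever your VOCdevkit data and desired output folder actually live:

```python
# Minimal usage sketch -- the paths below are assumptions, adjust to your setup
if __name__ == '__main__':
    create_data_lists(voc07_path='./data/VOCdevkit/VOC2007',
                      voc12_path='./data/VOCdevkit/VOC2012',
                      output_folder='./data/json')
```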
The code lives in the datasets.py script. As you can see, `PascalVOCDataset` inherits from `torch.utils.data.Dataset` and overrides four methods: `__init__`, `__getitem__`, `__len__`, and `collate_fn`. This is exactly the work we routinely need to do when building our own dataset. Read the code together with the comments below:
"""python PascalVOCDataset具體實現(xiàn)過程 """ import torch from torch.utils.data import Dataset import json import os from PIL import Image from utils import transform
class PascalVOCDataset(Dataset):
    """
    A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches.
    """
    # Initialize the relevant variables
    # and read the images and objects annotation info
    def __init__(self, data_folder, split, keep_difficult=False):
        """
        :param data_folder: folder where data files are stored
        :param split: split, one of 'TRAIN' or 'TEST'
        :param keep_difficult: keep or discard objects that are considered difficult to detect?
        """
        self.split = split.upper()  # force upper case so the split can be matched against {'TRAIN', 'TEST'}

        assert self.split in {'TRAIN', 'TEST'}

        self.data_folder = data_folder
        self.keep_difficult = keep_difficult

        # Read data files (the JSONs created by create_data_lists above)
        with open(os.path.join(data_folder, self.split + '_images.json'), 'r') as j:
            self.images = json.load(j)
        with open(os.path.join(data_folder, self.split + '_objects.json'), 'r') as j:
            self.objects = json.load(j)

        assert len(self.images) == len(self.objects)
    # We know that data is usually fed to the network for training one batch at a time,
    # while __getitem__ only reads a single image and its objects.
    # How do we assemble the individually read images and object annotations into batches?
    # collate_fn does exactly that:
    # for a batch of images, collate_fn stacks them into a 4-D tensor via torch.stack(),
    # while the corresponding object annotations are each kept in a list
    def collate_fn(self, batch):
        """
        Since each image may have a different number of objects, we need a collate function (to be passed to the DataLoader).
        This describes how to combine these tensors of different sizes. We use lists.

        Note: this need not be defined in this Class, can be standalone.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
        """
        images = list()
        boxes = list()
        labels = list()
        difficulties = list()

        for b in batch:
            images.append(b[0])
            boxes.append(b[1])
            labels.append(b[2])
            difficulties.append(b[3])

        # (N, 3, 224, 224) tensor of images; boxes, labels and difficulties remain lists of N tensors
        images = torch.stack(images, dim=0)

        return images, boxes, labels, difficulties
```
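The `__getitem__` and `__len__` overrides mentioned above are not reproduced in this excerpt. Below is a minimal sketch of what they need to do, assuming the JSON layout produced by create_data_lists and the `transform` function discussed next; the exact handling of difficult objects may differ from the real script:

```python
    # Sketch: methods to be placed inside PascalVOCDataset
    def __getitem__(self, i):
        # Read the i-th image
        image = Image.open(self.images[i], mode='r')
        image = image.convert('RGB')

        # Read its annotations as tensors
        objects = self.objects[i]
        boxes = torch.FloatTensor(objects['boxes'])                # (n_objects, 4)
        labels = torch.LongTensor(objects['labels'])               # (n_objects)
        difficulties = torch.ByteTensor(objects['difficulties'])   # (n_objects)

        # Optionally discard objects marked as difficult
        if not self.keep_difficult:
            keep = difficulties == 0
            boxes, labels, difficulties = boxes[keep], labels[keep], difficulties[keep]

        # Apply the TRAIN/TEST transform pipeline
        image, boxes, labels, difficulties = transform(image, boxes, labels, difficulties, split=self.split)

        return image, boxes, labels, difficulties

    def __len__(self):
        return len(self.images)
```

With the dataset in place, collate_fn is passed to the DataLoader so that each iteration yields one stacked image tensor plus per-image annotation lists; data_folder and batch_size below are assumptions:

```python
from torch.utils.data import DataLoader

train_dataset = PascalVOCDataset('./data/json', split='train', keep_difficult=False)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
                          collate_fn=train_dataset.collate_fn)
```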
"""python transform操作是訓(xùn)練模型中一項非常重要的工作,其中不僅包含數(shù)據(jù)增強(qiáng)以提升模型性能的相關(guān)操作,也包含如數(shù)據(jù)類型轉(zhuǎn)換(PIL to Tensor)、歸一化(Normalize)這些必要操作。 """ import json import os import torch import random import xml.etree.ElementTree as ET import torchvision.transforms.functional as FT
"""
Transforms applied in both TRAIN and TEST:
1. Resize the image to a uniform size of (224, 224): resize
2. PIL to Tensor
3. Normalization: FT.normalize()

Note 1: resize is also a geometric transformation. When applying data-augmentation
        strategies, you should know which operations are geometric and which are pixel-level.
Note 2: the PIL-to-Tensor and normalize operations must always be performed.
"""
def transform(image, boxes, labels, difficulties, split):
    """
    Apply the transformations above.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects)
    :param split: one of 'TRAIN' or 'TEST', since different sets of transformations are applied
    :return: transformed image, transformed bounding box coordinates, transformed labels, transformed difficulties
    """
    # The transform strategies used at training and test time usually differ,
    # so the split argument is needed to indicate whether this is a TRAIN or TEST transform
    assert split in {'TRAIN', 'TEST'}
    # Mean and standard deviation of ImageNet data that our base VGG from torchvision was trained on
    # see: https://pytorch.org/docs/stable/torchvision/models.html
    # To avoid training instability caused by large pixel-value differences between images,
    # every image is normalized before being fed into the network;
    # the mean and std are obtained per channel over all (ImageNet) images
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
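    # For example: after FT.to_tensor() maps pixels to [0, 1], normalization turns a
    # red-channel value x into (x - 0.485) / 0.229, giving the network roughly
    # zero-mean, unit-variance inputs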
    new_image = image
    new_boxes = boxes
    new_labels = labels
    new_difficulties = difficulties

    # Skip the following operations for evaluation/testing
    if split == 'TRAIN':
        # A series of photometric distortions in random order, each with 50% chance of occurrence, as in Caffe repo
        new_image = photometric_distort(new_image)
        # Convert PIL image to Torch tensor
        new_image = FT.to_tensor(new_image)
        # Expand image (zoom out) with a 50% chance - helpful for training detection of small objects
        # Fill surrounding space with the mean of ImageNet data that our base VGG was trained on
        if random.random() < 0.5:
            new_image, new_boxes = expand(new_image, boxes, filler=mean)
        # Convert Torch tensor to PIL image
        new_image = FT.to_pil_image(new_image)
        # Flip image with a 50% chance
        if random.random() < 0.5:
            new_image, new_boxes = flip(new_image, new_boxes)
    # Resize image to (224, 224) - this also converts absolute boundary coordinates to their fractional form
    new_image, new_boxes = resize(new_image, new_boxes, dims=(224, 224))
    # Convert PIL image to Torch tensor
    new_image = FT.to_tensor(new_image)
    # Normalize by mean and standard deviation of ImageNet data that our base VGG was trained on
    new_image = FT.normalize(new_image, mean=mean, std=std)

    return new_image, new_boxes, new_labels, new_difficulties
```
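Note 1 above deserves a concrete illustration: a geometric transform must update the box coordinates along with the pixels. Below is a minimal sketch of how a flip helper like the one called above can be implemented; this is a sketch of the idea, not necessarily this tutorial's exact code:

```python
def flip(image, boxes):
    """
    Flip a PIL image horizontally, and mirror its bounding boxes accordingly.

    :param image: a PIL Image
    :param boxes: bounding boxes in boundary coordinates (xmin, ymin, xmax, ymax),
                  a tensor of dimensions (n_objects, 4)
    :return: flipped image, updated bounding box coordinates
    """
    # Flip the pixels
    new_image = FT.hflip(image)

    # Mirror the x-coordinates: x' = W - 1 - x, swapping xmin/xmax so xmin < xmax still holds
    new_boxes = boxes.clone()
    new_boxes[:, 0] = image.width - 1 - boxes[:, 2]
    new_boxes[:, 2] = image.width - 1 - boxes[:, 0]

    return new_image, new_boxes
```

A pixel-level operation like photometric_distort, by contrast, leaves the boxes untouched, since object positions in the image do not change.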