updated dataset to do preprocessing only once

1c8d03d9 · Jakob · 5b15111f · 1c8d03d9 · 1c8d03d9
Commit 1c8d03d9 authored 2 years ago by Jakob
--- a/Yolo_train.py
+++ b/Yolo_train.py
@@ -68,7 +68,7 @@ if __name__ == '__main__':
    validation_losses = []

    print("creating train dataset and loader")
-    train_dataset = CocoDataSet("./data/train2014", "./data/labels/train2014", transform, 1) 
+    train_dataset = CocoDataSet("./data/train2014", "./data/labels/train2014", transform, 0.2) 
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                            shuffle=True, num_workers=0)


--- a/cocoDataset.py
+++ b/cocoDataset.py
 from torch.utils.data import Dataset
-import torch
+import os
 from pathlib import Path
 from skimage import io
 from skimage.color import gray2rgb
-from random import random
+from random import random, sample
 from utils import toMatrix
+import math

 from PIL import Image

@@ -17,53 +18,35 @@ class CocoDataSet(Dataset):
        self.transform = transform
        img_dir = f"{Path().resolve()}\{img_dir}"
        label_dir = f"{Path().resolve()}\{label_dir}"
+        self.person_id = 0
+
+        self.preprocess_data(img_dir, label_dir)

        self.person_images = 0
        self.non_person_images = 0

-        x = []
-        y = []

-        img_names = [file.stem for file in list(Path(img_dir).glob("*.jpg"))]
-        annotation_file_names = [file.stem for file in list(Path(label_dir).glob("*.txt"))]
-        # If both img and annotation file exists add to (x,y)
-        for img_name in list(set(img_names) & set(annotation_file_names)):
-            x.append(f"{img_dir}\{img_name}.jpg")
-            y.append(f"{label_dir}\{img_name}.txt")
+        self.x = [str(file) for file in list(Path(fr"{img_dir}\person").glob("*.jpg"))]
+        self.y = [str(file) for file in list(Path(fr"{label_dir}\person").glob("*.txt"))]

-        self.person_id = 0
+        # if 1.0 or above, include all files
+        if percentage_without_person >= 1.0:
+            self.x.extend([str(file) for file in list(Path(fr"{img_dir}").glob("*.jpg"))])
+            self.y.extend([str(file) for file in list(Path(fr"{label_dir}").glob("*.txt"))])
        
-        self.x = []
-        self.y = []
+        # otherwise, if above 0.0, add a random sample of the remaining files
+        elif percentage_without_person > 0.0:
+            x_without_person = [file.stem for file in list(Path(fr"{img_dir}").glob("*.jpg"))]
+            y_without_person = [file.stem for file in list(Path(fr"{label_dir}").glob("*.txt"))]

-        if percentage_without_person < 1.0:
-    
-            for i, image, annotation_file in zip(range(len(x)),x,y):
-                print(f"{i}/{len(x)}", end="\r")
-
-                labels = self.read_annotation_file(annotation_file)
-                # with open(label_file) as file:
-                #     labels = [line.strip(" \n").split(" ") for line in file.readlines()]
-
-                image_has_persons = self.person_id in [label[0] for label in labels]
-
-                # include if there is person in the image
-                if image_has_persons:
-                    self.x.append(image)
-                    self.y.append(annotation_file)
-                    # self.y.append(persons)
-                    self.person_images += 1
-                # include one element with prob. if no person is on the image
-                elif percentage_without_person > 0.0:
-                    if percentage_without_person > random():
-                        self.x.append(image)
-                        self.y.append(annotation_file)
-                        # self.y.append([]) # one element with label 0 as confidence
-                        self.non_person_images += 1
-        else:
-            self.x = x                    
-            self.y = y                    
+            images_with_annotations = list(set(x_without_person) & set(y_without_person))
+
+            images_without_person_to_add = sample(images_with_annotations, math.floor(random() * len(images_with_annotations)))

+            for file_name in images_without_person_to_add:
+                self.x.append(fr"{img_dir}\{file_name}.jpg")
+                self.y.append(fr"{label_dir}\{file_name}.txt")
+            

    def __len__(self):
        return len(self.x)
@@ -111,6 +94,39 @@ class CocoDataSet(Dataset):
        data = [[int(point[0]), *point[1:]] for point in data]
        return data

+    def preprocess_data(self, img_dir, label_dir):
+        """Move every image that contains a person into a ``person`` sub-folder.
+
+        The image/annotation pairs found directly in ``img_dir`` and
+        ``label_dir`` are scanned; each pair whose annotation file has at
+        least one label whose first element equals ``self.person_id`` is
+        moved into the ``person`` sub-folder of ``img_dir`` / ``label_dir``.
+        When both sub-folders already exist the data is treated as already
+        preprocessed and nothing is done.
+
+        Args:
+            img_dir: Directory holding the ``*.jpg`` images.
+            label_dir: Directory holding the ``*.txt`` annotation files.
+
+        Raises:
+            OSError: If a ``person`` folder cannot be created or a file
+                cannot be moved.
+        """
+        person_img_dir = os.path.join(img_dir, "person")
+        person_label_dir = os.path.join(label_dir, "person")
+
+        # Both folders present -> an earlier run already sorted the files.
+        # NOTE(review): an interrupted earlier run also leaves both folders
+        # behind, so a half-finished split is not resumed here.
+        if os.path.isdir(person_img_dir) and os.path.isdir(person_label_dir):
+            return
+
+        # exist_ok covers the case where only one of the two folders exists;
+        # parents are deliberately not created so a wrong data path still fails.
+        Path(person_img_dir).mkdir(exist_ok=True)
+        Path(person_label_dir).mkdir(exist_ok=True)
+
+        img_names = {file.stem for file in Path(img_dir).glob("*.jpg")}
+        annotation_file_names = {file.stem for file in Path(label_dir).glob("*.txt")}
+
+        # Only pairs where both the image and its annotation file exist.
+        images_with_annotations = list(img_names & annotation_file_names)
+        total = len(images_with_annotations)
+        for i, img_name in enumerate(images_with_annotations):
+            img_file_name = f"{img_name}.jpg"
+            ann_file_name = f"{img_name}.txt"
+            image = os.path.join(img_dir, img_file_name)
+            annotation_file = os.path.join(label_dir, ann_file_name)
+
+            print(f"Preprocessing: {i}/{total}", end="\r")
+
+            labels = self.read_annotation_file(annotation_file)
+
+            # The first element of each label is compared with the person id.
+            image_has_persons = any(label[0] == self.person_id for label in labels)
+            # move files if there is person in the image
+            if image_has_persons:
+                os.rename(annotation_file, os.path.join(person_label_dir, ann_file_name))
+                os.rename(image, os.path.join(person_img_dir, img_file_name))
+        print("Done Preprocessing!", end=f"{' '*20}\n")
+
+
+
 if __name__ == "__main__":
    from PIL import Image