From c882af412c45faa9870f26a1f5fb0e65797d9dbc Mon Sep 17 00:00:00 2001 From: Illia Oleksiienko <io@ece.au.dk> Date: Tue, 18 Oct 2022 22:58:47 +0000 Subject: [PATCH] Add tapp, uncertainty outputs --- run/eval_tracking_3d.py | 225 +++++++++++++ run/train_3d.py | 72 +++- src/opendr/engine/target.py | 307 +++++++++++++++++- .../configs/tanet/car/tapp_16.proto | 173 ++++++++++ .../configs/tanet/car/vnn_tapp_16.proto | 173 ++++++++++ .../second_detector/data/kitti_common.py | 14 +- .../pytorch/core/box_coders.py | 4 + .../pytorch/core/box_torch_ops.py | 84 +++++ .../second_detector/pytorch/models/tanet.py | 18 +- .../pytorch/models/voxelnet.py | 209 ++++++++++-- .../second_detector/run.py | 28 +- .../voxel_object_detection_3d_learner.py | 68 +++- 12 files changed, 1312 insertions(+), 63 deletions(-) create mode 100644 run/eval_tracking_3d.py create mode 100644 src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/tapp_16.proto create mode 100644 src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/vnn_tapp_16.proto diff --git a/run/eval_tracking_3d.py b/run/eval_tracking_3d.py new file mode 100644 index 0000000000..3275444c00 --- /dev/null +++ b/run/eval_tracking_3d.py @@ -0,0 +1,225 @@ +import sys +import os +import torch +import tqdm +import fire +from opendr.engine.target import BoundingBox3D +from opendr.engine.datasets import PointCloudsDatasetIterator +from opendr.perception.object_detection_3d import VoxelObjectDetection3DLearner +from opendr.perception.object_detection_3d import ( + LabeledPointCloudsDatasetIterator, +) +from opendr.perception.object_tracking_3d.datasets.kitti_tracking import LabeledTrackingPointCloudsDatasetIterator + +config_roots = { + "pointpillars": os.path.join( + ".", + "src", + "opendr", + "perception", + "object_detection_3d", + "voxel_object_detection_3d", + "second_detector", + "configs", + "pointpillars", + "car", + ), + "tanet": os.path.join( + ".", + "src", + "opendr", + "perception", + "object_detection_3d", + "voxel_object_detection_3d", + "second_detector", + "configs", + "tanet", + "car", + ), +} +temp_dir = "./run/models" +subsets_path = os.path.join( + ".", + "src", + "opendr", + "perception", + "object_detection_3d", + "datasets", + "kitti_subsets", +) + +dataset_tracking_path = "/data/sets/kitti_tracking" +datasets = {} + +all_track_ids = ["0000"] + +for track_id in all_track_ids: + datasets[track_id] = LabeledTrackingPointCloudsDatasetIterator( + dataset_tracking_path + "/training/velodyne/" + track_id, + dataset_tracking_path + "/training/label_02/" + track_id + ".txt", + dataset_tracking_path + "/training/calib/" + track_id + ".txt", + ) + + +def save_detection_inference( + model_type, + device="cuda:0", + name="pointpillars_car", + samples_list=[1], + config="xyres_16.proto", + eval_suffix="classic", +): + + config = os.path.join(config_roots[model_type], config,) + model_path = os.path.join(temp_dir, name) + + learner = VoxelObjectDetection3DLearner( + model_config_path=config, device=device, checkpoint_after_iter=1000, return_uncertainty=True + ) + learner.load(model_path) + + results = {} + for samples in samples_list: + print("samples =", samples) + + for track_id in all_track_ids: + + dataset = datasets[track_id] + + os.makedirs(os.path.join( + model_path, "tracking_inference_detections", eval_suffix, "samples_" + str(samples) + ), exist_ok=True) + + with open( + os.path.join( + model_path, + "tracking_inference_detections", + eval_suffix, + 
"samples_" + str(samples), + track_id + ".txt" + ), "w" + ) as f: + for frame, (input, _) in tqdm.tqdm(enumerate(dataset), total=len(dataset)): + output = learner.infer(input, samples=samples) + + result = "\n".join(box.to_kitti_tracking_string(frame) for box in output) + + if len(output) > 0: + result += "\n" + + f.write(result) + + + +def test_model( + model_type, + device="cuda:0", + name="pointpillars_car", + samples_list=[1], + config="xyres_16.proto", + eval_suffix="classic", +): + + return save_detection_inference( + model_type=model_type, + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + # config = os.path.join(config_roots[model_type], config,) + # model_path = os.path.join(temp_dir, name) + + # learner = VoxelObjectDetection3DLearner( + # model_config_path=config, device=device, checkpoint_after_iter=1000 + # ) + # learner.load(model_path) + + + +def test_pointpillars( + device="cuda:0", + name="pointpillars_car", + samples_list=[1], + config="xyres_16.proto", + eval_suffix="classic", +): + return test_model( + "pointpillars", + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + +def test_tanet( + device="cuda:0", + name="tanet_car", + samples_list=[1], + config="xyres_16.proto", + eval_suffix="classic", +): + return test_model( + "tanet", + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + +def test_vnn_pointpillars( + device="cuda:0", + name="vnn_pointpillars_car", + samples_list=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + config="vnn_xyres_16.proto", + eval_suffix="vnn", +): + return test_pointpillars( + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + +def test_vnna_pointpillars( + device="cuda:0", + name="vnna_pointpillars_car", + samples_list=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + config="vnna_xyres_16.proto", + eval_suffix="vnn", +): + return test_pointpillars( + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + +def test_vnn_tanet( + device="cuda:0", + name="vnn_tanet_car", + samples_list=[1, 2, 3, 4], + config="vnn_xyres_16.proto", + eval_suffix="vnn", +): + return test_model( + "tanet", + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + +if __name__ == "__main__": + fire.Fire() diff --git a/run/train_3d.py b/run/train_3d.py index 0d5c4f5b82..4be41991e3 100644 --- a/run/train_3d.py +++ b/run/train_3d.py @@ -64,6 +64,7 @@ def save_model( device=device, checkpoint_after_iter=1000, checkpoint_load_iter=load, + return_uncertainty=True ) learner.save(model_path) @@ -95,6 +96,7 @@ def train_model( device=device, checkpoint_after_iter=1000, checkpoint_load_iter=load, + return_uncertainty=True ) learner.fit( @@ -137,6 +139,23 @@ def train_tanet( ) +def train_tapp( + device="cuda:0", + load=0, + name="tapp_car", + config="tapp_16.proto", + samples_list=None, +): + return train_model( + "tanet", + device=device, + load=load, + name=name, + samples_list=samples_list, + config=config, + ) + + def train_vnn_pointpillars( device="cuda:0", load=0, @@ -178,6 +197,23 @@ def train_vnn_tanet( ) +def train_vnn_tapp( + device="cuda:0", + load=0, + name="vnn_tapp_car", + config="vnn_tapp_16.proto", + samples_list=[2, 1], +): + return train_model( + "tanet", + device=device, + load=load, + name=name, + samples_list=samples_list, + config=config, + ) + + def 
test_model( model_type, device="cuda:0", @@ -191,7 +227,7 @@ def test_model( model_path = os.path.join(temp_dir, name) learner = VoxelObjectDetection3DLearner( - model_config_path=config, device=device, checkpoint_after_iter=1000 + model_config_path=config, device=device, checkpoint_after_iter=1000, return_uncertainty=True ) learner.load(model_path) @@ -246,6 +282,23 @@ def test_tanet( ) +def test_tapp( + device="cuda:0", + name="tapp_car", + samples_list=[1], + config="tapp_16.proto", + eval_suffix="classic", +): + return test_model( + "tanet", + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + def test_vnn_pointpillars( device="cuda:0", name="vnn_pointpillars_car", @@ -294,5 +347,22 @@ def test_vnn_tanet( ) +def test_vnn_tapp( + device="cuda:0", + name="vnn_tapp_car", + samples_list=[1, 2, 3, 4], + config="vnn_tapp_16.proto", + eval_suffix="vnn", +): + return test_model( + "tanet", + device=device, + name=name, + samples_list=samples_list, + config=config, + eval_suffix=eval_suffix, + ) + + if __name__ == "__main__": fire.Fire() diff --git a/src/opendr/engine/target.py b/src/opendr/engine/target.py index 6b26fca636..95b1937ccb 100644 --- a/src/opendr/engine/target.py +++ b/src/opendr/engine/target.py @@ -675,18 +675,18 @@ class BoundingBox3D(Target): return result def to_kitti_tracking_string(self, frame): - result = " ".join([ + result = " ".join(map(str, [ frame, self.name, - self.truncated, - self.occluded, - self.alpha, - *self.bbox2d, # x y w h + float(self.truncated), + float(self.occluded), + float(self.alpha), + *([0,0,0,0] if self.bbox2d is None else self.bbox2d), # x y w h *self.dimensions, # w h l *self.location, # x y z self.rotation_y, - self.score, - ]) + float(self.confidence), + ])) return result @@ -1111,3 +1111,296 @@ class Heatmap(Target): :rtype: str """ return str(self.data) + + + +class UncertaintyBoundingBox3D(BoundingBox3D): + """ + This target is used for 3D Object Detection and Tracking with uncertainty. + A bounding box is described by its location (x, y, z), dimensions (w, h, d) and rotation (along vertical y axis). + Additional fields are used to describe confidence (score), 2D projection of the box on camera image (bbox2d), + truncation (truncated) and occlusion (occluded) levels, the name of an object (name) and + observation angle of an object (alpha). + UncertaintyBoundingBox3D provides uncertainty values for all regressed fields (this excludes name). 
+ """ + + def __init__( + self, + name, + truncated, + occluded, + alpha, + bbox2d, + dimensions, + location, + rotation_y, + variance_truncated, + variance_occluded, + variance_alpha, + variance_bbox2d, + variance_dimensions, + variance_location, + variance_rotation_y, + score=0, + variance_score=0, + ): + super().__init__( + name, + truncated, + occluded, + alpha, + bbox2d, + dimensions, + location, + rotation_y, + score=0, + ) + self.data.update({ + "variance_truncated": variance_truncated, + "variance_occluded": variance_occluded, + "variance_alpha": variance_alpha, + "variance_bbox2d": variance_bbox2d, + "variance_dimensions": variance_dimensions, + "variance_location": variance_location, + "variance_rotation_y": variance_rotation_y, + }) + self.confidence = score + self.variance_confidence = variance_score + + def kitti(self): + result = super().kitti() + + result["variance_truncated"] = np.array([self.data["variance_truncated"]]) + result["variance_occluded"] = np.array([self.data["variance_occluded"]]) + result["variance_alpha"] = np.array([self.data["variance_alpha"]]) + result["variance_bbox"] = np.array([self.data["variance_bbox2d"]]) + result["variance_dimensions"] = np.array([self.data["variance_dimensions"]]) + result["variance_location"] = np.array([self.data["variance_location"]]) + result["variance_rotation_y"] = np.array([self.data["variance_rotation_y"]]) + result["variance_score"] = np.array([self.variance_confidence]) + + return result + + def to_kitti_tracking_string(self, frame): + + result = super().to_kitti_tracking_string(frame) + + result += " ".join(map(str, [ + float(self.variance_truncated), + float(self.variance_occluded), + float(self.variance_alpha), + *([0,0,0,0] if self.variance_bbox2d is None else self.variance_bbox2d), # x y w h + *self.variance_dimensions, # w h l + *self.variance_location, # x y z + self.variance_rotation_y, + float(self.variance_confidence), + ])) + + return result + + @property + def name(self): + return self.data["name"] + + @property + def truncated(self): + return self.data["truncated"] + + @property + def occluded(self): + return self.data["occluded"] + + @property + def alpha(self): + return self.data["alpha"] + + @property + def bbox2d(self): + return self.data["bbox2d"] + + @property + def dimensions(self): + return self.data["dimensions"] + + @property + def location(self): + return self.data["location"] + + @property + def rotation_y(self): + return self.data["rotation_y"] + + @property + def variance_truncated(self): + return self.data["variance_truncated"] + + @property + def variance_occluded(self): + return self.data["variance_occluded"] + + @property + def variance_alpha(self): + return self.data["variance_alpha"] + + @property + def variance_bbox2d(self): + return self.data["variance_bbox2d"] + + @property + def variance_dimensions(self): + return self.data["variance_dimensions"] + + @property + def variance_location(self): + return self.data["variance_location"] + + @property + def variance_rotation_y(self): + return self.data["variance_rotation_y"] + + def __repr__(self): + return "UncertaintyBoundingBox3D " + str(self) + + def __str__(self): + return str(self.kitti()) + + +class UncertaintyBoundingBox3DList(Target): + """ + This target is used for 3D Object Detection with uncertainty. It contains a list of UncertaintyBoundingBox3D targets. + A bounding box is described by its location (x, y, z), dimensions (l, h, w) and rotation (along vertical (y) axis). 
+ Additional fields are used to describe confidence (score), 2D projection of the box on camera image (bbox2d), + truncation (truncated) and occlusion (occluded) levels, the name of an object (name) and + observation angle of an object (alpha). + UncertaintyBoundingBox3D provides uncertainty values for all regressed fields (this excludes name). + """ + + def __init__( + self, + bounding_boxes_3d + ): + super().__init__() + self.data = bounding_boxes_3d + self.confidence = None if len(self.data) == 0 else np.mean([box.confidence for box in self.data]) + self.variance_confidence = None if len(self.data) == 0 else np.mean([box.variance_confidence for box in self.data]) + + @staticmethod + def from_kitti(boxes_kitti): + + count = len(boxes_kitti["name"]) + + boxes3d = [] + + for i in range(count): + box3d = UncertaintyBoundingBox3D( + boxes_kitti["name"][i], + boxes_kitti["truncated"][i], + boxes_kitti["occluded"][i], + boxes_kitti["alpha"][i], + boxes_kitti["bbox"][i], + boxes_kitti["dimensions"][i], + boxes_kitti["location"][i], + boxes_kitti["rotation_y"][i], + boxes_kitti["variance_truncated"][i], + boxes_kitti["variance_occluded"][i], + boxes_kitti["variance_alpha"][i], + boxes_kitti["variance_bbox"][i], + boxes_kitti["variance_dimensions"][i], + boxes_kitti["variance_location"][i], + boxes_kitti["variance_rotation_y"][i], + boxes_kitti["score"][i], + boxes_kitti["variance_score"][i], + ) + + boxes3d.append(box3d) + + return UncertaintyBoundingBox3DList(boxes3d) + + def kitti(self): + + result = { + "name": [], + "truncated": [], + "occluded": [], + "alpha": [], + "bbox": [], + "dimensions": [], + "location": [], + "rotation_y": [], + "variance_truncated": [], + "variance_occluded": [], + "variance_alpha": [], + "variance_bbox": [], + "variance_dimensions": [], + "variance_location": [], + "variance_rotation_y": [], + "score": [], + "variance_score": [], + } + + if len(self.data) == 0: + return result + elif len(self.data) == 1: + return self.data[0].kitti() + else: + + for box in self.data: + result["name"].append(box.data["name"]) + result["truncated"].append(box.data["truncated"]) + result["occluded"].append(box.data["occluded"]) + result["alpha"].append(box.data["alpha"]) + result["bbox"].append(box.data["bbox2d"]) + result["dimensions"].append(box.data["dimensions"]) + result["location"].append(box.data["location"]) + result["rotation_y"].append(box.data["rotation_y"]) + result["variance_truncated"].append(box.data["variance_truncated"]) + result["variance_occluded"].append(box.data["variance_occluded"]) + result["variance_alpha"].append(box.data["variance_alpha"]) + result["variance_bbox"].append(box.data["variance_bbox2d"]) + result["variance_dimensions"].append(box.data["variance_dimensions"]) + result["variance_location"].append(box.data["variance_location"]) + result["variance_rotation_y"].append(box.data["variance_rotation_y"]) + result["score"].append(box.confidence) + result["variance_score"].append(box.variance_confidence) + + result["name"] = np.array(result["name"]) + result["truncated"] = np.array(result["truncated"]) + result["occluded"] = np.array(result["occluded"]) + result["alpha"] = np.array(result["alpha"]) + result["bbox"] = np.array(result["bbox"]) + result["dimensions"] = np.array(result["dimensions"]) + result["location"] = np.array(result["location"]) + result["rotation_y"] = np.array(result["rotation_y"]) + result["variance_truncated"] = np.array(result["truncated"]) + result["variance_occluded"] = np.array(result["occluded"]) + result["variance_alpha"] = 
np.array(result["alpha"]) + result["variance_bbox"] = np.array(result["bbox"]) + result["variance_dimensions"] = np.array(result["dimensions"]) + result["variance_location"] = np.array(result["location"]) + result["variance_rotation_y"] = np.array(result["rotation_y"]) + result["score"] = np.array(result["score"]) + + num_ground_truths = len(result["name"]) + num_objects = len([x for x in result["name"] if x != "DontCare"]) + index = list(range(num_objects)) + [-1] * (num_ground_truths - num_objects) + result["index"] = np.array(index, dtype=np.int32) + result["group_ids"] = np.arange(num_ground_truths, dtype=np.int32) + + return result + + @property + def boxes(self): + return self.data + + def __getitem__(self, idx): + return self.boxes[idx] + + def __len__(self): + return len(self.data) + + def __repr__(self): + return "UncertaintyBoundingBox3DList " + str(self) + + def __str__(self): + return str(self.kitti()) + diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/tapp_16.proto b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/tapp_16.proto new file mode 100644 index 0000000000..c947358ccb --- /dev/null +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/tapp_16.proto @@ -0,0 +1,173 @@ +model: { + second: { + voxel_generator { + point_cloud_range : [0, -39.68, -3, 69.12, 39.68, 1] + voxel_size : [0.16, 0.16, 4] + max_number_of_points_per_voxel : 100 + } + num_class: 1 + voxel_feature_extractor: { + module_class_name: "PillarFeature_TANet" + num_filters: [64] + with_distance: false + } + middle_feature_extractor: { + module_class_name: "PointPillarsScatter" + } + rpn: { + module_class_name: "RPN" + layer_nums: [3, 5, 5] + layer_strides: [2, 2, 2] + num_filters: [64, 128, 256] + upsample_strides: [1, 2, 4] + num_upsample_filters: [128, 128, 128] + use_groupnorm: false + num_groups: 32 + } + loss: { + classification_loss: { + weighted_sigmoid_focal: { + alpha: 0.25 + gamma: 2.0 + anchorwise_output: true + } + } + localization_loss: { + weighted_smooth_l1: { + sigma: 3.0 + code_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + } + } + classification_weight: 1.0 + localization_weight: 2.0 + } + # Outputs + use_sigmoid_score: true + encode_background_as_zeros: true + encode_rad_error_by_sin: true + + use_direction_classifier: true + direction_loss_weight: 0.2 + use_aux_classifier: false + # Loss + pos_class_weight: 1.0 + neg_class_weight: 1.0 + + loss_norm_type: NormByNumPositives + # Postprocess + post_center_limit_range: [0, -39.68, -5, 69.12, 39.68, 5] + use_rotate_nms: false + use_multi_class_nms: false + nms_pre_max_size: 1000 + nms_post_max_size: 300 + nms_score_threshold: 0.3 #0.05 + nms_iou_threshold: 0.1 #0.5 + + use_bev: false + num_point_features: 4 + without_reflectivity: false + box_coder: { + ground_box3d_coder: { + linear_dim: false + encode_angle_vector: false + } + } + target_assigner: { + anchor_generators: { + anchor_generator_stride: { + sizes: [1.6, 3.9, 1.56] # wlh + strides: [0.32, 0.32, 0.0] # if generate only 1 z_center, z_stride will be ignored + offsets: [0.16, -39.52, -1.78] # origin_offset + strides / 2 + rotations: [0, 1.57] # 0, pi/2 + matched_threshold : 0.6 + unmatched_threshold : 0.45 + } + } + + sample_positive_fraction : -1 + sample_size : 512 + region_similarity_calculator: { + nearest_iou_similarity: { + } + } + } + } +} + + +train_input_reader: { + record_file_path: "kitti_train.tfrecord" 
+ class_names: ["Car"] + max_num_epochs : 160 + batch_size: 1 + prefetch_size : 25 + max_number_of_voxels: 12000 + shuffle_points: true + num_workers: 2 + groundtruth_localization_noise_std: [0.25, 0.25, 0.25] + groundtruth_rotation_uniform_noise: [-0.15707963267, 0.15707963267] + global_rotation_uniform_noise: [-0.78539816, 0.78539816] + global_scaling_uniform_noise: [0.95, 1.05] + global_random_rotation_range_per_object: [0, 0] + anchor_area_threshold: 1 + remove_points_after_sample: false + groundtruth_points_drop_percentage: 0.0 + groundtruth_drop_max_keep_points: 15 + database_sampler { + database_info_path: "kitti_dbinfos_train.pkl" + sample_groups { + name_to_max_num { + key: "Car" + value: 15 + } + } + database_prep_steps { + filter_by_min_num_points { + min_num_point_pairs { + key: "Car" + value: 5 + } + } + } + database_prep_steps { + filter_by_difficulty { + removed_difficulties: [-1] + } + } + global_random_rotation_range_per_object: [0, 0] + rate: 1.0 + } + + remove_unknown_examples: false + remove_environment: false + kitti_info_path: "kitti_infos_train.pkl" + kitti_root_path: "" +} + +train_config: { + + inter_op_parallelism_threads: 4 + intra_op_parallelism_threads: 4 + steps: 296960 # 1856 steps per epoch * 160 epochs + steps_per_eval: 9280 # 1856 steps per epoch * 5 epochs + save_checkpoints_secs : 1800 # half hour + save_summary_steps : 10 + enable_mixed_precision: false + loss_scale_factor : 512.0 + clear_metrics_every_epoch: false +} + +eval_input_reader: { + record_file_path: "kitti_val.tfrecord" + class_names: ["Car"] + batch_size: 1 + max_num_epochs : 160 + prefetch_size : 25 + max_number_of_voxels: 12000 + shuffle_points: false + num_workers: 1 + anchor_area_threshold: 1 + remove_environment: false + kitti_info_path: "kitti_infos_val.pkl" + kitti_root_path: "" +} diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/vnn_tapp_16.proto b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/vnn_tapp_16.proto new file mode 100644 index 0000000000..23c5baae5b --- /dev/null +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/configs/tanet/car/vnn_tapp_16.proto @@ -0,0 +1,173 @@ +model: { + second: { + voxel_generator { + point_cloud_range : [0, -39.68, -3, 69.12, 39.68, 1] + voxel_size : [0.16, 0.16, 4] + max_number_of_points_per_voxel : 100 + } + num_class: 1 + voxel_feature_extractor: { + module_class_name: "PillarFeature_TANet" + num_filters: [64] + with_distance: false + } + middle_feature_extractor: { + module_class_name: "PointPillarsScatter" + } + rpn: { + module_class_name: "VRPN" + layer_nums: [3, 5, 5] + layer_strides: [2, 2, 2] + num_filters: [64, 128, 256] + upsample_strides: [1, 2, 4] + num_upsample_filters: [128, 128, 128] + use_groupnorm: false + num_groups: 32 + } + loss: { + classification_loss: { + weighted_sigmoid_focal: { + alpha: 0.25 + gamma: 2.0 + anchorwise_output: true + } + } + localization_loss: { + weighted_smooth_l1: { + sigma: 3.0 + code_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + } + } + classification_weight: 1.0 + localization_weight: 2.0 + } + # Outputs + use_sigmoid_score: true + encode_background_as_zeros: true + encode_rad_error_by_sin: true + + use_direction_classifier: true + direction_loss_weight: 0.2 + use_aux_classifier: false + # Loss + pos_class_weight: 1.0 + neg_class_weight: 1.0 + + loss_norm_type: NormByNumPositives + # Postprocess + post_center_limit_range: [0, -39.68, -5, 
69.12, 39.68, 5] + use_rotate_nms: false + use_multi_class_nms: false + nms_pre_max_size: 1000 + nms_post_max_size: 300 + nms_score_threshold: 0.3 #0.05 + nms_iou_threshold: 0.1 #0.5 + + use_bev: false + num_point_features: 4 + without_reflectivity: false + box_coder: { + ground_box3d_coder: { + linear_dim: false + encode_angle_vector: false + } + } + target_assigner: { + anchor_generators: { + anchor_generator_stride: { + sizes: [1.6, 3.9, 1.56] # wlh + strides: [0.32, 0.32, 0.0] # if generate only 1 z_center, z_stride will be ignored + offsets: [0.16, -39.52, -1.78] # origin_offset + strides / 2 + rotations: [0, 1.57] # 0, pi/2 + matched_threshold : 0.6 + unmatched_threshold : 0.45 + } + } + + sample_positive_fraction : -1 + sample_size : 512 + region_similarity_calculator: { + nearest_iou_similarity: { + } + } + } + } +} + + +train_input_reader: { + record_file_path: "kitti_train.tfrecord" + class_names: ["Car"] + max_num_epochs : 160 + batch_size: 1 + prefetch_size : 25 + max_number_of_voxels: 12000 + shuffle_points: true + num_workers: 2 + groundtruth_localization_noise_std: [0.25, 0.25, 0.25] + groundtruth_rotation_uniform_noise: [-0.15707963267, 0.15707963267] + global_rotation_uniform_noise: [-0.78539816, 0.78539816] + global_scaling_uniform_noise: [0.95, 1.05] + global_random_rotation_range_per_object: [0, 0] + anchor_area_threshold: 1 + remove_points_after_sample: false + groundtruth_points_drop_percentage: 0.0 + groundtruth_drop_max_keep_points: 15 + database_sampler { + database_info_path: "kitti_dbinfos_train.pkl" + sample_groups { + name_to_max_num { + key: "Car" + value: 15 + } + } + database_prep_steps { + filter_by_min_num_points { + min_num_point_pairs { + key: "Car" + value: 5 + } + } + } + database_prep_steps { + filter_by_difficulty { + removed_difficulties: [-1] + } + } + global_random_rotation_range_per_object: [0, 0] + rate: 1.0 + } + + remove_unknown_examples: false + remove_environment: false + kitti_info_path: "kitti_infos_train.pkl" + kitti_root_path: "" +} + +train_config: { + + inter_op_parallelism_threads: 4 + intra_op_parallelism_threads: 4 + steps: 296960 # 1856 steps per epoch * 160 epochs + steps_per_eval: 9280 # 1856 steps per epoch * 5 epochs + save_checkpoints_secs : 1800 # half hour + save_summary_steps : 10 + enable_mixed_precision: false + loss_scale_factor : 512.0 + clear_metrics_every_epoch: false +} + +eval_input_reader: { + record_file_path: "kitti_val.tfrecord" + class_names: ["Car"] + batch_size: 1 + max_num_epochs : 160 + prefetch_size : 25 + max_number_of_voxels: 12000 + shuffle_points: false + num_workers: 1 + anchor_area_threshold: 1 + remove_environment: false + kitti_info_path: "kitti_infos_val.pkl" + kitti_root_path: "" +} diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/data/kitti_common.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/data/kitti_common.py index c074dcd736..9d15a92a9a 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/data/kitti_common.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/data/kitti_common.py @@ -648,7 +648,7 @@ def get_pseudo_label_anno(): return annotations -def get_start_result_anno(): +def get_start_result_anno(return_uncertainty): annotations = {} annotations.update({ "name": [], @@ -661,6 +661,18 @@ def get_start_result_anno(): "rotation_y": [], "score": [], }) + + if return_uncertainty: + annotations.update({ + 
"variance_truncated": [], + "variance_occluded": [], + "variance_alpha": [], + "variance_bbox": [], + "variance_dimensions": [], + "variance_location": [], + "variance_rotation_y": [], + "variance_score": [], + }) return annotations diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_coders.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_coders.py index 0c3da0d1c2..d4d413394d 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_coders.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_coders.py @@ -18,6 +18,10 @@ class GroundBox3dCoderTorch(GroundBox3dCoder): return box_torch_ops.second_box_decode( boxes, anchors, self.vec_encode, self.linear_dim ) + def decode_torch_uncertainty(self, boxes_var, boxes_mean, anchors): + return box_torch_ops.second_box_decode_uncertainty( + boxes_var, boxes_mean, anchors, self.vec_encode, self.linear_dim + ) class BevBoxCoderTorch(BevBoxCoder): diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_torch_ops.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_torch_ops.py index 050ea5ae69..c36bea8317 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_torch_ops.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/core/box_torch_ops.py @@ -102,6 +102,61 @@ def second_box_decode(box_encodings, return torch.cat([xg, yg, zg, wg, lg, hg, rg], dim=-1) + +def second_box_decode_uncertainty( + box_var_encodings, + box_mean_encodings, + anchors, + encode_angle_to_vector=False, + smooth_dim=False): + """box decode for VoxelNet in lidar + Args: + boxes ([N, 7] Tensor): normal boxes: x, y, z, w, l, h, r + anchors ([N, 7] Tensor): anchors + """ + xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) + if encode_angle_to_vector: + xt, yt, zt, wt, lt, ht, rtx, rty = torch.split( + box_var_encodings, + 1, + dim=-1 + ) + mxt, myt, mzt, mwt, mlt, mht, mrtx, mrty = torch.split( + box_mean_encodings, + 1, + dim=-1 + ) + + else: + xt, yt, zt, wt, lt, ht, rt = torch.split(box_var_encodings, 1, dim=-1) + mxt, myt, mzt, mwt, mlt, mht, mrt = torch.split(box_mean_encodings, 1, dim=-1) + + def exp_var(m, s): + return (torch.exp(s ** 2) - 1) * (torch.exp(2 * m + s ** 2)) + + za = za + ha / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xg = xt * diagonal ** 2 + yg = yt * diagonal ** 2 + zg = zt * ha ** 2 + if smooth_dim: + lg = lt * la ** 2 + wg = wt * wa ** 2 + hg = ht * ha ** 2 + else: + lg = exp_var(mlt, lt) * la ** 2 + wg = exp_var(mwt, wt) * wa ** 2 + hg = exp_var(mht, ht) * ha ** 2 + if encode_angle_to_vector: + rgx = rtx + rgy = rty + rg = torch.atan2(rgy, rgx) + else: + rg = rt + return torch.cat([xg, yg, zg, wg, lg, hg, rg], dim=-1) + + + def bev_box_encode(boxes, anchors, encode_angle_to_vector=False, @@ -372,6 +427,16 @@ def project_to_image(points_3d, proj_mat): return point_2d_res +def uncertainty_project_to_image(points_3d, proj_mat): + points_num = list(points_3d.shape)[:-1] + points_shape = np.concatenate([points_num, [1]], axis=0).tolist() + points_4 = torch.cat( + [points_3d, torch.zeros(*points_shape).type_as(points_3d)], dim=-1) + point_2d = points_4 @ (proj_mat.t() ** 2) + point_2d_res = point_2d[..., :2] / 
point_2d[..., 2:3] + return point_2d_res + + def camera_to_lidar(points, r_rect, velo2cam): num_points = points.shape[0] points = torch.cat( @@ -380,6 +445,17 @@ def camera_to_lidar(points, r_rect, velo2cam): return lidar_points[..., :3] +def uncertainty_camera_to_lidar(points, r_rect, velo2cam): + num_points = points.shape[0] + points = torch.cat( + [points, torch.ones(num_points, 1).type_as(points)], dim=-1) + + M = torch.inverse((r_rect @ velo2cam).t()) + + lidar_points = points @ (M ** 2) + return lidar_points[..., :3] + + def lidar_to_camera(points, r_rect, velo2cam): num_points = points.shape[0] points = torch.cat( @@ -396,6 +472,14 @@ def box_camera_to_lidar(data, r_rect, velo2cam): return torch.cat([xyz_lidar, w, l, h, r], dim=-1) +def uncertainty_box_camera_to_lidar(data, r_rect, velo2cam): + xyz = data[..., 0:3] + l, h, w = data[..., 3:4], data[..., 4:5], data[..., 5:6] + r = data[..., 6:7] + xyz_lidar = uncertainty_camera_to_lidar(xyz, r_rect, velo2cam) + return torch.cat([xyz_lidar, w, l, h, r], dim=-1) + + def box_lidar_to_camera(data, r_rect, velo2cam): xyz_lidar = data[..., 0:3] w, l, h = data[..., 3:4], data[..., 4:5], data[..., 5:6] diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/tanet.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/tanet.py index b4d2bf2c8b..1131da76dd 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/tanet.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/tanet.py @@ -930,7 +930,7 @@ class VPSA(nn.Module): all_box_preds = [] all_cls_preds = [] all_dir_cls_preds = [] - all_refine_box_preds = [] + all_refine_loc_preds = [] all_refine_cls_preds = [] all_refine_dir_cls_preds = [] @@ -994,8 +994,8 @@ class VPSA(nn.Module): refine_cls_preds_local = self.refine_cls(PSA_output) refine_loc_preds_local = self.refine_loc(PSA_output) - all_refine_box_preds.append(refine_cls_preds_local) - all_refine_cls_preds.append(refine_loc_preds_local) + all_refine_loc_preds.append(refine_loc_preds_local) + all_refine_cls_preds.append(refine_cls_preds_local) if self._use_direction_classifier: refine_dir_preds_local = self.refine_dir(PSA_output) @@ -1021,8 +1021,8 @@ class VPSA(nn.Module): torch.stack(all_cls_preds, dim=0), dim=0, unbiased=False ) - refine_box_preds_var, refine_box_preds = torch.var_mean( - torch.stack(all_refine_box_preds, dim=0), dim=0, unbiased=False + refine_loc_preds_var, refine_loc_preds = torch.var_mean( + torch.stack(all_refine_loc_preds, dim=0), dim=0, unbiased=False ) refine_cls_preds_var, refine_cls_preds = torch.var_mean( torch.stack(all_refine_cls_preds, dim=0), dim=0, unbiased=False @@ -1033,9 +1033,9 @@ class VPSA(nn.Module): cls_preds = cls_preds.permute(0, 2, 3, 1).contiguous() box_preds_var = box_preds_var.permute(0, 2, 3, 1).contiguous() cls_preds_var = cls_preds_var.permute(0, 2, 3, 1).contiguous() - refine_box_preds = refine_box_preds.permute(0, 2, 3, 1).contiguous() + refine_loc_preds = refine_loc_preds.permute(0, 2, 3, 1).contiguous() refine_cls_preds = refine_cls_preds.permute(0, 2, 3, 1).contiguous() - refine_box_preds_var = refine_box_preds_var.permute(0, 2, 3, 1).contiguous() + refine_loc_preds_var = refine_loc_preds_var.permute(0, 2, 3, 1).contiguous() refine_cls_preds_var = refine_cls_preds_var.permute(0, 2, 3, 1).contiguous() ret_dict = { @@ -1043,9 +1043,9 @@ class VPSA(nn.Module): "cls_preds": 
cls_preds, "box_preds_var": box_preds_var, "cls_preds_var": cls_preds_var, - "Refine_box_preds": refine_box_preds, + "Refine_loc_preds": refine_loc_preds, "Refine_cls_preds": refine_cls_preds, - "Refine_box_preds_var": refine_box_preds_var, + "Refine_loc_preds_var": refine_loc_preds_var, "Refine_cls_preds_var": refine_cls_preds_var, } diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/voxelnet.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/voxelnet.py index bbee7d580b..cda9f04d66 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/voxelnet.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/pytorch/models/voxelnet.py @@ -1100,7 +1100,7 @@ class VoxelNet(nn.Module): def get_global_step(self): return int(self.global_step.cpu().numpy()[0]) - def forward(self, example, refine_weight=2, samples=1): + def forward(self, example, refine_weight=2, samples=1, return_uncertainty=False): """module's forward should always accept dict and return loss. """ voxels = example["voxels"] @@ -1303,18 +1303,21 @@ class VoxelNet(nn.Module): "cared": cared, } else: - if self.rpn_class_name == "PSA" or self.rpn_class_name == "RefineDet": - coarse_output = self.predict_coarse(example, preds_dict, self.device) - refine_output = self.predict_refine(example, preds_dict, self.device) + if self.rpn_class_name in ["PSA", "VPSA", "RefineDet"]: + coarse_output = self.predict_coarse(example, preds_dict, self.device, return_uncertainty=return_uncertainty) + refine_output = self.predict_refine(example, preds_dict, self.device, return_uncertainty=return_uncertainty) return coarse_output, refine_output else: - return self.predict_coarse(example, preds_dict, self.device) + return self.predict_coarse(example, preds_dict, self.device, return_uncertainty=return_uncertainty) def compute_predict( self, batch_box_preds, batch_cls_preds, batch_dir_preds, + batch_box_preds_var, + batch_cls_preds_var, + batch_dir_preds_var, batch_rect, batch_Trv2c, batch_P2, @@ -1322,12 +1325,16 @@ class VoxelNet(nn.Module): batch_anchors_mask, num_class_with_bg, device, + return_uncertainty, ): predictions_dicts = [] for ( box_preds, cls_preds, dir_preds, + box_preds_var, + cls_preds_var, + dir_preds_var, rect, Trv2c, P2, @@ -1337,6 +1344,9 @@ class VoxelNet(nn.Module): batch_box_preds, batch_cls_preds, batch_dir_preds, + batch_box_preds_var, + batch_cls_preds_var, + batch_dir_preds_var, batch_rect, batch_Trv2c, batch_P2, @@ -1350,9 +1360,16 @@ class VoxelNet(nn.Module): a_mask = a_mask.bool() box_preds = box_preds[a_mask] cls_preds = cls_preds[a_mask] + + if return_uncertainty: + box_preds_var = box_preds_var[a_mask] + cls_preds_var = cls_preds_var[a_mask] if self._use_direction_classifier: if a_mask is not None: dir_preds = dir_preds[a_mask] + + if return_uncertainty: + dir_preds_var = dir_preds_var[a_mask] dir_labels = torch.max(dir_preds, dim=-1)[1] if self._encode_background_as_zeros: # this don't support softmax @@ -1398,27 +1415,45 @@ class VoxelNet(nn.Module): ) selected_boxes, selected_labels, selected_scores = [], [], [] selected_dir_labels = [] + selected_boxes_var = [] + selected_cls_preds_var = [] + selected_dir_preds_var = [] for i, selected in enumerate(selected_per_class): if selected is not None: num_dets = selected.shape[0] selected_boxes.append(box_preds[selected]) selected_labels.append( torch.full([num_dets], i, 
dtype=torch.int64)) + selected_boxes.append(box_preds[selected]) if self._use_direction_classifier: selected_dir_labels.append(dir_labels[selected]) selected_scores.append(total_scores[selected, i]) + + if return_uncertainty: + selected_boxes_var.append(box_preds_var[selected]) + selected_dir_preds_var.append(dir_preds_var[selected]) + selected_cls_preds_var.append(cls_preds_var[selected]) if len(selected_boxes) > 0: selected_boxes = torch.cat(selected_boxes, dim=0) selected_labels = torch.cat(selected_labels, dim=0) selected_scores = torch.cat(selected_scores, dim=0) + if return_uncertainty: + selected_boxes_var = torch.cat(selected_boxes_var, dim=0) + selected_cls_preds_var = torch.cat(selected_cls_preds_var, dim=0) if self._use_direction_classifier: - selected_dir_labels = torch.cat(selected_dir_labels, - dim=0) + selected_dir_labels = torch.cat( + selected_dir_labels, + dim=0 + ) + if return_uncertainty: + selected_dir_preds_var = torch.cat(selected_dir_preds_var, dim=0) else: selected_boxes = None selected_labels = None selected_scores = None - selected_dir_labels = None + selected_boxes_var = None + selected_cls_preds_var = None + selected_dir_preds_var = None else: # get highest score per prediction, than apply nms # to remove overlapped box. @@ -1470,12 +1505,20 @@ class VoxelNet(nn.Module): selected_dir_labels = dir_labels[selected] selected_labels = top_labels[selected] selected_scores = top_scores[selected] + if return_uncertainty: + selected_boxes_var = box_preds_var[selected] + selected_cls_preds_var = cls_preds_var[selected] + selected_dir_preds_var = dir_preds_var[selected] # finally generate predictions. if selected_boxes is not None: box_preds = selected_boxes scores = selected_scores label_preds = selected_labels + if return_uncertainty: + box_preds_var = selected_boxes_var + cls_preds_var = selected_cls_preds_var + dir_preds_var = selected_dir_preds_var if self._use_direction_classifier: dir_labels = selected_dir_labels opp_labels = dir_labels.byte() ^ ( @@ -1488,10 +1531,18 @@ class VoxelNet(nn.Module): ) final_box_preds = box_preds final_scores = scores + if return_uncertainty: + final_box_preds_var_camera = None + box_2d_preds_var = None if is_calib: final_box_preds_camera = box_torch_ops.box_lidar_to_camera( final_box_preds, rect, Trv2c) + + if return_uncertainty: + final_box_preds_var_camera = box_torch_ops.uncertainty_box_camera_to_lidar( + box_preds_var, rect, Trv2c + ) locs = final_box_preds_camera[:, :3] dims = final_box_preds_camera[:, 3:6] angles = final_box_preds_camera[:, 6] @@ -1504,31 +1555,42 @@ class VoxelNet(nn.Module): minxy = torch.min(box_corners_in_image, dim=1)[0] maxxy = torch.max(box_corners_in_image, dim=1)[0] box_2d_preds = torch.cat([minxy, maxxy], dim=1) + + if return_uncertainty: + box_2d_preds_var = torch.zeros_like(box_2d_preds) else: box_2d_preds = None final_box_preds_camera = None # predictions predictions_dict = { "bbox": box_2d_preds, + "bbox_var": box_2d_preds_var if return_uncertainty else None, "box3d_camera": final_box_preds_camera, + "box3d_camera_var": final_box_preds_var_camera if return_uncertainty else None, "box3d_lidar": final_box_preds, + "box3d_lidar_var": box_preds_var if return_uncertainty else None, "scores": final_scores, + "scores_var": cls_preds_var if return_uncertainty else None, "label_preds": label_preds, "image_idx": img_idx, } else: predictions_dict = { "bbox": None, + "bbox_var": None, "box3d_camera": None, + "box3d_camera_var": None, "box3d_lidar": None, + "box3d_lidar_var": None, "scores": None, + 
"scores_var": None, "label_preds": None, "image_idx": img_idx, } predictions_dicts.append(predictions_dict) return predictions_dicts - def predict_coarse(self, example, preds_dict, device): + def predict_coarse(self, example, preds_dict, device, return_uncertainty): t = time.time() batch_size = example["anchors"].shape[0] batch_anchors = example["anchors"].view(batch_size, -1, 7) @@ -1556,38 +1618,79 @@ class VoxelNet(nn.Module): t = time.time() batch_box_preds = preds_dict["box_preds"] batch_cls_preds = preds_dict["cls_preds"] - batch_box_preds = batch_box_preds.view(batch_size, -1, - self._box_coder.code_size) + batch_box_preds = batch_box_preds.view( + batch_size, -1, + self._box_coder.code_size + ) + + batch_box_preds_var = [None] * batch_size + batch_cls_preds_var = [None] * batch_size + + if return_uncertainty: + batch_box_preds_var = preds_dict["box_preds_var"] + batch_cls_preds_var = preds_dict["cls_preds_var"] + batch_box_preds_var = batch_box_preds_var.view( + batch_size, -1, + self._box_coder.code_size + ) num_class_with_bg = self._num_class if not self._encode_background_as_zeros: num_class_with_bg = self._num_class + 1 - batch_cls_preds = batch_cls_preds.view(batch_size, -1, - num_class_with_bg) - batch_box_preds = self._box_coder.decode_torch(batch_box_preds, - batch_anchors) + if return_uncertainty: + batch_cls_preds_var = batch_cls_preds_var.view( + batch_size, -1, + num_class_with_bg + ) + batch_box_preds_var = self._box_coder.decode_torch_uncertainty( + batch_box_preds_var, + batch_box_preds, + batch_anchors + ) + + batch_cls_preds = batch_cls_preds.view( + batch_size, -1, + num_class_with_bg + ) + + batch_box_preds = self._box_coder.decode_torch( + batch_box_preds, + batch_anchors + ) + if self._use_direction_classifier: batch_dir_preds = preds_dict["dir_cls_preds"] batch_dir_preds = batch_dir_preds.view(batch_size, -1, 2) + + batch_dir_preds_var = [None] * batch_size + + if return_uncertainty: + batch_dir_preds_var = preds_dict["dir_cls_preds_var"] + batch_dir_preds_var = batch_dir_preds_var.view(batch_size, -1, 2) else: batch_dir_preds = [None] * batch_size + batch_dir_preds_var = [None] * batch_size predictions_dicts = self.compute_predict( batch_box_preds, batch_cls_preds, batch_dir_preds, + batch_box_preds_var, + batch_cls_preds_var, + batch_dir_preds_var, batch_rect, batch_Trv2c, batch_P2, batch_imgidx, batch_anchors_mask, num_class_with_bg, - device=device + device=device, + return_uncertainty=return_uncertainty, ) self._total_postprocess_time += time.time() - t return predictions_dicts - def predict_refine(self, example, preds_dict, device): + def predict_refine(self, example, preds_dict, device, return_uncertainty): t = time.time() batch_size = example["anchors"].shape[0] batch_anchors = example["anchors"].view(batch_size, -1, 7) @@ -1623,39 +1726,89 @@ class VoxelNet(nn.Module): refine_box_preds = preds_dict["Refine_loc_preds"] refine_cls_preds = preds_dict["Refine_cls_preds"] - coarse_box_preds = coarse_box_preds.view(batch_size, -1, - self._box_coder.code_size) + refine_box_preds_var = [None] * batch_size + refine_cls_preds_var = [None] * batch_size - refine_box_preds = refine_box_preds.view(batch_size, -1, - self._box_coder.code_size) + if return_uncertainty: + refine_box_preds_var = preds_dict["Refine_loc_preds_var"] + refine_cls_preds_var = preds_dict["Refine_cls_preds_var"] - de_coarse_boxes = self._box_coder.decode_torch(coarse_box_preds, - batch_anchors) - de_refine_boxes = self._box_coder.decode_torch(refine_box_preds, - de_coarse_boxes) + 
coarse_box_preds = coarse_box_preds.view( + batch_size, -1, + self._box_coder.code_size + ) + + refine_box_preds = refine_box_preds.view( + batch_size, -1, + self._box_coder.code_size + ) + + if return_uncertainty: + refine_box_preds_var = refine_box_preds_var.view( + batch_size, -1, + self._box_coder.code_size + ) + + de_coarse_boxes = self._box_coder.decode_torch( + coarse_box_preds, + batch_anchors + ) + de_refine_boxes = self._box_coder.decode_torch( + refine_box_preds, + de_coarse_boxes + ) + + if return_uncertainty: + de_refine_boxes_var = self._box_coder.decode_torch_uncertainty( + refine_box_preds_var, + refine_box_preds, + de_coarse_boxes + ) batch_box_preds = de_refine_boxes batch_cls_preds = refine_cls_preds - batch_cls_preds = batch_cls_preds.view(batch_size, -1, - num_class_with_bg) + batch_cls_preds = batch_cls_preds.view( + batch_size, -1, + num_class_with_bg) + + batch_cls_preds_var = [None] * batch_size + batch_box_preds_var = [None] * batch_size + + if return_uncertainty: + batch_cls_preds_var = refine_cls_preds_var + batch_box_preds_var = de_refine_boxes_var + batch_cls_preds_var = batch_cls_preds_var.view( + batch_size, -1, + num_class_with_bg) if self._use_direction_classifier: batch_dir_preds = preds_dict["Refine_dir_preds"] batch_dir_preds = batch_dir_preds.view(batch_size, -1, 2) + + batch_dir_preds_var = [None] * batch_size + + if return_uncertainty: + batch_dir_preds_var = preds_dict["Refine_dir_preds_var"] + batch_dir_preds_var = batch_dir_preds_var.view(batch_size, -1, 2) else: batch_dir_preds = [None] * batch_size + batch_dir_preds_var = [None] * batch_size predictions_dicts = self.compute_predict( batch_box_preds, batch_cls_preds, batch_dir_preds, + batch_box_preds_var, + batch_cls_preds_var, + batch_dir_preds_var, batch_rect, batch_Trv2c, batch_P2, batch_imgidx, batch_anchors_mask, num_class_with_bg, - device=device + device=device, + return_uncertainty=return_uncertainty, ) self._total_postprocess_time += time.time() - t return predictions_dicts diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/run.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/run.py index fa5c98f5da..6a1403982f 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/run.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/second_detector/run.py @@ -606,6 +606,7 @@ def compute_lidar_kitti_output( center_limit_range, class_names, global_set, + return_uncertainty, ): annos = [] for i, preds_dict in enumerate(predictions_dicts): @@ -613,10 +614,18 @@ def compute_lidar_kitti_output( scores = preds_dict["scores"].detach().cpu().numpy() box_preds_lidar = preds_dict["box3d_lidar"].detach().cpu().numpy() label_preds = preds_dict["label_preds"].detach().cpu().numpy() - anno = kitti.get_start_result_anno() + + if return_uncertainty: + scores_var = preds_dict["scores_var"].detach().cpu().numpy() + box_preds_lidar_var = preds_dict["box3d_lidar_var"].detach().cpu().numpy() + else: + scores_var = np.empty_like(scores) + box_preds_lidar_var = np.empty_like(box_preds_lidar) + + anno = kitti.get_start_result_anno(return_uncertainty=return_uncertainty) num_example = 0 - for box_lidar, score, label in zip( - box_preds_lidar, scores, label_preds + for box_lidar, box_lidar_var, score, score_var, label in zip( + box_preds_lidar, box_preds_lidar_var, scores, scores_var, label_preds ): if center_limit_range is not None: limit_range = np.array(center_limit_range) 
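Note (not part of the patch): the hunk that follows extends the per-frame KITTI annotation dict with the variance_* fields when return_uncertainty is set. A minimal sketch of such a dict, in the shape that UncertaintyBoundingBox3DList.from_kitti (defined earlier in this patch) consumes; all numeric values are invented for illustration.

    import numpy as np

    # One-detection annotation dict carrying the variance_* fields added by this
    # patch. Values are placeholders; variance_bbox is left as None because no
    # image-plane box is regressed from lidar-only output.
    anno = {
        "name": np.array(["Car"]),
        "truncated": np.array([0.0]),
        "occluded": np.array([0]),
        "alpha": np.array([0.0]),
        "bbox": np.array([[0.0, 0.0, 50.0, 50.0]]),
        "dimensions": np.array([[1.6, 1.5, 3.9]]),
        "location": np.array([[10.0, 1.0, 20.0]]),
        "rotation_y": np.array([0.1]),
        "score": np.array([0.9]),
        "variance_truncated": np.array([0.0]),
        "variance_occluded": np.array([0.0]),
        "variance_alpha": np.array([0.0]),
        "variance_bbox": [None],
        "variance_dimensions": np.array([[0.01, 0.02, 0.05]]),
        "variance_location": np.array([[0.03, 0.01, 0.04]]),
        "variance_rotation_y": np.array([0.002]),
        "variance_score": np.array([0.001]),
    }

    # boxes = UncertaintyBoundingBox3DList.from_kitti(anno)  # opendr.engine.target
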
@@ -631,6 +640,18 @@ def compute_lidar_kitti_output( anno["dimensions"].append(box_lidar[3:6]) anno["location"].append(box_lidar[:3]) anno["rotation_y"].append(box_lidar[6]) + anno["score"].append(score) + + if return_uncertainty: + anno["variance_truncated"].append(0.0) + anno["variance_occluded"].append(0) + anno["variance_alpha"].append(0) + anno["variance_bbox"].append(None) + anno["variance_dimensions"].append(box_lidar_var[3:6]) + anno["variance_location"].append(box_lidar_var[:3]) + anno["variance_rotation_y"].append(box_lidar_var[6]) + anno["variance_score"].append(np.mean(score_var)) + if global_set is not None: for i in range(100000): if score in global_set: @@ -638,7 +659,6 @@ def compute_lidar_kitti_output( else: global_set.add(score) break - anno["score"].append(score) num_example += 1 if num_example != 0: diff --git a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/voxel_object_detection_3d_learner.py b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/voxel_object_detection_3d_learner.py index 85c5a401d3..9bd003712c 100644 --- a/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/voxel_object_detection_3d_learner.py +++ b/src/opendr/perception/object_detection_3d/voxel_object_detection_3d/voxel_object_detection_3d_learner.py @@ -25,7 +25,7 @@ from opendr.engine.datasets import ( ExternalDataset, MappedDatasetIterator, ) -from opendr.engine.data import PointCloud +from opendr.engine.data import PointCloud, PointCloudWithCalibration from opendr.perception.object_detection_3d.voxel_object_detection_3d.second_detector.load import ( create_model as second_create_model, load_from_checkpoint, @@ -55,7 +55,7 @@ from opendr.perception.object_detection_3d.voxel_object_detection_3d.second_dete from opendr.perception.object_detection_3d.voxel_object_detection_3d.second_detector.data.preprocess import ( merge_second_batch, ) -from opendr.engine.target import BoundingBox3DList +from opendr.engine.target import BoundingBox3DList, UncertaintyBoundingBox3DList from opendr.engine.constants import OPENDR_SERVER_URL from urllib.request import urlretrieve from urllib.error import URLError @@ -97,6 +97,7 @@ class VoxelObjectDetection3DLearner(Learner): "decay_factor": 0.8, "staircase": True, }, + return_uncertainty=False, ): # Pass the shared parameters on super's constructor so they can get initialized as class attributes super(VoxelObjectDetection3DLearner, self).__init__( @@ -122,6 +123,7 @@ class VoxelObjectDetection3DLearner(Learner): self.model_dir = None self.eval_checkpoint_dir = None self.infer_point_cloud_mapper = None + self.calib_infer_point_cloud_mapper = None if tanet_config_path is not None: set_tanet_config(tanet_config_path) @@ -131,6 +133,7 @@ class VoxelObjectDetection3DLearner(Learner): self.model.rpn_ort_session = None # ONNX runtime inference session self.input_config_prepared = False self.eval_config_prepared = False + self.return_uncertainty = return_uncertainty def save(self, path, verbose=False): """ @@ -431,7 +434,7 @@ class VoxelObjectDetection3DLearner(Learner): return result - def infer(self, point_clouds): + def infer(self, point_clouds, samples=1): if self.model is None: raise ValueError("No model loaded or created") @@ -452,33 +455,72 @@ class VoxelObjectDetection3DLearner(Learner): self.infer_point_cloud_mapper = infer_point_cloud_mapper self.model.eval() + if self.calib_infer_point_cloud_mapper is None: + + def create_map_point_cloud_dataset_func(): + + prep_func = create_prep_func( + self.input_config, + 
self.model_config, + False, + self.voxel_generator, + self.target_assigner, + use_sampler=False, + ) + + def map(point_cloud_with_calibration): + + point_cloud = point_cloud_with_calibration.data + calib = point_cloud_with_calibration.calib + + example = _prep_v9(point_cloud, calib, prep_func) + + if point_cloud_with_calibration.image_shape is not None: + example["image_shape"] = point_cloud_with_calibration.image_shape + + return example + + return map + + self.calib_infer_point_cloud_mapper = create_map_point_cloud_dataset_func() + self.model.eval() + input_data = None - if isinstance(point_clouds, PointCloud): - input_data = merge_second_batch( - [self.infer_point_cloud_mapper(point_clouds.data)] - ) + def map_single(point_cloud): + if isinstance(point_clouds, PointCloudWithCalibration): + return self.calib_infer_point_cloud_mapper(point_cloud) + elif isinstance(point_clouds, PointCloud): + return self.infer_point_cloud_mapper(point_cloud.data) + else: + raise ValueError("PointCloud or PointCloudWithCalibration expected") + + if isinstance(point_clouds, (PointCloud, PointCloudWithCalibration)): + input_data = merge_second_batch([map_single(point_clouds)]) elif isinstance(point_clouds, list): input_data = merge_second_batch( - [self.infer_point_cloud_mapper(x.data) for x in point_clouds] + [map_single(x) for x in point_clouds] ) else: return ValueError( - "point_clouds should be a PointCloud or a list of PointCloud" + "point_clouds should be a PointCloud or a list of PointClouds" ) output = self.model( - example_convert_to_torch(input_data, self.float_dtype, device=self.device,) + example_convert_to_torch(input_data, self.float_dtype, device=self.device), samples=samples, return_uncertainty=self.return_uncertainty, ) - if self.model_config.rpn.module_class_name == "PSA" or self.model_config.rpn.module_class_name == "RefineDet": + if self.model_config.rpn.module_class_name in ["PSA", "VPSA", "RefineDet"]: output = output[-1] annotations = compute_lidar_kitti_output( - output, self.center_limit_range, self.class_names, None + output, self.center_limit_range, self.class_names, None, return_uncertainty=self.return_uncertainty ) - result = [BoundingBox3DList.from_kitti(anno) for anno in annotations] + if self.return_uncertainty: + result = [UncertaintyBoundingBox3DList.from_kitti(anno) for anno in annotations] + else: + result = [BoundingBox3DList.from_kitti(anno) for anno in annotations] if isinstance(point_clouds, PointCloud): return result[0] -- GitLab
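For context, a minimal usage sketch (not part of the patch) of the uncertainty-aware inference path added above; the model directory, dataset paths, track id and sample count are placeholders mirroring run/eval_tracking_3d.py.

    import os
    from opendr.perception.object_detection_3d import VoxelObjectDetection3DLearner
    from opendr.perception.object_tracking_3d.datasets.kitti_tracking import (
        LabeledTrackingPointCloudsDatasetIterator,
    )

    # Config path mirrors config_roots["pointpillars"] from run/eval_tracking_3d.py.
    config = os.path.join(
        ".", "src", "opendr", "perception", "object_detection_3d",
        "voxel_object_detection_3d", "second_detector", "configs",
        "pointpillars", "car", "vnn_xyres_16.proto",
    )

    # KITTI tracking sequence 0000; paths are placeholders.
    dataset = LabeledTrackingPointCloudsDatasetIterator(
        "/data/sets/kitti_tracking/training/velodyne/0000",
        "/data/sets/kitti_tracking/training/label_02/0000.txt",
        "/data/sets/kitti_tracking/training/calib/0000.txt",
    )

    learner = VoxelObjectDetection3DLearner(
        model_config_path=config, device="cuda:0", return_uncertainty=True
    )
    learner.load("./run/models/vnn_pointpillars_car")  # placeholder model directory

    # The dataset yields (point cloud, labels) pairs, as iterated in the patch.
    point_cloud, _ = dataset[0]

    # With samples > 1 the variational heads are evaluated several times and the
    # result is an UncertaintyBoundingBox3DList with per-field variances.
    boxes = learner.infer(point_cloud, samples=5)
    for box in boxes:
        print(box.name, box.confidence, box.variance_location, box.variance_dimensions)
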