Skip to content
Snippets Groups Projects
Unverified Commit 26e0ae76 authored by ad-daniel's avatar ad-daniel Committed by GitHub
Browse files

Merge branch 'develop' into merge_master_into_develop

parents 8cf2adbd 2a0e6418
No related branches found
No related tags found
No related merge requests found
......@@ -13,12 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import rospy
import torch
import torchvision
import cv2
import numpy as np
from pathlib import Path
from std_msgs.msg import String
from vision_msgs.msg import ObjectHypothesis
......@@ -31,20 +30,19 @@ from opendr.perception.activity_recognition import X3DLearner
class HumanActivityRecognitionNode:
# NOTE(review): this span is a rendered merge diff, not valid Python. Adjacent
# near-duplicate lines are the pre-merge and post-merge variants of the same
# statement (e.g. `input_image_topic` vs `input_rgb_image_topic`); the
# post-merge lines are authoritative. Lines starting with `......@@` are diff
# hunk headers — code between hunks (including the learner instantiation /
# download / load around old lines 64-67) is omitted from this view, so this
# block is documented in place rather than rewritten.
def __init__(
self,
input_image_topic="/usb_cam/image_raw",
input_rgb_image_topic="/usb_cam/image_raw",
output_category_topic="/opendr/human_activity_recognition",
output_category_description_topic="/opendr/human_activity_recognition_description",
device="cuda",
model='cox3d-m'
model="cox3d-m",
):
"""
Creates a ROS Node for face recognition
:param input_image_topic: Topic from which we are reading the input image
:type input_image_topic: str
:param output_category_topic: Topic to which we are publishing the recognized face info
Creates a ROS Node for video-based human activity recognition.
:param input_rgb_image_topic: Topic from which we are reading the input image
:type input_rgb_image_topic: str
:param output_category_topic: Topic to which we are publishing the recognized activity
(if None, we are not publishing the info)
:type output_category_topic: str
:param output_category_description_topic: Topic to which we are publishing the ID of the recognized action
......@@ -52,12 +50,20 @@ class HumanActivityRecognitionNode:
:type output_category_description_topic: str
:param device: device on which we are running inference ('cpu' or 'cuda')
:type device: str
:param model: architecture to use for human activity recognition.
:param model: Architecture to use for human activity recognition.
(Options: 'cox3d-s', 'cox3d-m', 'cox3d-l', 'x3d-xs', 'x3d-s', 'x3d-m', 'x3d-l')
:type model: str
"""
# Validate the requested architecture before splitting it into family/size.
assert model in {"cox3d-s", "cox3d-m", "cox3d-l", "x3d-xs", "x3d-s", "x3d-m", "x3d-l"}
assert model in {
"cox3d-s",
"cox3d-m",
"cox3d-l",
"x3d-xs",
"x3d-s",
"x3d-m",
"x3d-l",
}
# `model` is "<family>-<size>", e.g. "cox3d-m" selects CoX3DLearner, backbone "m".
model_name, model_size = model.split("-")
Learner = {"cox3d": CoX3DLearner, "x3d": X3DLearner}[model_name]
......@@ -68,7 +74,9 @@ class HumanActivityRecognitionNode:
# Set up preprocessing
# CoX3D consumes one frame at a time; X3D consumes a clip (window) of frames.
if model_name == "cox3d":
self.preprocess = _image_preprocess(image_size=self.learner.model_hparams["image_size"])
self.preprocess = _image_preprocess(
image_size=self.learner.model_hparams["image_size"]
)
else: # == x3d
self.preprocess = _video_preprocess(
image_size=self.learner.model_hparams["image_size"],
......@@ -76,22 +84,32 @@
)
# Set up ROS topics and bridge
self.input_rgb_image_topic = input_rgb_image_topic
# Publishers are optional: passing None for a topic disables that output.
self.hypothesis_publisher = (
rospy.Publisher(output_category_topic, ObjectHypothesis, queue_size=10) if output_category_topic else None
rospy.Publisher(output_category_topic, ObjectHypothesis, queue_size=1)
if output_category_topic
else None
)
self.string_publisher = (
rospy.Publisher(output_category_description_topic, String, queue_size=10) if output_category_topic else None
rospy.Publisher(output_category_description_topic, String, queue_size=1)
if output_category_description_topic
else None
)
rospy.Subscriber(input_image_topic, ROS_Image, self.callback)
self.bridge = ROSBridge()
def listen(self):
    """
    Start the node: initialize ROS, subscribe to the input image topic,
    and process incoming messages until shutdown.
    """
    rospy.init_node("opendr_human_activity_recognition", anonymous=True)
    # queue_size=1 with a large buff_size keeps only the freshest frame,
    # so latency does not build up when inference is slower than the camera.
    rospy.Subscriber(
        self.input_rgb_image_topic,
        ROS_Image,
        self.callback,
        queue_size=1,
        buff_size=10000000,
    )
    rospy.loginfo("Human activity recognition node started!")
    rospy.spin()
def callback(self, data):
    """
    Process an incoming image and publish the recognized activity.
    :param data: input message
    :type data: sensor_msgs.msg.Image
    """
    image = self.bridge.from_ros_image(data, encoding="rgb8")
    if image is None:
        return
    # Learner expects channel-first RGB input.
    x = self.preprocess(image.convert("channels_first", "rgb"))
    result = self.learner.infer(x)
    assert len(result) == 1
    category = result[0]
    category.confidence = float(category.confidence.max())  # Confidence for predicted class
    category.description = KINETICS400_CLASSES[category.data]  # Class name

    if self.hypothesis_publisher is not None:
        self.hypothesis_publisher.publish(self.bridge.to_ros_category(category))

    if self.string_publisher is not None:
        self.string_publisher.publish(
            self.bridge.to_ros_category_description(category)
        )
def _resize(image, size=None, inter=cv2.INTER_AREA):
    """
    Resize an image so its SHORTER side equals `size`, preserving aspect ratio.

    :param image: input image as an array of shape (H, W[, C])
    :param size: target length of the shorter side; if None, the image is
        returned unchanged
    :param inter: OpenCV interpolation flag used for resizing
    :return: the resized image (or the original image when `size` is None)
    """
    # No target size: nothing to do. Without this guard the default call
    # would raise a TypeError on `size / float(...)` below.
    if size is None:
        return image

    (h, w) = image.shape[:2]

    if h > w:
        # Width is the shorter side: pin width to `size`, scale height.
        r = size / float(w)
        dim = (size, int(h * r))
    else:
        # Height is the shorter side (or the image is square): pin height.
        r = size / float(h)
        dim = (int(w * r), size)

    # cv2.resize takes (width, height).
    return cv2.resize(image, dim, interpolation=inter)
......@@ -160,11 +172,11 @@ def _image_preprocess(image_size: int):
def wrapped(frame):
nonlocal standardize
frame = frame.transpose((1, 2, 0)) # C, H, W -> H, W, C
frame = _resize(frame, height=image_size, width=image_size)
frame = _resize(frame, size=image_size)
frame = torch.tensor(frame).permute((2, 0, 1)) # H, W, C -> C, H, W
frame = frame / 255.0 # [0, 255] -> [0.0, 1.0]
frame = standardize(frame)
return Image(frame, dtype=np.float)
return Image(frame, dtype=float)
return wrapped
......@@ -179,7 +191,7 @@ def _video_preprocess(image_size: int, window_size: int):
# NOTE(review): fragment of the `_video_preprocess` closure as rendered in a
# merge diff. The enclosing definition and the frame-window buffering code
# (between the two hunk headers) are outside this view; `frames` and
# `standardize` are presumably defined there — confirm against the full file.
def wrapped(frame):
nonlocal frames, standardize
frame = frame.transpose((1, 2, 0))  # C, H, W -> H, W, C
# The next two lines are the pre-/post-merge variants of the same resize call;
# the `size=` form matches the updated `_resize` signature.
frame = _resize(frame, height=image_size, width=image_size)
frame = _resize(frame, size=image_size)
frame = torch.tensor(frame).permute((2, 0, 1))  # H, W, C -> C, H, W
frame = frame / 255.0  # [0, 255] -> [0.0, 1.0]
frame = standardize(frame)
......@@ -194,17 +206,67 @@ def _video_preprocess(image_size: int, window_size: int):
return wrapped
def main():
    """Parse CLI arguments, select a device, and run the activity-recognition node."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input_rgb_image_topic",
        help="Topic name for input rgb image",
        type=str,
        default="/usb_cam/image_raw",
    )
    # The string "None" (any case) disables the corresponding publisher.
    parser.add_argument(
        "-o",
        "--output_category_topic",
        help="Topic to which we are publishing the recognized activity",
        type=lambda value: value if value.lower() != "none" else None,
        default="/opendr/human_activity_recognition",
    )
    parser.add_argument(
        "-od",
        "--output_category_description_topic",
        help="Topic to which we are publishing the ID of the recognized action",
        type=lambda value: value if value.lower() != "none" else None,
        default="/opendr/human_activity_recognition_description",
    )
    parser.add_argument(
        "--device",
        help='Device to use, either "cpu" or "cuda", defaults to "cuda"',
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
    )
    parser.add_argument(
        "--model",
        help="Architecture to use for human activity recognition.",
        type=str,
        default="cox3d-m",
        choices=["cox3d-s", "cox3d-m", "cox3d-l", "x3d-xs", "x3d-s", "x3d-m", "x3d-l"],
    )
    args = parser.parse_args()

    # Fall back to CPU when CUDA is requested but unavailable, or when the
    # CUDA availability query itself fails (e.g. a broken driver install).
    try:
        if args.device == "cuda" and torch.cuda.is_available():
            device = "cuda"
        elif args.device == "cuda":
            print("GPU not found. Using CPU instead.")
            device = "cpu"
        else:
            print("Using CPU.")
            device = "cpu"
    except Exception:  # narrowed from bare `except:` so SystemExit/KeyboardInterrupt propagate
        print("Using CPU.")
        device = "cpu"

    human_activity_recognition_node = HumanActivityRecognitionNode(
        input_rgb_image_topic=args.input_rgb_image_topic,
        output_category_topic=args.output_category_topic,
        output_category_description_topic=args.output_category_description_topic,
        device=device,
        model=args.model,
    )
    human_activity_recognition_node.listen()


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment