From: Isaac Hermida <isaac.hermida@digi.com>
Date: Fri, 15 Sep 2023 17:02:43 +0200
Subject: [PATCH] Customize EiQ demos

Adjust the demo scripts to:
 * Always use the NPU
 * Get better performance with USB cameras
 * Add an option to use a bigger camera resolution
 * Add an option to show the window in full screen
 * Fix some exceptions in the demos

Signed-off-by: Isaac Hermida <isaac.hermida@digi.com>
---
 dms/main.py               | 79 +++++++++++++++++++++++++++------------
 face_recognition/main.py  | 44 ++++++++++++++++++----
 gesture_detection/main.py | 32 ++++++++++++++--
 object_detection/main.py  | 40 ++++++++++++++++----
 4 files changed, 153 insertions(+), 42 deletions(-)

diff --git a/dms/main.py b/dms/main.py
index 6e04dc3..8ba99b2 100644
--- a/dms/main.py
+++ b/dms/main.py
@@ -14,10 +14,16 @@ from eye_landmark import EyeMesher
 from face_landmark import FaceMesher
 from utils import *
 
-MODEL_PATH = pathlib.Path("../models/")
-DETECT_MODEL = "face_detection_front_128_full_integer_quant.tflite"
-LANDMARK_MODEL = "face_landmark_192_integer_quant.tflite"
-EYE_MODEL = "iris_landmark_quant.tflite"
+WIDTH=640    # Requested capture width, in pixels
+HEIGH=480    # Requested capture height, in pixels
+FLIP=None    # None: do not flip; 0: flip vertically; 1: flip horizontally (around the y-axis); -1: flip both vertically and horizontally
+FORMAT=0     # 0: request MJPG (for USB cameras); any other value (e.g. None): keep the camera default (YUYV)
+
+# Always enforce the Ethos NPU, use the converted vela models
+MODEL_PATH = pathlib.Path("../vela_models/")
+DETECT_MODEL = "face_detection_front_128_full_integer_quant_vela.tflite"
+LANDMARK_MODEL = "face_landmark_192_integer_quant_vela.tflite"
+EYE_MODEL = "iris_landmark_quant_vela.tflite"
 
 # turn on camera
 parser = argparse.ArgumentParser()
@@ -29,16 +35,31 @@ parser.add_argument(
 parser.add_argument(
     '-d',
     '--delegate',
-    default='',
+    default='/usr/lib/libethosu_delegate.so',
     help='delegate path')
+parser.add_argument("-f", "--fullscreen", action="store_true", help='run on full screen mode')
 args = parser.parse_args()
 
 if args.input.isdigit():
     cap_input = int(args.input)
 else:
     cap_input = args.input
+
+# Alternative GStreamer pipeline for the OV5640 camera, in case the default capture below fails
+# cap = cv2.VideoCapture("v4l2src device=%s ! imxvideoconvert_pxp ! video/x-raw,format=RGB16,width=%d,height=%d " \
+#                         "! videoconvert ! appsink" % (args.input, WIDTH, HEIGH))
+
 cap = cv2.VideoCapture(cap_input)
+cap.set(cv2.CAP_PROP_FRAME_WIDTH, WIDTH)
+cap.set(cv2.CAP_PROP_FRAME_HEIGHT, HEIGH)
+
+if FORMAT == 0:
+    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+    cap.set(cv2.CAP_PROP_FOURCC, fourcc)
+
 ret, image = cap.read()
+if ret and FLIP is not None:  # only flip a frame that was really read; on failure image can be None and cv2.flip() would raise
+    image = cv2.flip(image, FLIP)
 if not ret:
     print("Can't read frame from source file ", args.input)
     sys.exit(-1)
@@ -66,9 +87,9 @@ def draw_face_box(image, bboxes, landmarks, scores):
         label_btmleft = bbox[:2].copy() + 10
         label_btmleft[0] += label_width
         label_btmleft[1] += label_height
-        cv2.rectangle(image, tuple(bbox[:2]), tuple(label_btmleft), color=(255, 0, 0), thickness=cv2.FILLED)
-        cv2.putText(image, score_label, (bbox[0] + 5, label_btmleft[1] - 5),
-                    cv2.FONT_HERSHEY_SIMPLEX, fontScale=1.0, color=(255, 255, 255), thickness=2)
+        #cv2.rectangle(image, tuple(bbox[:2]), tuple(label_btmleft), color=(255, 0, 0), thickness=cv2.FILLED)
+        #cv2.putText(image, score_label, (bbox[0] + 5, label_btmleft[1] - 5),
+                    #cv2.FONT_HERSHEY_SIMPLEX, fontScale=1.0, color=(255, 255, 255), thickness=2)
     return image
 
 # detect single frame
@@ -111,8 +132,8 @@ def main(image):
         right_eye_img = padded[right_box[0][1]:right_box[1][1], right_box[0][0]:right_box[1][0]]
         left_eye_landmarks, left_iris_landmarks = eye_mesher.inference(left_eye_img)
         right_eye_landmarks, right_iris_landmarks = eye_mesher.inference(right_eye_img)
-        #cv2.rectangle(image_show, left_box[0], left_box[1], color=(255, 0, 0), thickness=2)
-        #cv2.rectangle(image_show, right_box[0], right_box[1], color=(255, 0, 0), thickness=2)
+        cv2.rectangle(image_show, left_box[0], left_box[1], color=(255, 0, 0), thickness=2)
+        cv2.rectangle(image_show, right_box[0], right_box[1], color=(255, 0, 0), thickness=2)
         left_eye_ratio = get_eye_ratio(left_eye_landmarks, image_show, left_box[0])
         right_eye_ratio = get_eye_ratio(right_eye_landmarks, image_show, right_box[0])
 
@@ -155,20 +176,32 @@ def main(image):
 
 
 # endless loop
+window_name = "EiQ DMS demo"
 while ret:
-    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    # detect single
-    image_show = main(image)
-
-    # put fps
-    result = cv2.cvtColor(image_show, cv2.COLOR_RGB2BGR)
-
-    # display the result
-    cv2.imshow('demo', result)
-
-    ret, image = cap.read()
-    if cv2.waitKey(1) & 0xFF == ord('q'):
-        break
+    try:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        # detect single
+        image_show = main(image)
+
+        # put fps
+        result = cv2.cvtColor(image_show, cv2.COLOR_RGB2BGR)
+
+        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
+        if args.fullscreen:
+            cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
+
+        # display the result
+        cv2.imshow(window_name, result)
+    except Exception as err:
+        # Best effort: report the failure and drop this frame
+        time.sleep(0.2)
+        print("Exception catched:%s\n... continuing with test" % repr(err))
+    # Outside the try: always fetch the next frame and poll the keyboard, so a failing frame is never retried forever
+    ret, image = cap.read()
+    if ret and FLIP is not None:
+        image = cv2.flip(image, FLIP)
+    if cv2.waitKey(1) & 0xFF == ord('q'):
+        break
 
 time.sleep(2)
 cap.release()
diff --git a/face_recognition/main.py b/face_recognition/main.py
index acc838e..33ffa71 100644
--- a/face_recognition/main.py
+++ b/face_recognition/main.py
@@ -13,6 +13,11 @@ from face_detection import YoloFace
 from face_recognition import Facenet
 from face_database import FaceDatabase
 
+WIDTH=640    # Requested capture width, in pixels
+HEIGH=480    # Requested capture height, in pixels
+FLIP=None    # None: do not flip; 0: flip vertically; 1: flip horizontally (around the y-axis); -1: flip both vertically and horizontally
+FORMAT=0     # 0: request MJPG (for USB cameras); any other value (e.g. None): keep the camera default (YUYV)
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
     '-i',
@@ -22,12 +27,14 @@ parser.add_argument(
 parser.add_argument(
     '-d',
     '--delegate',
-    default='',
+    default='/usr/lib/libethosu_delegate.so',
     help='delegate path')
+parser.add_argument("-f", "--fullscreen", action="store_true", help='run on full screen mode')
 args = parser.parse_args()
 
-detector = YoloFace("../models/yoloface_int8.tflite", args.delegate)
-recognizer = Facenet("../models/facenet_512_int_quantized.tflite", args.delegate)
+# Always enforce the Ethos NPU, use the converted vela models
+detector = YoloFace("../vela_models/yoloface_int8_vela.tflite", args.delegate)
+recognizer = Facenet("../vela_models/facenet_512_int_quantized_vela.tflite", args.delegate)
 database = FaceDatabase()
 
 def ischar(c):
@@ -39,7 +46,7 @@ def get_inputs(img, msg):
         cv2.rectangle(img, (0, 0), (img.shape[1], 40), (0, 0, 0), -1)
         cv2.putText(img, msg + inputs, (30, 30),
                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
-        cv2.imshow('img', img)
+        cv2.imshow(window_name, img)
         key = cv2.waitKey(20) & 0xFF
         if key == 13 or key == 141:
             break
@@ -68,13 +75,28 @@ if args.input.isdigit():
     cap_input = int(args.input)
 else:
     cap_input = args.input
+
+# Alternative GStreamer pipeline for the OV5640 camera, in case the default capture below fails
+# vid = cv2.VideoCapture("v4l2src device=%s ! imxvideoconvert_pxp ! video/x-raw,format=RGB16,width=%d,height=%d " \
+#                         "! videoconvert ! appsink" % (args.input, WIDTH, HEIGH))
 vid = cv2.VideoCapture(cap_input)
+vid.set(cv2.CAP_PROP_FRAME_WIDTH, WIDTH)
+vid.set(cv2.CAP_PROP_FRAME_HEIGHT, HEIGH)
+
+if FORMAT == 0:
+    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+    vid.set(cv2.CAP_PROP_FOURCC, fourcc)
+
 PADDING = 10
 tips = "Press 'a' to add person, 'd' to delete person, 'p' to print database"
+
+window_name = "Face recognition Demo"
 while True:
     embeddings = None
 
     ret, img = vid.read()
+    if ret and FLIP is not None:  # skip the flip on a failed read (img can be None); the check below then ends the loop
+        img = cv2.flip(img, FLIP)
     if (ret == False):
         break
     boxes = detector.detect(img)
@@ -97,12 +119,20 @@ while True:
 
     cv2.putText(img, tips, (30, 30),
                 cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 3)
-    cv2.imshow('img', img)
+
+    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
+    if args.fullscreen:
+        cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
+
+    cv2.imshow(window_name, img)
     key = cv2.waitKey(1) & 0xFF
     if (key == ord('a')):
         msg = "ADD. Please input name:"
         name = get_inputs(img, msg)
-        database.add_name(name, embeddings)
+        if embeddings is not None:  # test the None sentinel set at the top of the loop; truth-testing a multi-element array would raise ValueError
+            database.add_name(name, embeddings)
+        else:
+            print("Not a valid face, not adding user to database, ignoring...")
     elif (key == ord('d')):
         msg = "DEL. Please input name:"
         name = get_inputs(img, msg)
@@ -110,7 +140,7 @@ while True:
     elif (key == ord('p')):
         names = ",".join(database.get_names())
         print_longtext(img, names + "   Press any key to continue.")
-        cv2.imshow('img', img)
+        cv2.imshow(window_name, img)
         while cv2.waitKey(100) & 0xFF == 0xFF:
             pass
 
diff --git a/gesture_detection/main.py b/gesture_detection/main.py
index da83ce0..15b8597 100644
--- a/gesture_detection/main.py
+++ b/gesture_detection/main.py
@@ -9,8 +9,9 @@ import time
 import argparse
 from hand_tracker import HandTracker
 
-PALM_MODEL_PATH = "../models/palm_detection_builtin_256_integer_quant.tflite"
-LANDMARK_MODEL_PATH = "../models/hand_landmark_3d_256_integer_quant.tflite"
+# Always enforce the Ethos NPU, use the converted vela models
+PALM_MODEL_PATH = "../vela_models/palm_detection_builtin_256_integer_quant_vela.tflite"
+LANDMARK_MODEL_PATH = "../vela_models/hand_landmark_3d_256_integer_quant_vela.tflite"
 ANCHORS_PATH = "anchors.csv"
 
 def draw_landmarks(points, frame):
@@ -52,15 +53,33 @@ parser.add_argument(
 parser.add_argument(
     '-d',
     '--delegate',
-    default='',
+    default='/usr/lib/libethosu_delegate.so',
     help='delegate path')
+parser.add_argument("-f", "--fullscreen", action="store_true", help='run on full screen mode')
 args = parser.parse_args()
 
 if args.input.isdigit():
     cap_input = int(args.input)
 else:
     cap_input = args.input
+
+WIDTH=640    # Requested capture width, in pixels
+HEIGH=480    # Requested capture height, in pixels
+FLIP=None    # cv2.flip code (None: no flip; 0: vertical; 1: horizontal, around the y-axis; -1: both). NOTE: defined but never applied to a frame by this patch's changes to this script
+FORMAT=0     # 0: request MJPG (for USB cameras); any other value (e.g. None): keep the camera default (YUYV)
+
+# Alternative GStreamer pipeline for the OV5640 camera, in case the default capture below fails
+# capture = cv2.VideoCapture("v4l2src device=%s ! imxvideoconvert_pxp ! video/x-raw,format=RGB16,width=%d,height=%d " \
+#                         "! videoconvert ! appsink" % (args.input, WIDTH, HEIGH))
+
 capture = cv2.VideoCapture(cap_input)
+capture.set(cv2.CAP_PROP_FRAME_WIDTH, WIDTH)
+capture.set(cv2.CAP_PROP_FRAME_HEIGHT, HEIGH)
+
+if FORMAT == 0:
+    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+    capture.set(cv2.CAP_PROP_FOURCC, fourcc)
+
 ret, frame = capture.read()
 if (frame is None):
     print("Can't read frame from source file ", args.input)
@@ -68,11 +87,16 @@ if (frame is None):
 
 detector = HandTracker(PALM_MODEL_PATH, LANDMARK_MODEL_PATH, ANCHORS_PATH, args.delegate, box_shift=0.2, box_enlarge=1.3)
 
+window_name = "Hand Gesture Demo"
 while ret:
     image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
     points, _ = detector(image)
     draw_landmarks(points, frame)
-    cv2.imshow("hand", frame)
+
+    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
+    if args.fullscreen:
+        cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
+    cv2.imshow(window_name, frame)
 
     ret, frame = capture.read()
     if cv2.waitKey(1) & 0xFF == ord('q'):
diff --git a/object_detection/main.py b/object_detection/main.py
index 1356111..efa614e 100644
--- a/object_detection/main.py
+++ b/object_detection/main.py
@@ -13,7 +13,13 @@ import argparse
 
 from labels import label2string
 
-MODEL_PATH = "../models/ssd_mobilenet_v1_quant.tflite"
+WIDTH=640    # Requested capture width, in pixels
+HEIGH=480    # Requested capture height, in pixels
+FLIP=None    # None: do not flip; 0: flip vertically; 1: flip horizontally (around the y-axis); -1: flip both vertically and horizontally
+FORMAT=0     # 0: request MJPG (for USB cameras); any other value (e.g. None): keep the camera default (YUYV)
+
+# Always enforce the Ethos NPU, use the converted vela models
+MODEL_PATH = "../vela_models/ssd_mobilenet_v1_quant_vela.tflite"
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
@@ -24,21 +30,31 @@ parser.add_argument(
 parser.add_argument(
     '-d',
     '--delegate',
-    default='',
+    default='/usr/lib/libethosu_delegate.so',
     help='delegate path')
+parser.add_argument("-f", "--fullscreen", action="store_true", help='run on full screen mode')
 args = parser.parse_args()
 
 if args.input.isdigit():
     cap_input = int(args.input)
 else:
     cap_input = args.input
+
+# Alternative GStreamer pipeline for the OV5640 camera, in case the default capture below fails
+# vid = cv2.VideoCapture("v4l2src device=%s ! imxvideoconvert_pxp ! video/x-raw,format=RGB16,width=%d,height=%d " \
+#                         "! videoconvert ! appsink" % (args.input, WIDTH, HEIGH))
+
 vid = cv2.VideoCapture(cap_input)
+vid.set(cv2.CAP_PROP_FRAME_WIDTH, WIDTH)
+vid.set(cv2.CAP_PROP_FRAME_HEIGHT, HEIGH)
 
-if(args.delegate):
-    ext_delegate = [tflite.load_delegate(args.delegate)]
-    interpreter = tflite.Interpreter(model_path=MODEL_PATH, experimental_delegates=ext_delegate)
-else:
-    interpreter = tflite.Interpreter(model_path=MODEL_PATH)
+if FORMAT == 0:
+    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+    vid.set(cv2.CAP_PROP_FOURCC, fourcc)
+
+# Always enforce the Ethos NPU
+ext_delegate = [tflite.load_delegate(args.delegate)]
+interpreter = tflite.Interpreter(model_path=MODEL_PATH, experimental_delegates=ext_delegate)
 interpreter.allocate_tensors()
 
 input_details = interpreter.get_input_details()
@@ -52,10 +68,13 @@ total_fps = 0
 total_time = 0
 
 ret, frame = vid.read()
+if frame is not None and FLIP is not None:  # never flip a missing frame; the check below reports the failed read
+    frame = cv2.flip(frame, FLIP)
 if (frame is None):
     print("Can't read frame from source file ", args.input)
     exit(0)
 
+window_name = "Object Detection Demo"
 while ret:
     total_fps += 1
     loop_start = time.time()
@@ -94,9 +113,14 @@ while ret:
     msg = "FPS:" + str(fps) + "  Invoke time:" + str(invoke_time) + "ms"
     cv2.putText(frame, msg, (0, 20), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 3)
 
-    cv2.imshow("image", frame)
+    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
+    if args.fullscreen:
+        cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
+    cv2.imshow(window_name, frame)
 
     ret, frame = vid.read()
+    if ret and FLIP is not None:  # at end of stream there is no frame to flip; "while ret" then exits cleanly
+        frame = cv2.flip(frame, FLIP)
     if cv2.waitKey(1) & 0xFF == ord('q'):
         break