From 641a0dcb535a1a83195363ccf065acf13cc71065 Mon Sep 17 00:00:00 2001
From: silencht <silencht@qq.com>
Date: Mon, 27 Oct 2025 21:07:03 +0800
Subject: [PATCH] [update] fit xr-teleoperate proj, add pass-through mode.

---
 src/televuer/televuer.py   | 253 ++++++++++++++++++++-----------------
 src/televuer/tv_wrapper.py |  38 +++---
 2 files changed, 161 insertions(+), 130 deletions(-)

diff --git a/src/televuer/televuer.py b/src/televuer/televuer.py
index af67639..c855012 100644
--- a/src/televuer/televuer.py
+++ b/src/televuer/televuer.py
@@ -10,66 +10,74 @@ from pathlib import Path
 
 
 class TeleVuer:
-    def __init__(self, binocular: bool, use_hand_tracking: bool, img_shape, cert_file=None, key_file=None, ngrok=False, 
-                use_image = True, webrtc=False):
+    def __init__(self, use_hand_tracking: bool, pass_through:bool=False, binocular: bool=True, img_shape: tuple=None, 
+                       cert_file=None, key_file=None, webrtc: bool=False, webrtc_url: str=None):
         """
         TeleVuer class for OpenXR-based XR teleoperate applications.
-        This class handles the communication with the Vuer server and manages the shared memory for image and pose data.
+        This class handles the communication with the Vuer server and manages image and pose data.
 
-        :param binocular: bool, whether the application is binocular (stereoscopic) or monocular.
         :param use_hand_tracking: bool, whether to use hand tracking or controller tracking.
-        :param img_shape: tuple, shape of the image (height, width).
+        :param pass_through: bool, controls the VR viewing mode.
+
+            Note:
+            - if pass_through is True, the XR user will see the real world through the VR headset cameras.
+            - if pass_through is False, the XR user will see the images provided by webrtc or render_to_xr method:
+            - webrtc is prior to render_to_xr. if webrtc is True, the class will use webrtc for image transmission.
+            - if webrtc is False, the class will use render_to_xr for image transmission.
+    
+        :param binocular: bool, whether the application is binocular (stereoscopic) or monocular.
+        :param img_shape: tuple, shape of the head image (height, width).
         :param cert_file: str, path to the SSL certificate file.
         :param key_file: str, path to the SSL key file.
-        :param ngrok: bool, whether to use ngrok for tunneling.
-        :param use_image: bool, whether to use image streaming: ImageBackground or WebRTCVideoPlane.
-                          if False, no image stream is used.
         :param webrtc: bool, whether to use WebRTC for real-time communication. if False, use ImageBackground.
+        :param webrtc_url: str, URL for the WebRTC offer.
         """
-        self.binocular = binocular
         self.use_hand_tracking = use_hand_tracking
-        self.img_shape = (img_shape[0], img_shape[1], 3)
-        self.img2display_shm = shared_memory.SharedMemory(create=True, size=np.prod(self.img_shape) * np.uint8().itemsize)
-        self.img2display = np.ndarray(self.img_shape, dtype=np.uint8, buffer=self.img2display_shm.buf)
-
-        self.latest_frame = None
-        self.new_frame_event = threading.Event()
-        self.writer_thread = threading.Thread(target=self._img2display_writer, daemon=True)
-        self.writer_thread.start()
+        self.pass_through = pass_through
 
+        self.binocular = binocular
+        self.img_shape = (img_shape[0], img_shape[1], 3)
         self.img_height = self.img_shape[0]
         if self.binocular:
             self.img_width  = self.img_shape[1] // 2
         else:
             self.img_width  = self.img_shape[1]
-        
+        self.aspect_ratio = self.img_width / self.img_height
+
         current_module_dir = Path(__file__).resolve().parent.parent.parent
         if cert_file is None:
             cert_file = os.path.join(current_module_dir, "cert.pem")
         if key_file is None:
             key_file = os.path.join(current_module_dir, "key.pem")
 
-        if ngrok:
-            self.vuer = Vuer(host='0.0.0.0', queries=dict(grid=False), queue_len=3)
-        else:
-            self.vuer = Vuer(host='0.0.0.0', cert=cert_file, key=key_file, queries=dict(grid=False), queue_len=3)
-
+        self.vuer = Vuer(host='0.0.0.0', cert=cert_file, key=key_file, queries=dict(grid=False), queue_len=3)
         self.vuer.add_handler("CAMERA_MOVE")(self.on_cam_move)
         if self.use_hand_tracking:
             self.vuer.add_handler("HAND_MOVE")(self.on_hand_move)
         else:
             self.vuer.add_handler("CONTROLLER_MOVE")(self.on_controller_move)
-        self.use_image = use_image
+
         self.webrtc = webrtc
-        if self.use_image:
-            if self.binocular and not self.webrtc:
-                self.vuer.spawn(start=False)(self.main_image_binocular)
-            elif not self.binocular and not self.webrtc:
-                self.vuer.spawn(start=False)(self.main_image_monocular)
-            elif self.binocular and self.webrtc:
+        self.webrtc_url = webrtc_url
+
+        if self.webrtc:
+            if self.binocular:
                 self.vuer.spawn(start=False)(self.main_image_binocular_webrtc)
-            elif not self.binocular and self.webrtc:
+            else:
                 self.vuer.spawn(start=False)(self.main_image_monocular_webrtc)
+        else:
+            if self.pass_through is False:
+                self.img2display_shm = shared_memory.SharedMemory(create=True, size=np.prod(self.img_shape) * np.uint8().itemsize)
+                self.img2display = np.ndarray(self.img_shape, dtype=np.uint8, buffer=self.img2display_shm.buf)
+                self.latest_frame = None
+                self.new_frame_event = threading.Event()
+                self.stop_writer_event = threading.Event()
+                self.writer_thread = threading.Thread(target=self._xr_render_loop, daemon=True)
+                self.writer_thread.start()
+            if self.binocular:
+                 self.vuer.spawn(start=False)(self.main_image_binocular)
+            else:
+                 self.vuer.spawn(start=False)(self.main_image_monocular)
 
         self.head_pose_shared = Array('d', 16, lock=True)
         self.left_arm_pose_shared = Array('d', 16, lock=True)
@@ -119,25 +127,40 @@ class TeleVuer:
             pass
         except Exception as e:
             print(f"Vuer encountered an error: {e}")
-
-    def _img2display_writer(self):
-        while True:
-            self.new_frame_event.wait()
+        finally:
+            if hasattr(self, "stop_writer_event"):
+                self.stop_writer_event.set()
+
+    def _xr_render_loop(self):
+        while not self.stop_writer_event.is_set():
+            if not self.new_frame_event.wait(timeout=0.1):
+                continue
             self.new_frame_event.clear()
-            self.latest_frame = cv2.cvtColor(self.latest_frame, cv2.COLOR_BGR2RGB)
-            self.img2display[:] = self.latest_frame #.copy()
+            if self.latest_frame is None:
+                continue
+            latest_frame = self.latest_frame
+            latest_frame = cv2.cvtColor(latest_frame, cv2.COLOR_BGR2RGB)
+            self.img2display[:] = latest_frame
     
-    def set_display_image(self, image):
+    def render_to_xr(self, image):
+        if self.webrtc or self.pass_through:
+            print("[TeleVuer] Warning: render_to_xr is ignored when webrtc is enabled or pass_through is True.")
+            return
         self.latest_frame = image
         self.new_frame_event.set()
 
-    
     def close(self):
         self.process.terminate()
         self.process.join(timeout=0.5)
-        self.img2display_shm.close()
-        self.img2display_shm.unlink()
-        self.writer_thread.join(timeout=0.5)
+        if not self.webrtc and not self.pass_through:
+            self.stop_writer_event.set()
+            self.new_frame_event.set()
+            self.writer_thread.join(timeout=0.5)
+            try:
+                self.img2display_shm.close()
+                self.img2display_shm.unlink()
+            except:
+                pass
 
     async def on_cam_move(self, event, session, fps=60):
         try:
@@ -232,7 +255,7 @@ class TeleVuer:
         except:
             pass
     
-    async def main_image_binocular(self, session, fps=60):
+    async def main_image_binocular(self, session):
         if self.use_hand_tracking:
             session.upsert(
                 Hands(
@@ -254,41 +277,41 @@ class TeleVuer:
                 to="bgChildren",
             )
         while True:
-            aspect_ratio = self.img_width / self.img_height
-            session.upsert(
-                [
-                    ImageBackground(
-                        self.img2display[:, :self.img_width],
-                        aspect=aspect_ratio,
-                        height=1,
-                        distanceToCamera=1,
-                        # The underlying rendering engine supported a layer binary bitmask for both objects and the camera. 
-                        # Below we set the two image planes, left and right, to layers=1 and layers=2. 
-                        # Note that these two masks are associated with left eye’s camera and the right eye’s camera.
-                        layers=1,
-                        format="jpeg",
-                        quality=80,
-                        key="background-left",
-                        interpolate=True,
-                    ),
-                    ImageBackground(
-                        self.img2display[:, self.img_width:],
-                        aspect=aspect_ratio,
-                        height=1,
-                        distanceToCamera=1,
-                        layers=2,
-                        format="jpeg",
-                        quality=80,
-                        key="background-right",
-                        interpolate=True,
-                    ),
-                ],
-                to="bgChildren",
-            )
+            if self.pass_through is False:
+                session.upsert(
+                    [
+                        ImageBackground(
+                            self.img2display[:, :self.img_width],
+                            aspect=self.aspect_ratio,
+                            height=1,
+                            distanceToCamera=1,
+                            # The underlying rendering engine supported a layer binary bitmask for both objects and the camera. 
+                            # Below we set the two image planes, left and right, to layers=1 and layers=2. 
+                            # Note that these two masks are associated with left eye’s camera and the right eye’s camera.
+                            layers=1,
+                            format="jpeg",
+                            quality=80,
+                            key="background-left",
+                            interpolate=True,
+                        ),
+                        ImageBackground(
+                            self.img2display[:, self.img_width:],
+                            aspect=self.aspect_ratio,
+                            height=1,
+                            distanceToCamera=1,
+                            layers=2,
+                            format="jpeg",
+                            quality=80,
+                            key="background-right",
+                            interpolate=True,
+                        ),
+                    ],
+                    to="bgChildren",
+                )
             # 'jpeg' encoding should give you about 30fps with a 16ms wait in-between.
             await asyncio.sleep(0.016)
 
-    async def main_image_monocular(self, session, fps=60):
+    async def main_image_monocular(self, session):
         if self.use_hand_tracking:
             session.upsert(
                 Hands(
@@ -311,26 +334,25 @@ class TeleVuer:
             )
 
         while True:
-            aspect_ratio = self.img_width / self.img_height
-            session.upsert(
-                [
-                    ImageBackground(
-                        self.img2display,
-                        aspect=aspect_ratio,
-                        height=1,
-                        distanceToCamera=1,
-                        format="jpeg",
-                        quality=80,
-                        key="background-mono",
-                        interpolate=True,
-                    ),
-                ],
-                to="bgChildren",
-            )
+            if self.pass_through is False:
+                session.upsert(
+                    [
+                        ImageBackground(
+                            self.img2display,
+                            aspect=self.aspect_ratio,
+                            height=1,
+                            distanceToCamera=1,
+                            format="jpeg",
+                            quality=80,
+                            key="background-mono",
+                            interpolate=True,
+                        ),
+                    ],
+                    to="bgChildren",
+                )
             await asyncio.sleep(0.016)
 
     async def main_image_binocular_webrtc(self, session):
-        aspect_ratio = self.img_width / self.img_height
         if self.use_hand_tracking:
             session.upsert(
                 Hands(
@@ -350,22 +372,23 @@ class TeleVuer:
                     showRight=False,
                 )
             )
+
         while True:
-            session.upsert(
-                WebRTCStereoVideoPlane(
-                    src="https://127.0.0.1:60001/offer",
-                    iceServers=[],
-                    key="video-quad",
-                    aspect=aspect_ratio,
-                    height = 7,
-                ),
-                to="bgChildren",
-            )
+            if self.pass_through is False:
+                session.upsert(
+                    WebRTCStereoVideoPlane(
+                        src=self.webrtc_url,
+                        iceServers=[],
+                        key="video-quad",
+                        aspect=self.aspect_ratio,
+                        height = 7,
+                    ),
+                    to="bgChildren",
+                )
 
             await asyncio.sleep(0.016)
 
-    async def main_image_monocular_webrtc(self, session, fps=60):
-        aspect_ratio = self.img_width / self.img_height
+    async def main_image_monocular_webrtc(self, session):
         if self.use_hand_tracking:
             session.upsert(
                 Hands(
@@ -385,17 +408,19 @@ class TeleVuer:
                     showRight=False,
                 )
             )
+
         while True:
-            session.upsert(
-                WebRTCVideoPlane(
-                    src="https://127.0.0.1:60001/offer",
-                    iceServers=[],
-                    key="video-quad",
-                    aspect=aspect_ratio,
-                    height = 7,
-                ),
-                to="bgChildren",
-            )
+            if self.pass_through is False:
+                session.upsert(
+                    WebRTCVideoPlane(
+                        src=self.webrtc_url,
+                        iceServers=[],
+                        key="video-quad",
+                        aspect=self.aspect_ratio,
+                        height = 7,
+                    ),
+                    to="bgChildren",
+                )
 
             await asyncio.sleep(0.016)
     # ==================== common data ====================
diff --git a/src/televuer/tv_wrapper.py b/src/televuer/tv_wrapper.py
index 19ec4d3..6f1df44 100644
--- a/src/televuer/tv_wrapper.py
+++ b/src/televuer/tv_wrapper.py
@@ -193,27 +193,33 @@ class TeleData:
 
 
 class TeleVuerWrapper:
-    def __init__(self, binocular: bool, use_hand_tracking: bool, img_shape, return_hand_rot_data: bool = False,
-                       cert_file = None, key_file = None, ngrok = False, use_image = True, webrtc = False):
+    def __init__(self, use_hand_tracking: bool, pass_through: bool=False, binocular: bool=True, img_shape: tuple=(480, 640), 
+                       cert_file = None, key_file = None, webrtc: bool=True, webrtc_url: str=None, 
+                       return_hand_rot_data: bool=False):
         """
         TeleVuerWrapper is a wrapper for the TeleVuer class, which handles XR device's data suit for robot control.
         It initializes the TeleVuer instance with the specified parameters and provides a method to get motion state data.
 
-        :param binocular: A boolean indicating whether the head camera device is binocular or not.
-        :param use_hand_tracking: A boolean indicating whether to use hand tracking or use controller tracking.
-        :param img_shape: The shape of the image to be processed.
-        :param return_hand_rot_data: A boolean indicating whether to return the hand rotation data.
-        :param cert_file: The path to the certificate file for secure connection.
-        :param key_file: The path to the key file for secure connection.
-        :param ngrok: A boolean indicating whether to use ngrok for remote access.
-        :param use_image: A boolean indicating whether to use image streaming: ImageBackground or WebRTCVideoPlane.
-                          if False, no image stream is used.
-        :param webrtc: A boolean indicating whether to use WebRTC for real-time communication. if False, use ImageBackground.
+        :param use_hand_tracking: bool, whether to use hand tracking or controller tracking.
+        :param pass_through: bool, controls the VR viewing mode.
+
+            Note:
+            - if pass_through is True, the XR user will see the real world through the VR headset cameras.
+            - if pass_through is False, the XR user will see the images provided by webrtc or render_to_xr method:
+            - webrtc is prior to render_to_xr. if webrtc is True, the class will use webrtc for image transmission.
+            - if webrtc is False, the class will use render_to_xr for image transmission.
+    
+        :param binocular: bool, whether the application is binocular (stereoscopic) or monocular.
+        :param img_shape: tuple, shape of the head image (height, width).
+        :param cert_file: str, path to the SSL certificate file.
+        :param key_file: str, path to the SSL key file.
+        :param webrtc: bool, whether to use WebRTC for real-time communication. if False, use ImageBackground.
+        :param webrtc_url: str, URL for the WebRTC offer.
         """
         self.use_hand_tracking = use_hand_tracking
         self.return_hand_rot_data = return_hand_rot_data
-        self.tvuer = TeleVuer(binocular, use_hand_tracking, img_shape, cert_file=cert_file, key_file=key_file,
-                              ngrok=ngrok, use_image=use_image, webrtc=webrtc)
+        self.tvuer = TeleVuer(use_hand_tracking, pass_through, binocular, img_shape, cert_file=cert_file, key_file=key_file,
+                              webrtc=webrtc, webrtc_url=webrtc_url)
     
     def get_tele_data(self):
         """
@@ -407,8 +413,8 @@ class TeleVuerWrapper:
                 right_ctrl_thumbstickValue=self.tvuer.right_ctrl_thumbstickValue,
             )
         
-    def set_display_image(self, img):
-        self.tvuer.set_display_image(img)
+    def render_to_xr(self, img):
+        self.tvuer.render_to_xr(img)
     
     def close(self):
         self.tvuer.close()
\ No newline at end of file