From f200b4c7b446ea43f769f8a78acec1319e01b300 Mon Sep 17 00:00:00 2001
From: Pikachu~~~ <Moeblack@users.noreply.github.com>
Date: Sun, 4 Jun 2023 19:49:27 +0800
Subject: [PATCH] Roop-multi changed the implementation of multi-threading
 processing for nvidia GPU. (#317)

* changed the multi-thread implementation for nvidia gpu

* Update requirements.txt

* Add files via upload

* fix core.py and swapper.py

* fix core.py

* code clean

* code clean

* doubles performance of gpu-mode

---------

Co-authored-by: Moeblack <Moeblack@kuroinekorachi@gmail.com>
Co-authored-by: Somdev Sangwan <s0md3v@gmail.com>
---
 requirements.txt |   2 +-
 roop/analyser.py |   7 --
 roop/core.py     |   9 +--
 roop/globals.py  |   7 +-
 roop/swapper.py  | 169 +++++++++++++++++++++++++++--------------------
 5 files changed, 106 insertions(+), 88 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index f51fc17..0897eb9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,4 @@ tensorflow==2.12.0; sys_platform != 'darwin'
 opennsfw2==0.10.2
 protobuf==4.23.2
 pynvml==11.5.0
-tqdm==4.65.0
+tqdm==4.65.0
\ No newline at end of file
diff --git a/roop/analyser.py b/roop/analyser.py
index 716af3b..804f7a8 100644
--- a/roop/analyser.py
+++ b/roop/analyser.py
@@ -1,5 +1,4 @@
 import insightface
-import onnxruntime
 import roop.globals
 
 FACE_ANALYSER = None
@@ -8,12 +7,6 @@ FACE_ANALYSER = None
 def get_face_analyser():
     global FACE_ANALYSER
     if FACE_ANALYSER is None:
-        session_options = onnxruntime.SessionOptions()
-        if roop.globals.gpu_vendor is not None:
-            session_options.intra_op_num_threads = roop.globals.gpu_threads
-        else:
-            session_options.intra_op_num_threads = roop.globals.cpu_threads
-        session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
         FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.providers)
         FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640))
     return FACE_ANALYSER
diff --git a/roop/core.py b/roop/core.py
index ad0c2f4..b32dc32 100755
--- a/roop/core.py
+++ b/roop/core.py
@@ -10,6 +10,7 @@ import signal
 import shutil
 import glob
 import argparse
+import psutil
 import torch
 from pathlib import Path
 from opennsfw2 import predict_video_frames, predict_image
@@ -33,11 +34,12 @@ parser.add_argument('--keep-fps', help='maintain original fps', dest='keep_fps',
 parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False)
 parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False)
 parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int)
-parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int)
-parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int)
+parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int, default=max(psutil.cpu_count() - 2, 2))
+parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int, default=4)
 parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia'])
 
 args = {}
+
 for name, value in vars(parser.parse_args()).items():
     args[name] = value
 
@@ -218,8 +220,7 @@ def save_file_handler(path: str):
 def create_test_preview(frame_number):
     return process_faces(
         get_face_single(cv2.imread(args['source_img'])), 
-        get_video_frame(args['target_path'], frame_number),
-        None
+        get_video_frame(args['target_path'], frame_number)
     )
 
 
diff --git a/roop/globals.py b/roop/globals.py
index 1c1bc49..da3cfac 100644
--- a/roop/globals.py
+++ b/roop/globals.py
@@ -1,10 +1,9 @@
 import onnxruntime
-import psutil
 
-all_faces = False
+all_faces = None
 log_level = 'error'
-cpu_threads = max(psutil.cpu_count() - 2, 2)
-gpu_threads = 8
+cpu_threads = None
+gpu_threads = None
 gpu_vendor = None
 providers = onnxruntime.get_available_providers()
 
diff --git a/roop/swapper.py b/roop/swapper.py
index 9f25b46..de00920 100644
--- a/roop/swapper.py
+++ b/roop/swapper.py
@@ -1,72 +1,97 @@
-import os
-from tqdm import tqdm
-import torch
-import onnxruntime
-import cv2
-import insightface
-
-import roop.globals
-from roop.analyser import get_face_single, get_face_many
-
-FACE_SWAPPER = None
-
-
-def get_face_swapper():
-    global FACE_SWAPPER
-    if FACE_SWAPPER is None:
-        session_options = onnxruntime.SessionOptions()
-        if roop.globals.gpu_vendor is not None:
-            session_options.intra_op_num_threads = roop.globals.gpu_threads
-        else:
-            session_options.intra_op_num_threads = roop.globals.cpu_threads
-        session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
-        model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx')
-        FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers, session_options=session_options)
-    return FACE_SWAPPER
-
-
-def swap_face_in_frame(source_face, target_face, frame):
-    if target_face:
-        return get_face_swapper().get(frame, target_face, source_face, paste_back=True)
-    return frame
-
-
-def process_faces(source_face, target_frame, progress):
-    if roop.globals.all_faces:
-        many_faces = get_face_many(target_frame)
-        if many_faces:
-            for face in many_faces:
-                target_frame = swap_face_in_frame(source_face, face, target_frame)
-    else:
-        face = get_face_single(target_frame)
-        if face:
-            target_frame = swap_face_in_frame(source_face, face, target_frame)
-    return target_frame
-
-
-def process_video(source_img, frame_paths, preview_callback):
-    source_face = get_face_single(cv2.imread(source_img))
-    progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
-
-    with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
-        for frame_path in frame_paths:
-            if roop.globals.gpu_vendor == 'nvidia':
-                progress.set_postfix(cuda_utilization="{:02d}%".format(torch.cuda.utilization()), cuda_memory="{:02d}GB".format(torch.cuda.memory_usage()))
-            frame = cv2.imread(frame_path)
-            try:
-                result = process_faces(source_face, frame, progress)
-                cv2.imwrite(frame_path, result)
-                if preview_callback:
-                    preview_callback(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))                
-            except Exception:
-                pass
-            progress.update(1)
-
-
-def process_img(source_img, target_path, output_file):
-    frame = cv2.imread(target_path)
-    face = get_face_single(frame)
-    source_face = get_face_single(cv2.imread(source_img))
-    result = get_face_swapper().get(frame, face, source_face, paste_back=True)
-    cv2.imwrite(output_file, result)
-    print("\n\nImage saved as:", output_file, "\n\n")
+
+import os
+from tqdm import tqdm
+import cv2
+import insightface
+import threading
+import roop.globals
+from roop.analyser import get_face_single, get_face_many
+
+FACE_SWAPPER = None
+THREAD_LOCK = threading.Lock()
+
+
+def get_face_swapper():  # Return the module-wide FACE_SWAPPER model, loading it on first use.
+    global FACE_SWAPPER
+    with THREAD_LOCK:  # serialize the check-and-load so concurrent callers create only one model
+        if FACE_SWAPPER is None:
+            model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx')  # resolved relative to this module's directory
+            FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers)
+    return FACE_SWAPPER
+
+
+def swap_face_in_frame(source_face, target_face, frame):  # Run the face-swapper model on frame for target_face/source_face; returns frame unchanged when target_face is falsy.
+    if target_face:
+        return get_face_swapper().get(frame, target_face, source_face, paste_back=True)
+    return frame
+
+
+def process_faces(source_face, target_frame):  # Apply swap_face_in_frame to every face from get_face_many (all_faces mode) or to the one from get_face_single; returns the resulting frame.
+    if roop.globals.all_faces:
+        many_faces = get_face_many(target_frame)
+        if many_faces:
+            for face in many_faces:
+                target_frame = swap_face_in_frame(source_face, face, target_frame)  # each swap's output is the input of the next
+    else:
+        face = get_face_single(target_frame)
+        if face:
+            target_frame = swap_face_in_frame(source_face, face, target_frame)
+    return target_frame
+
+
+def process_frames(source_face, frame_paths, progress):  # Run process_faces on each frame file in frame_paths, writing the result back to the same path.
+    for frame_path in frame_paths:
+        frame = cv2.imread(frame_path)
+        try:
+            result = process_faces(source_face, frame)
+            cv2.imwrite(frame_path, result)  # overwrite the input frame file with the processed frame
+        except Exception:  # NOTE(review): best-effort — any failure for this frame is swallowed without a message, which can hide real bugs
+            pass
+        progress.update(1)  # advanced once per frame, whether or not processing succeeded
+
+
+def multi_process_frame(source_face,frame_paths,progress):
+    """Split frame_paths into roop.globals.gpu_threads contiguous chunks, run process_frames on each chunk in its own thread, and wait for all of them."""
+    # calculate the number of frames each thread processes
+    num_threads = roop.globals.gpu_threads  # NOTE(review): must be a positive int (0 or None fails at the division below); presumably set from --gpu-threads — confirm
+    num_frames_per_thread = len(frame_paths) // num_threads
+    remaining_frames = len(frame_paths) % num_threads
+    
+    # initialize thread list
+    threads = []
+            
+    # create and start one thread per chunk; the leftover frames are handed out one each to the first chunks
+    start_index = 0
+    for _ in range(num_threads):
+        end_index = start_index + num_frames_per_thread
+        if remaining_frames > 0:
+            end_index += 1
+            remaining_frames -= 1
+        thread_frame_paths = frame_paths[start_index:end_index]
+        thread = threading.Thread(target=process_frames, args=(source_face, thread_frame_paths, progress))
+        threads.append(thread)
+        thread.start()
+        start_index = end_index
+
+    # wait for every worker thread to finish before returning
+    for thread in threads:
+        thread.join()
+
+
+def process_img(source_img, target_path, output_file):  # Run the face swapper on the image at target_path using the face from source_img, and write the result to output_file.
+    frame = cv2.imread(target_path)
+    face = get_face_single(frame)
+    source_face = get_face_single(cv2.imread(source_img))
+    result = get_face_swapper().get(frame, face, source_face, paste_back=True)  # NOTE(review): unlike swap_face_in_frame, face is not checked for being falsy — confirm this is intended
+    cv2.imwrite(output_file, result)
+    print("\n\nImage saved as:", output_file, "\n\n")
+
+
+def process_video(source_img, frame_paths, preview_callback):  # Process every frame file in frame_paths with the face from source_img; preview_callback is accepted for caller compatibility but is not used here.
+    source_face = get_face_single(cv2.imread(source_img))
+    progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
+    with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
+        if roop.globals.gpu_vendor is not None:
+            multi_process_frame(source_face,frame_paths,progress)  # GPU vendor set: fan the frames out across worker threads
+        else:
+            process_frames(source_face, frame_paths, progress)  # no GPU vendor: single-threaded; pass the face object, not the image path