From 8734a6c2e0db4fff6b244251db9e519cfed7ae34 Mon Sep 17 00:00:00 2001
From: henryruhs <info@henryruhs.com>
Date: Sat, 3 Jun 2023 02:23:48 +0200
Subject: [PATCH] Follow ONNX_Runtime_Perf_Tuning and introduce new args

---
 README.md        | 11 +++++----
 requirements.txt |  1 +
 roop/core.py     | 58 +++++++++++++++++++++---------------------------
 roop/globals.py  |  5 ++++-
 roop/swapper.py  | 10 ++++++++-
 roop/utils.py    |  4 ++--
 6 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index af29800..2c22d06 100644
--- a/README.md
+++ b/README.md
@@ -40,14 +40,17 @@ options:
                         replace this face
   -o OUTPUT_FILE, --output OUTPUT_FILE
                         save output to this file
-  --gpu                 use gpu
   --keep-fps            maintain original fps
   --keep-frames         keep frames directory
+  --all-faces           swap all faces in frame
   --max-memory MAX_MEMORY
                         maximum amount of RAM in GB to be used
-  --max-cores CORES_COUNT
-                        number of cores to be use for CPU mode
-  --all-faces           swap all faces in frame
+  --cpu-threads CPU_THREADS
+                        number of threads to be use for CPU mode
+  --gpu-threads GPU_THREADS
+                        number of threads to be use for GPU mode
+  --gpu-vendor {amd,intel,nvidia}
+                        choice your gpu vendor
 ```
 
 Looking for a CLI mode? Using the -f/--face argument will make the program in cli mode.
diff --git a/requirements.txt b/requirements.txt
index eaccae0..3ffcf50 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ tensorflow==2.12.0; sys_platform != 'darwin'
 opennsfw2==0.10.2
 protobuf==4.23.2
 tqdm==4.65.0
+threadpoolctl==3.1.0
\ No newline at end of file
diff --git a/roop/core.py b/roop/core.py
index 371191d..5b5612a 100644
--- a/roop/core.py
+++ b/roop/core.py
@@ -6,7 +6,6 @@ import sys
 import shutil
 import glob
 import argparse
-import multiprocessing as mp
 import os
 import torch
 from pathlib import Path
@@ -15,9 +14,9 @@ from tkinter import filedialog
 from opennsfw2 import predict_video_frames, predict_image
 from tkinter.filedialog import asksaveasfilename
 import webbrowser
-import psutil
 import cv2
 import threading
+from threadpoolctl import threadpool_limits
 from PIL import Image, ImageTk
 
 import roop.globals
@@ -28,30 +27,35 @@ from roop.analyser import get_face_single
 if 'ROCMExecutionProvider' in roop.globals.providers:
     del torch
 
-pool = None
-args = {}
-
 signal.signal(signal.SIGINT, lambda signal_number, frame: quit())
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--face', help='use this face', dest='source_img')
 parser.add_argument('-t', '--target', help='replace this face', dest='target_path')
 parser.add_argument('-o', '--output', help='save output to this file', dest='output_file')
-parser.add_argument('--gpu', help='choice your gpu vendor', dest='gpu', choices=['amd', 'nvidia'])
 parser.add_argument('--keep-fps', help='maintain original fps', dest='keep_fps', action='store_true', default=False)
 parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False)
-parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', type=int)
-parser.add_argument('--max-cores', help='number of cores to be use for CPU mode', dest='cores_count', type=int, default=max(psutil.cpu_count() - 2, 2))
 parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False)
+parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int)
+parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int)
+parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int)
+parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['amd', 'intel', 'nvidia'])
 
+args = {}
 for name, value in vars(parser.parse_args()).items():
     args[name] = value
 
-if 'gpu' in args:
-    roop.globals.gpu = args['gpu']
-
-if 'all-faces' in args:
+if 'all_faces' in args and args['all_faces']:  # key always exists after parse_args(); test its value
+    roop.globals.all_faces = True
 
+if 'cpu_threads' in args and args['cpu_threads']:
+    roop.globals.cpu_threads = args['cpu_threads']
+
+if 'gpu_threads' in args and args['gpu_threads']:
+    roop.globals.gpu_threads = args['gpu_threads']
+
+if 'gpu_vendor' in args and args['gpu_vendor']:
+    roop.globals.gpu_vendor = args['gpu_vendor']
+
 sep = "/"
 if os.name == "nt":
     sep = "\\"
@@ -77,10 +81,10 @@ def pre_check():
     model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx')
     if not os.path.isfile(model_path):
         quit('File "inswapper_128.onnx" does not exist!')
-    if roop.globals.gpu == 'amd':
+    if roop.globals.gpu_vendor == 'amd':
         if 'ROCMExecutionProvider' not in roop.globals.providers:
             quit("You are using --gpu=amd flag but ROCM isn't available or properly installed on your system.")
-    if roop.globals.gpu == 'nvidia':
+    if roop.globals.gpu_vendor == 'nvidia':
         CUDA_VERSION = torch.version.cuda
         CUDNN_VERSION = torch.backends.cudnn.version()
         if not torch.cuda.is_available() or not CUDA_VERSION:
@@ -98,22 +102,14 @@ def pre_check():
 
 
 def start_processing():
-    frame_paths = args["frame_paths"]
-    n = len(frame_paths) // (args['cores_count'])
-    # single thread
-    if roop.globals.gpu == 'amd' or roop.globals.gpu == 'nvidia' or n < 2:
+    # gpu mode
+    if roop.globals.gpu_vendor is not None:
+        process_video(args['source_img'], args["frame_paths"])
+        return
+    # cpu mode
+    with threadpool_limits(limits=roop.globals.cpu_threads):
         process_video(args['source_img'], args["frame_paths"])
         return
-    # multithread if total frames to cpu cores ratio is greater than 2
-    if n > 2:
-        processes = []
-        for i in range(0, len(frame_paths), n):
-            p = pool.apply_async(process_video, args=(args['source_img'], frame_paths[i:i+n],))
-            processes.append(p)
-        for p in processes:
-            p.get()
-        pool.close()
-        pool.join()
 
 
 def preview_image(image_path):
@@ -194,8 +190,6 @@ def start():
     if not args['output_file']:
         target_path = args['target_path']
         args['output_file'] = rreplace(target_path, "/", "/swapped-", 1) if "/" in target_path else "swapped-" + target_path
-    global pool
-    pool = mp.Pool(args['cores_count'])
     target_path = args['target_path']
     test_face = get_face_single(cv2.imread(args['source_img']))
     if not test_face:
@@ -241,10 +235,8 @@ def start():
 
 def run():
     global all_faces, keep_frames, limit_fps, status_label, window
-
     pre_check()
     limit_resources()
-
     if args['source_img']:
         args['cli_mode'] = True
         start()
@@ -291,4 +283,4 @@ def run():
     status_label = tk.Label(window, width=580, justify="center", text="Status: waiting for input...", fg="#2ecc71", bg="#2d3436")
     status_label.place(x=10,y=640,width=580,height=30)
 
-    window.mainloop()
\ No newline at end of file
+    window.mainloop()
diff --git a/roop/globals.py b/roop/globals.py
index 34adafd..1c1bc49 100644
--- a/roop/globals.py
+++ b/roop/globals.py
@@ -1,8 +1,11 @@
 import onnxruntime
+import psutil
 
-gpu = None
 all_faces = False
 log_level = 'error'
+cpu_threads = max(psutil.cpu_count() - 2, 2)
+gpu_threads = 8
+gpu_vendor = None
 providers = onnxruntime.get_available_providers()
 
 if 'TensorrtExecutionProvider' in providers:
diff --git a/roop/swapper.py b/roop/swapper.py
index bfc4d63..90b6b3e 100644
--- a/roop/swapper.py
+++ b/roop/swapper.py
@@ -4,6 +4,7 @@ import cv2
 import insightface
 import roop.globals
 from roop.analyser import get_face_single, get_face_many
+import onnxruntime
 
 FACE_SWAPPER = None
 
@@ -11,8 +12,15 @@ FACE_SWAPPER = None
 def get_face_swapper():
     global FACE_SWAPPER
     if FACE_SWAPPER is None:
+        session_options = onnxruntime.SessionOptions()
+        if roop.globals.gpu_vendor is not None:
+            session_options.intra_op_num_threads = roop.globals.gpu_threads
+            session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
+        else:
+            session_options.enable_cpu_mem_arena = True
+        session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx')
-        FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers)
+        FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers, session_options=session_options)
     return FACE_SWAPPER
 
 
diff --git a/roop/utils.py b/roop/utils.py
index a9b0d05..d63807c 100644
--- a/roop/utils.py
+++ b/roop/utils.py
@@ -43,13 +43,13 @@ def set_fps(input_path, output_path, fps):
 
 
 def create_video(video_name, fps, output_dir):
-    hwaccel_option = '-hwaccel cuda' if roop.globals.gpu == 'nvidia' else ''
+    hwaccel_option = '-hwaccel cuda' if roop.globals.gpu_vendor == 'nvidia' else ''
     output_dir = path(output_dir)
     run_ffmpeg(f'{hwaccel_option} -framerate "{fps}" -i "{output_dir}{sep}%04d.png" -c:v libx264 -crf 7 -pix_fmt yuv420p -y "{output_dir}{sep}output.mp4"')
 
 
 def extract_frames(input_path, output_dir):
-    hwaccel_option = '-hwaccel cuda' if roop.globals.gpu == 'nvidia' else ''
+    hwaccel_option = '-hwaccel cuda' if roop.globals.gpu_vendor == 'nvidia' else ''
     input_path, output_dir = path(input_path), path(output_dir)
     run_ffmpeg(f' {hwaccel_option} -i "{input_path}" "{output_dir}{sep}%04d.png"')