From 8734a6c2e0db4fff6b244251db9e519cfed7ae34 Mon Sep 17 00:00:00 2001 From: henryruhs Date: Sat, 3 Jun 2023 02:23:48 +0200 Subject: [PATCH] Follow ONNX_Runtime_Perf_Tuning and introduce new args --- README.md | 11 +++++---- requirements.txt | 1 + roop/core.py | 58 +++++++++++++++++++++--------------------------- roop/globals.py | 5 ++++- roop/swapper.py | 10 ++++++++- roop/utils.py | 4 ++-- 6 files changed, 48 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index af29800..2c22d06 100644 --- a/README.md +++ b/README.md @@ -40,14 +40,17 @@ options: replace this face -o OUTPUT_FILE, --output OUTPUT_FILE save output to this file - --gpu use gpu --keep-fps maintain original fps --keep-frames keep frames directory + --all-faces swap all faces in frame --max-memory MAX_MEMORY maximum amount of RAM in GB to be used - --max-cores CORES_COUNT - number of cores to be use for CPU mode - --all-faces swap all faces in frame + --cpu-threads CPU_THREADS + number of threads to be use for CPU mode + --gpu-threads GPU_THREADS + number of threads to be use for GPU mode + --gpu-vendor {amd,intel,nvidia} + choice your gpu vendor ``` Looking for a CLI mode? Using the -f/--face argument will make the program in cli mode. 
diff --git a/requirements.txt b/requirements.txt index eaccae0..3ffcf50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ tensorflow==2.12.0; sys_platform != 'darwin' opennsfw2==0.10.2 protobuf==4.23.2 tqdm==4.65.0 +threadpoolctl==3.1.0 \ No newline at end of file diff --git a/roop/core.py b/roop/core.py index 371191d..5b5612a 100644 --- a/roop/core.py +++ b/roop/core.py @@ -6,7 +6,6 @@ import sys import shutil import glob import argparse -import multiprocessing as mp import os import torch from pathlib import Path @@ -15,9 +14,9 @@ from tkinter import filedialog from opennsfw2 import predict_video_frames, predict_image from tkinter.filedialog import asksaveasfilename import webbrowser -import psutil import cv2 import threading +from threadpoolctl import threadpool_limits from PIL import Image, ImageTk import roop.globals @@ -28,30 +27,35 @@ from roop.analyser import get_face_single if 'ROCMExecutionProvider' in roop.globals.providers: del torch -pool = None -args = {} - signal.signal(signal.SIGINT, lambda signal_number, frame: quit()) parser = argparse.ArgumentParser() parser.add_argument('-f', '--face', help='use this face', dest='source_img') parser.add_argument('-t', '--target', help='replace this face', dest='target_path') parser.add_argument('-o', '--output', help='save output to this file', dest='output_file') -parser.add_argument('--gpu', help='choice your gpu vendor', dest='gpu', choices=['amd', 'nvidia']) parser.add_argument('--keep-fps', help='maintain original fps', dest='keep_fps', action='store_true', default=False) parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False) -parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', type=int) -parser.add_argument('--max-cores', help='number of cores to be use for CPU mode', dest='cores_count', type=int, default=max(psutil.cpu_count() - 2, 2)) parser.add_argument('--all-faces', help='swap all 
faces in frame', dest='all_faces', action='store_true', default=False) +parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int) +parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int) +parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int) +parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['amd', 'intel', 'nvidia']) +args = {} for name, value in vars(parser.parse_args()).items(): args[name] = value -if 'gpu' in args: - roop.globals.gpu = args['gpu'] - -if 'all-faces' in args: +if 'all_faces' in args: roop.globals.all_faces = True +if 'cpu_threads' in args and args['cpu_threads']: + roop.globals.cpu_threads = args['cpu_threads'] + +if 'gpu_threads' in args and args['gpu_threads']: + roop.globals.gpu_threads = args['gpu_threads'] + +if 'gpu_vendor' in args and args['gpu_vendor']: + roop.globals.gpu_vendor = args['gpu_vendor'] + sep = "/" if os.name == "nt": sep = "\\" @@ -77,10 +81,10 @@ def pre_check(): model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx') if not os.path.isfile(model_path): quit('File "inswapper_128.onnx" does not exist!') - if roop.globals.gpu == 'amd': + if roop.globals.gpu_vendor == 'amd': if 'ROCMExecutionProvider' not in roop.globals.providers: quit("You are using --gpu=amd flag but ROCM isn't available or properly installed on your system.") - if roop.globals.gpu == 'nvidia': + if roop.globals.gpu_vendor == 'nvidia': CUDA_VERSION = torch.version.cuda CUDNN_VERSION = torch.backends.cudnn.version() if not torch.cuda.is_available() or not CUDA_VERSION: @@ -98,22 +102,14 @@ def pre_check(): def start_processing(): - frame_paths = args["frame_paths"] - n = len(frame_paths) // (args['cores_count']) - # single thread - if roop.globals.gpu == 'amd' or roop.globals.gpu == 'nvidia' or n < 2: + 
# gpu mode + if roop.globals.gpu_vendor is not None: + process_video(args['source_img'], args["frame_paths"]) + return + # cpu mode + with threadpool_limits(limits=roop.globals.cpu_threads): process_video(args['source_img'], args["frame_paths"]) return - # multithread if total frames to cpu cores ratio is greater than 2 - if n > 2: - processes = [] - for i in range(0, len(frame_paths), n): - p = pool.apply_async(process_video, args=(args['source_img'], frame_paths[i:i+n],)) - processes.append(p) - for p in processes: - p.get() - pool.close() - pool.join() def preview_image(image_path): @@ -194,8 +190,6 @@ def start(): if not args['output_file']: target_path = args['target_path'] args['output_file'] = rreplace(target_path, "/", "/swapped-", 1) if "/" in target_path else "swapped-" + target_path - global pool - pool = mp.Pool(args['cores_count']) target_path = args['target_path'] test_face = get_face_single(cv2.imread(args['source_img'])) if not test_face: @@ -241,10 +235,8 @@ def start(): def run(): global all_faces, keep_frames, limit_fps, status_label, window - pre_check() limit_resources() - if args['source_img']: args['cli_mode'] = True start() @@ -291,4 +283,4 @@ def run(): status_label = tk.Label(window, width=580, justify="center", text="Status: waiting for input...", fg="#2ecc71", bg="#2d3436") status_label.place(x=10,y=640,width=580,height=30) - window.mainloop() \ No newline at end of file + window.mainloop() diff --git a/roop/globals.py b/roop/globals.py index 34adafd..1c1bc49 100644 --- a/roop/globals.py +++ b/roop/globals.py @@ -1,8 +1,11 @@ import onnxruntime +import psutil -gpu = None all_faces = False log_level = 'error' +cpu_threads = max(psutil.cpu_count() - 2, 2) +gpu_threads = 8 +gpu_vendor = None providers = onnxruntime.get_available_providers() if 'TensorrtExecutionProvider' in providers: diff --git a/roop/swapper.py b/roop/swapper.py index bfc4d63..90b6b3e 100644 --- a/roop/swapper.py +++ b/roop/swapper.py @@ -4,6 +4,7 @@ import cv2 import 
insightface import roop.globals from roop.analyser import get_face_single, get_face_many +import onnxruntime FACE_SWAPPER = None @@ -11,8 +12,15 @@ FACE_SWAPPER = None def get_face_swapper(): global FACE_SWAPPER if FACE_SWAPPER is None: + session_options = onnxruntime.SessionOptions() + if roop.globals.gpu_vendor is not None: + session_options.intra_op_num_threads = roop.globals.gpu_threads + session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL + else: + session_options.enable_cpu_mem_arena = True + session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx') - FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers) + FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers, session_options=session_options) return FACE_SWAPPER diff --git a/roop/utils.py b/roop/utils.py index a9b0d05..d63807c 100644 --- a/roop/utils.py +++ b/roop/utils.py @@ -43,13 +43,13 @@ def set_fps(input_path, output_path, fps): def create_video(video_name, fps, output_dir): - hwaccel_option = '-hwaccel cuda' if roop.globals.gpu == 'nvidia' else '' + hwaccel_option = '-hwaccel cuda' if roop.globals.gpu_vendor == 'nvidia' else '' output_dir = path(output_dir) run_ffmpeg(f'{hwaccel_option} -framerate "{fps}" -i "{output_dir}{sep}%04d.png" -c:v libx264 -crf 7 -pix_fmt yuv420p -y "{output_dir}{sep}output.mp4"') def extract_frames(input_path, output_dir): - hwaccel_option = '-hwaccel cuda' if roop.globals.gpu == 'nvidia' else '' + hwaccel_option = '-hwaccel cuda' if roop.globals.gpu_vendor == 'nvidia' else '' input_path, output_dir = path(input_path), path(output_dir) run_ffmpeg(f' {hwaccel_option} -i "{input_path}" "{output_dir}{sep}%04d.png"')