Roop-multi changed the implementation of multi-threading processing for nvidia GPU. (#317)

* changed the multi-thread implementation for nvidia gpu

* Update requirements.txt

* Add files via upload

* fix core.py and swapper.py

* fix core.py

* code clean

* code clean

* doubles performance of gpu-mode

---------

Co-authored-by: Moeblack <Moeblack@kuroinekorachi@gmail.com>
Co-authored-by: Somdev Sangwan <s0md3v@gmail.com>
This commit is contained in:
Pikachu~~~ 2023-06-04 19:49:27 +08:00 committed by GitHub
parent 160a16f4b5
commit f200b4c7b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 106 additions and 88 deletions

View File

@ -1,5 +1,4 @@
import insightface import insightface
import onnxruntime
import roop.globals import roop.globals
FACE_ANALYSER = None FACE_ANALYSER = None
@ -8,12 +7,6 @@ FACE_ANALYSER = None
def get_face_analyser(): def get_face_analyser():
global FACE_ANALYSER global FACE_ANALYSER
if FACE_ANALYSER is None: if FACE_ANALYSER is None:
session_options = onnxruntime.SessionOptions()
if roop.globals.gpu_vendor is not None:
session_options.intra_op_num_threads = roop.globals.gpu_threads
else:
session_options.intra_op_num_threads = roop.globals.cpu_threads
session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.providers) FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.providers)
FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640)) FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640))
return FACE_ANALYSER return FACE_ANALYSER

View File

@ -10,6 +10,7 @@ import signal
import shutil import shutil
import glob import glob
import argparse import argparse
import psutil
import torch import torch
from pathlib import Path from pathlib import Path
from opennsfw2 import predict_video_frames, predict_image from opennsfw2 import predict_video_frames, predict_image
@ -33,11 +34,12 @@ parser.add_argument('--keep-fps', help='maintain original fps', dest='keep_fps',
parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False) parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False)
parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False) parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False)
parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int) parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int)
parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int) parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int, default=max(psutil.cpu_count() - 2, 2))
parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int) parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int, default=4)
parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia']) parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia'])
args = {} args = {}
for name, value in vars(parser.parse_args()).items(): for name, value in vars(parser.parse_args()).items():
args[name] = value args[name] = value
@ -218,8 +220,7 @@ def save_file_handler(path: str):
def create_test_preview(frame_number): def create_test_preview(frame_number):
return process_faces( return process_faces(
get_face_single(cv2.imread(args['source_img'])), get_face_single(cv2.imread(args['source_img'])),
get_video_frame(args['target_path'], frame_number), get_video_frame(args['target_path'], frame_number)
None
) )

View File

@ -1,10 +1,9 @@
import onnxruntime import onnxruntime
import psutil
all_faces = False all_faces = None
log_level = 'error' log_level = 'error'
cpu_threads = max(psutil.cpu_count() - 2, 2) cpu_threads = None
gpu_threads = 8 gpu_threads = None
gpu_vendor = None gpu_vendor = None
providers = onnxruntime.get_available_providers() providers = onnxruntime.get_available_providers()

View File

@ -1,27 +1,22 @@
import os import os
from tqdm import tqdm from tqdm import tqdm
import torch
import onnxruntime
import cv2 import cv2
import insightface import insightface
import threading
import roop.globals import roop.globals
from roop.analyser import get_face_single, get_face_many from roop.analyser import get_face_single, get_face_many
FACE_SWAPPER = None FACE_SWAPPER = None
THREAD_LOCK = threading.Lock()
def get_face_swapper(): def get_face_swapper():
global FACE_SWAPPER global FACE_SWAPPER
with THREAD_LOCK:
if FACE_SWAPPER is None: if FACE_SWAPPER is None:
session_options = onnxruntime.SessionOptions()
if roop.globals.gpu_vendor is not None:
session_options.intra_op_num_threads = roop.globals.gpu_threads
else:
session_options.intra_op_num_threads = roop.globals.cpu_threads
session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx') model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx')
FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers, session_options=session_options) FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers)
return FACE_SWAPPER return FACE_SWAPPER
@ -31,7 +26,7 @@ def swap_face_in_frame(source_face, target_face, frame):
return frame return frame
def process_faces(source_face, target_frame, progress): def process_faces(source_face, target_frame):
if roop.globals.all_faces: if roop.globals.all_faces:
many_faces = get_face_many(target_frame) many_faces = get_face_many(target_frame)
if many_faces: if many_faces:
@ -44,25 +39,45 @@ def process_faces(source_face, target_frame, progress):
return target_frame return target_frame
def process_video(source_img, frame_paths, preview_callback): def process_frames(source_face, frame_paths, progress):
source_face = get_face_single(cv2.imread(source_img))
progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
for frame_path in frame_paths: for frame_path in frame_paths:
if roop.globals.gpu_vendor == 'nvidia':
progress.set_postfix(cuda_utilization="{:02d}%".format(torch.cuda.utilization()), cuda_memory="{:02d}GB".format(torch.cuda.memory_usage()))
frame = cv2.imread(frame_path) frame = cv2.imread(frame_path)
try: try:
result = process_faces(source_face, frame, progress) result = process_faces(source_face, frame)
cv2.imwrite(frame_path, result) cv2.imwrite(frame_path, result)
if preview_callback:
preview_callback(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
except Exception: except Exception:
pass pass
progress.update(1) progress.update(1)
def multi_process_frame(source_face, frame_paths, progress):
    """Split ``frame_paths`` into contiguous chunks and process them on worker threads.

    Each worker runs ``process_frames`` on its own slice of ``frame_paths``;
    the same ``source_face`` and ``progress`` objects are handed to every
    worker. The call blocks until all workers have finished.

    Args:
        source_face: Face object forwarded unchanged to ``process_frames``.
        frame_paths: Sequence of frame image paths to process.
        progress: Progress reporter forwarded unchanged to ``process_frames``.
    """
    total_frames = len(frame_paths)
    if total_frames == 0:
        # Nothing to do; avoid spawning idle threads.
        return

    # roop.globals.gpu_threads may be unset (None), zero or negative. Always
    # use at least one worker (prevents a division by zero below) and never
    # more workers than there are frames (prevents empty slices).
    requested_threads = roop.globals.gpu_threads or 1
    num_threads = max(1, min(requested_threads, total_frames))

    # Calculate the number of frames each thread processes; the first
    # ``remaining_frames`` workers take one extra frame each.
    num_frames_per_thread, remaining_frames = divmod(total_frames, num_threads)

    # Create and launch the worker threads.
    threads = []
    start_index = 0
    for worker_index in range(num_threads):
        end_index = start_index + num_frames_per_thread
        if worker_index < remaining_frames:
            end_index += 1
        thread_frame_paths = frame_paths[start_index:end_index]
        thread = threading.Thread(
            target=process_frames,
            args=(source_face, thread_frame_paths, progress),
        )
        threads.append(thread)
        thread.start()
        start_index = end_index

    # Wait for every worker to finish before returning.
    for thread in threads:
        thread.join()
def process_img(source_img, target_path, output_file): def process_img(source_img, target_path, output_file):
frame = cv2.imread(target_path) frame = cv2.imread(target_path)
face = get_face_single(frame) face = get_face_single(frame)
@ -70,3 +85,13 @@ def process_img(source_img, target_path, output_file):
result = get_face_swapper().get(frame, face, source_face, paste_back=True) result = get_face_swapper().get(frame, face, source_face, paste_back=True)
cv2.imwrite(output_file, result) cv2.imwrite(output_file, result)
print("\n\nImage saved as:", output_file, "\n\n") print("\n\nImage saved as:", output_file, "\n\n")
def process_video(source_img, frame_paths, preview_callback):
    """Swap the face from ``source_img`` into every frame in ``frame_paths``.

    The source face is detected once and reused for all frames. When
    ``roop.globals.gpu_vendor`` is set, the frames are processed through
    ``multi_process_frame``; otherwise they are processed sequentially by
    ``process_frames``. A tqdm progress bar tracks the work.

    Args:
        source_img: Path of the image containing the face to swap in.
        frame_paths: Paths of the extracted video frames; passed on to the
            frame-processing functions.
        preview_callback: Accepted for interface compatibility with callers;
            this implementation does not invoke it.
    """
    source_face = get_face_single(cv2.imread(source_img))
    progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
    with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
        if roop.globals.gpu_vendor is not None:
            multi_process_frame(source_face, frame_paths, progress)
        else:
            # process_frames expects the detected face, not the image path.
            # Passing the path made every swap fail inside its try/except,
            # leaving all frames unmodified in CPU mode.
            process_frames(source_face, frame_paths, progress)