Roop-multi changed the implementation of multi-threading processing for nvidia GPU. (#317)

* changed the multi-thread implementation for nvidia gpu

* Update requirements.txt

* Add files via upload

* fix core.py and swapper.py

* fix core.py

* code clean

* code clean

* doubles performance of gpu-mode

---------

Co-authored-by: Moeblack <Moeblack@kuroinekorachi@gmail.com>
Co-authored-by: Somdev Sangwan <s0md3v@gmail.com>
This commit is contained in:
Pikachu~~~ 2023-06-04 19:49:27 +08:00 committed by GitHub
parent 160a16f4b5
commit f200b4c7b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 106 additions and 88 deletions

View File

@ -14,4 +14,4 @@ tensorflow==2.12.0; sys_platform != 'darwin'
opennsfw2==0.10.2 opennsfw2==0.10.2
protobuf==4.23.2 protobuf==4.23.2
pynvml==11.5.0 pynvml==11.5.0
tqdm==4.65.0 tqdm==4.65.0

View File

@ -1,5 +1,4 @@
import insightface import insightface
import onnxruntime
import roop.globals import roop.globals
FACE_ANALYSER = None FACE_ANALYSER = None
@ -8,12 +7,6 @@ FACE_ANALYSER = None
def get_face_analyser(): def get_face_analyser():
global FACE_ANALYSER global FACE_ANALYSER
if FACE_ANALYSER is None: if FACE_ANALYSER is None:
session_options = onnxruntime.SessionOptions()
if roop.globals.gpu_vendor is not None:
session_options.intra_op_num_threads = roop.globals.gpu_threads
else:
session_options.intra_op_num_threads = roop.globals.cpu_threads
session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.providers) FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.providers)
FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640)) FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640))
return FACE_ANALYSER return FACE_ANALYSER

View File

@ -10,6 +10,7 @@ import signal
import shutil import shutil
import glob import glob
import argparse import argparse
import psutil
import torch import torch
from pathlib import Path from pathlib import Path
from opennsfw2 import predict_video_frames, predict_image from opennsfw2 import predict_video_frames, predict_image
@ -33,11 +34,12 @@ parser.add_argument('--keep-fps', help='maintain original fps', dest='keep_fps',
parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False) parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False)
parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False) parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False)
parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int) parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int)
parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int) parser.add_argument('--cpu-threads', help='number of threads to be use for CPU mode', dest='cpu_threads', type=int, default=max(psutil.cpu_count() - 2, 2))
parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int) parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int, default=4)
parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia']) parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia'])
args = {} args = {}
for name, value in vars(parser.parse_args()).items(): for name, value in vars(parser.parse_args()).items():
args[name] = value args[name] = value
@ -218,8 +220,7 @@ def save_file_handler(path: str):
def create_test_preview(frame_number): def create_test_preview(frame_number):
return process_faces( return process_faces(
get_face_single(cv2.imread(args['source_img'])), get_face_single(cv2.imread(args['source_img'])),
get_video_frame(args['target_path'], frame_number), get_video_frame(args['target_path'], frame_number)
None
) )

View File

@ -1,10 +1,9 @@
import onnxruntime import onnxruntime
import psutil
all_faces = False all_faces = None
log_level = 'error' log_level = 'error'
cpu_threads = max(psutil.cpu_count() - 2, 2) cpu_threads = None
gpu_threads = 8 gpu_threads = None
gpu_vendor = None gpu_vendor = None
providers = onnxruntime.get_available_providers() providers = onnxruntime.get_available_providers()

View File

@ -1,72 +1,97 @@
import os
from tqdm import tqdm import os
import torch from tqdm import tqdm
import onnxruntime import cv2
import cv2 import insightface
import insightface import threading
import roop.globals
import roop.globals from roop.analyser import get_face_single, get_face_many
from roop.analyser import get_face_single, get_face_many
FACE_SWAPPER = None
FACE_SWAPPER = None THREAD_LOCK = threading.Lock()
def get_face_swapper(): def get_face_swapper():
global FACE_SWAPPER global FACE_SWAPPER
if FACE_SWAPPER is None: with THREAD_LOCK:
session_options = onnxruntime.SessionOptions() if FACE_SWAPPER is None:
if roop.globals.gpu_vendor is not None: model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx')
session_options.intra_op_num_threads = roop.globals.gpu_threads FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers)
else: return FACE_SWAPPER
session_options.intra_op_num_threads = roop.globals.cpu_threads
session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
model_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../inswapper_128.onnx') def swap_face_in_frame(source_face, target_face, frame):
FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.providers, session_options=session_options) if target_face:
return FACE_SWAPPER return get_face_swapper().get(frame, target_face, source_face, paste_back=True)
return frame
def swap_face_in_frame(source_face, target_face, frame):
if target_face: def process_faces(source_face, target_frame):
return get_face_swapper().get(frame, target_face, source_face, paste_back=True) if roop.globals.all_faces:
return frame many_faces = get_face_many(target_frame)
if many_faces:
for face in many_faces:
def process_faces(source_face, target_frame, progress): target_frame = swap_face_in_frame(source_face, face, target_frame)
if roop.globals.all_faces: else:
many_faces = get_face_many(target_frame) face = get_face_single(target_frame)
if many_faces: if face:
for face in many_faces: target_frame = swap_face_in_frame(source_face, face, target_frame)
target_frame = swap_face_in_frame(source_face, face, target_frame) return target_frame
else:
face = get_face_single(target_frame)
if face: def process_frames(source_face, frame_paths, progress):
target_frame = swap_face_in_frame(source_face, face, target_frame) for frame_path in frame_paths:
return target_frame frame = cv2.imread(frame_path)
try:
result = process_faces(source_face, frame)
def process_video(source_img, frame_paths, preview_callback): cv2.imwrite(frame_path, result)
source_face = get_face_single(cv2.imread(source_img)) except Exception:
progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]' pass
progress.update(1)
with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
for frame_path in frame_paths:
if roop.globals.gpu_vendor == 'nvidia': def multi_process_frame(source_face,frame_paths,progress):
progress.set_postfix(cuda_utilization="{:02d}%".format(torch.cuda.utilization()), cuda_memory="{:02d}GB".format(torch.cuda.memory_usage()))
frame = cv2.imread(frame_path) # calculate the number of frames each thread processes
try: num_threads = roop.globals.gpu_threads
result = process_faces(source_face, frame, progress) num_frames_per_thread = len(frame_paths) // num_threads
cv2.imwrite(frame_path, result) remaining_frames = len(frame_paths) % num_threads
if preview_callback:
preview_callback(cv2.cvtColor(result, cv2.COLOR_BGR2RGB)) # initialize thread list
except Exception: threads = []
pass
progress.update(1) # create thread and launch
start_index = 0
for _ in range(num_threads):
def process_img(source_img, target_path, output_file): end_index = start_index + num_frames_per_thread
frame = cv2.imread(target_path) if remaining_frames > 0:
face = get_face_single(frame) end_index += 1
source_face = get_face_single(cv2.imread(source_img)) remaining_frames -= 1
result = get_face_swapper().get(frame, face, source_face, paste_back=True) thread_frame_paths = frame_paths[start_index:end_index]
cv2.imwrite(output_file, result) thread = threading.Thread(target=process_frames, args=(source_face, thread_frame_paths, progress))
print("\n\nImage saved as:", output_file, "\n\n") threads.append(thread)
thread.start()
start_index = end_index
# threading
for thread in threads:
thread.join()
def process_img(source_img, target_path, output_file):
    """Swap the face from ``source_img`` into a single target image.

    Reads the target image at ``target_path``, looks up one face in it and
    one face in the image at ``source_img`` via ``get_face_single``, runs the
    face swapper with ``paste_back=True``, writes the result to
    ``output_file`` and prints a confirmation message.
    """
    target_frame = cv2.imread(target_path)
    target_face = get_face_single(target_frame)
    source_face = get_face_single(cv2.imread(source_img))
    swapper = get_face_swapper()
    swapped = swapper.get(target_frame, target_face, source_face, paste_back=True)
    cv2.imwrite(output_file, swapped)
    print("\n\nImage saved as:", output_file, "\n\n")
def process_video(source_img, frame_paths, preview_callback):
    """Swap the source face into every frame file listed in ``frame_paths``.

    The face is detected once from the image at ``source_img`` and reused for
    all frames. A tqdm progress bar sized to ``len(frame_paths)`` is shared
    with the worker function(s), which advance it once per frame.

    When ``roop.globals.gpu_vendor`` is set, the frames are handed to
    ``multi_process_frame``; otherwise they are processed by a single call
    to ``process_frames``.

    ``preview_callback`` is accepted for interface compatibility but is not
    invoked by this implementation.
    """
    source_face = get_face_single(cv2.imread(source_img))
    progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
    with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
        if roop.globals.gpu_vendor is not None:
            multi_process_frame(source_face, frame_paths, progress)
        else:
            # process_frames expects the detected face object, not the image
            # path. Passing the path made every swap fail inside its
            # try/except, so frames were silently left unchanged in CPU mode.
            process_frames(source_face, frame_paths, progress)