diff --git a/roop/core.py b/roop/core.py
index e91ae99..0957d05 100755
--- a/roop/core.py
+++ b/roop/core.py
@@ -2,6 +2,9 @@
 
 import os
 import sys
+# Limiting OpenMP to a single thread doubles GPU-mode performance; OMP_NUM_THREADS must be set before torch is imported.
+if any(arg.startswith('--gpu-vendor=') for arg in sys.argv):
+    os.environ['OMP_NUM_THREADS'] = '1'
 import platform
 import signal
 import shutil
@@ -20,7 +23,6 @@ from roop.utils import is_img, detect_fps, set_fps, create_video, add_audio, ext
 from roop.analyser import get_face_single
 import roop.ui as ui
 
-
 signal.signal(signal.SIGINT, lambda signal_number, frame: quit())
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--face', help='use this face', dest='source_img')
@@ -57,6 +59,8 @@ sep = "/"
 if os.name == "nt":
     sep = "\\"
 
+POOL = None
+
 
 def limit_resources():
     if args['max_memory']:
@@ -140,12 +144,12 @@ def process_video_multi_cores(source_img, frame_paths):
     if n > 2:
         processes = []
         for i in range(0, len(frame_paths), n):
-            p = pool.apply_async(process_frames, args=(source_img, frame_paths[i:i+n],))
+            p = POOL.apply_async(process_video, args=(source_img, frame_paths[i:i + n],))
             processes.append(p)
         for p in processes:
             p.get()
-        pool.close()
-        pool.join()
+        POOL.close()
+        POOL.join()
 
 
 def start(preview_callback = None):
@@ -192,8 +196,8 @@ def start(preview_callback = None):
     ))
     status("swapping in progress...")
     if sys.platform != 'darwin' and roop.globals.gpu_vendor is None:
-        global pool
-        pool = mp.Pool(roop.globals.cpu_cores)
+        global POOL
+        POOL = mp.Pool(roop.globals.cpu_cores)
         process_video_multi_cores(args['source_img'], args['frame_paths'])
     else:
         process_video(args['source_img'], args["frame_paths"], preview_callback)
diff --git a/roop/swapper.py b/roop/swapper.py
index bc4730a..c5328a3 100644
--- a/roop/swapper.py
+++ b/roop/swapper.py
@@ -53,15 +53,11 @@ def process_frames(source_img, frame_paths, progress=None):
 
 
 def multi_process_frame(source_img, frame_paths, progress):
-
-    # caculate the number of frames each threads processed
+    threads = []
     num_threads = roop.globals.gpu_threads
     num_frames_per_thread = len(frame_paths) // num_threads
     remaining_frames = len(frame_paths) % num_threads
-    
-    # initialize thread list
-    threads = []
-            
+
     # create thread and launch
     start_index = 0
     for _ in range(num_threads):
@@ -92,7 +88,7 @@ def process_img(source_img, target_path, output_file):
 def process_video(source_img, frame_paths, preview_callback):
     progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
     with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
-        if roop.globals.gpu_vendor == "nvidia": # multi-threading breaks in AMD
+        if roop.globals.gpu_vendor is not None and roop.globals.gpu_threads > 0:
             multi_process_frame(source_img, frame_paths, progress)
         else:
             process_frames(source_img, frame_paths, progress)