diff --git a/README.md b/README.md
index 2c22d06..1416826 100644
--- a/README.md
+++ b/README.md
@@ -45,12 +45,12 @@ options:
   --all-faces           swap all faces in frame
   --max-memory MAX_MEMORY
                         maximum amount of RAM in GB to be used
-  --cpu-threads CPU_THREADS
-                        number of threads to be use for CPU mode
+  --cpu-cores CPU_CORES
+                        number of CPU cores to use
   --gpu-threads GPU_THREADS
-                        number of threads to be use for GPU moded
-  --gpu-vendor {amd,intel,nvidia}
-                        choice your gpu vendor
+                        number of threads to be use for the GPU
+  --gpu-vendor {apple,amd,intel,nvidia}
+                        choice your GPU vendor
 ```
 
 Looking for a CLI mode? Using the -f/--face argument will make the program in cli mode.
diff --git a/requirements.txt b/requirements.txt
index 0897eb9..bf185d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+
 numpy==1.23.5
 opencv-python==4.7.0.72
 onnx==1.14.0
@@ -5,7 +7,8 @@ insightface==0.7.3
 psutil==5.9.5
 tk==0.1.0
 pillow==9.5.0
-torch==2.0.1
+torch==2.0.1; sys_platform == 'darwin'
+torch==2.0.1+cu118; sys_platform != 'darwin'
 onnxruntime==1.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'
 onnxruntime-silicon==1.13.1; sys_platform == 'darwin' and platform_machine == 'arm64'
 onnxruntime-gpu==1.15.0; sys_platform != 'darwin'
@@ -13,5 +16,4 @@ tensorflow==2.13.0rc1; sys_platform == 'darwin'
 tensorflow==2.12.0; sys_platform != 'darwin'
 opennsfw2==0.10.2
 protobuf==4.23.2
-pynvml==11.5.0
 tqdm==4.65.0
\ No newline at end of file
diff --git a/roop/core.py b/roop/core.py
index 96eb1c7..307e761 100755
--- a/roop/core.py
+++ b/roop/core.py
@@ -2,6 +2,9 @@
 
 import os
 import sys
+# single thread doubles performance of gpu-mode - must be set before torch import ('--gpu-vendor X' and '--gpu-vendor=X' both match)
+if any(arg.startswith('--gpu-vendor') for arg in sys.argv):
+    os.environ['OMP_NUM_THREADS'] = '1'
 import platform
 import signal
 import shutil
@@ -20,7 +23,6 @@ from roop.utils import is_img, detect_fps, set_fps, create_video, add_audio, ext
 from roop.analyser import get_face_single
 import roop.ui as ui
 
-
 signal.signal(signal.SIGINT, lambda signal_number, frame: quit())
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--face', help='use this face', dest='source_img')
@@ -30,26 +32,31 @@ parser.add_argument('--keep-fps', help='maintain original fps', dest='keep_fps',
 parser.add_argument('--keep-frames', help='keep frames directory', dest='keep_frames', action='store_true', default=False)
 parser.add_argument('--all-faces', help='swap all faces in frame', dest='all_faces', action='store_true', default=False)
 parser.add_argument('--max-memory', help='maximum amount of RAM in GB to be used', dest='max_memory', type=int)
-parser.add_argument('--max-cores', help='number of cores to use at max', dest='max_cores', type=int, default=max(psutil.cpu_count() - 2, 2))
-parser.add_argument('--gpu-threads', help='number of threads to be use for GPU mode', dest='gpu_threads', type=int, default=4)
-parser.add_argument('--gpu-vendor', help='choice your gpu vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia'])
+parser.add_argument('--cpu-cores', help='number of CPU cores to use', dest='cpu_cores', type=int, default=max((psutil.cpu_count() or 1) // 2, 2))
+parser.add_argument('--gpu-threads', help='number of threads to be use for the GPU', dest='gpu_threads', type=int, default=4)
+parser.add_argument('--gpu-vendor', help='choice your GPU vendor', dest='gpu_vendor', choices=['apple', 'amd', 'intel', 'nvidia'])
 
-args = {}
-
-for name, value in vars(parser.parse_args()).items():
-    args[name] = value
+args = parser.parse_known_args()[0]
 
 if 'all_faces' in args:
     roop.globals.all_faces = True
 
-if args['max_cores']:
-    roop.globals.max_cores = args['max_cores']
+if args.cpu_cores is not None:  # keep an explicit 0 so `cpu_cores > 0` in start() compares ints, not None
+    roop.globals.cpu_cores = int(args.cpu_cores)
 
-if args['gpu_threads']:
-    roop.globals.gpu_threads = args['gpu_threads']
+# macOS: always force a single CPU core, overriding any --cpu-cores value
+if sys.platform == 'darwin':
+    roop.globals.cpu_cores = 1
 
-if args['gpu_vendor']:
-    roop.globals.gpu_vendor = args['gpu_vendor']
+if args.gpu_threads is not None:  # keep an explicit 0 so `gpu_threads > 0` in process_video compares ints, not None
+    roop.globals.gpu_threads = int(args.gpu_threads)
+
+# multi-threading breaks on AMD, so always force a single GPU thread
+if args.gpu_vendor == 'amd':
+    roop.globals.gpu_threads = 1
+
+if args.gpu_vendor:
+    roop.globals.gpu_vendor = args.gpu_vendor
 else:
     roop.globals.providers = ['CPUExecutionProvider']
 
@@ -59,8 +66,8 @@ if os.name == "nt":
 
 
 def limit_resources():
-    if args['max_memory']:
-        memory = args['max_memory'] * 1024 * 1024 * 1024
+    if args.max_memory:
+        memory = args.max_memory * 1024 * 1024 * 1024
         if str(platform.system()).lower() == 'windows':
             import ctypes
             kernel32 = ctypes.windll.kernel32
@@ -81,13 +88,13 @@ def pre_check():
     if roop.globals.gpu_vendor == 'apple':
         if 'CoreMLExecutionProvider' not in roop.globals.providers:
             quit("You are using --gpu=apple flag but CoreML isn't available or properly installed on your system.")
-    elif roop.globals.gpu_vendor == 'amd':
+    if roop.globals.gpu_vendor == 'amd':
         if 'ROCMExecutionProvider' not in roop.globals.providers:
             quit("You are using --gpu=amd flag but ROCM isn't available or properly installed on your system.")
-    elif roop.globals.gpu_vendor == 'nvidia':
+    if roop.globals.gpu_vendor == 'nvidia':
         CUDA_VERSION = torch.version.cuda
         CUDNN_VERSION = torch.backends.cudnn.version()
-        if not torch.cuda.is_available() or not CUDA_VERSION:
+        if not torch.cuda.is_available():
             quit("You are using --gpu=nvidia flag but CUDA isn't available or properly installed on your system.")
         if CUDA_VERSION > '11.8':
             quit(f"CUDA version {CUDA_VERSION} is not supported - please downgrade to 11.8")
@@ -97,8 +104,6 @@ def pre_check():
             quit(f"CUDNN version {CUDNN_VERSION} is not supported - please upgrade to 8.9.1")
         if CUDNN_VERSION > 8910:
             quit(f"CUDNN version {CUDNN_VERSION} is not supported - please downgrade to 8.9.1")
-    else:
-        roop.globals.providers = ['CPUExecutionProvider']
 
 
 def get_video_frame(video_path, frame_number = 1):
@@ -138,40 +143,40 @@ def status(string):
 
 
 def process_video_multi_cores(source_img, frame_paths):
-    n = len(frame_paths) // roop.globals.max_cores
+    n = max(len(frame_paths) // roop.globals.cpu_cores, 3)  # chunk size; min 3 so short videos still pass the `n > 2` guard
     if n > 2:
         processes = []
         for i in range(0, len(frame_paths), n):
-            p = pool.apply_async(process_frames, args=(source_img, frame_paths[i:i+n],))
+            p = POOL.apply_async(process_video, args=(source_img, frame_paths[i:i + n],))
             processes.append(p)
         for p in processes:
             p.get()
-        pool.close()
-        pool.join()
+        POOL.close()
+        POOL.join()
 
 
 def start(preview_callback = None):
-    if not args['source_img'] or not os.path.isfile(args['source_img']):
+    if not args.source_img or not os.path.isfile(args.source_img):
         print("\n[WARNING] Please select an image containing a face.")
         return
-    elif not args['target_path'] or not os.path.isfile(args['target_path']):
+    elif not args.target_path or not os.path.isfile(args.target_path):
         print("\n[WARNING] Please select a video/image to swap face in.")
         return
-    if not args['output_file']:
-        target_path = args['target_path']
-        args['output_file'] = rreplace(target_path, "/", "/swapped-", 1) if "/" in target_path else "swapped-" + target_path
-    target_path = args['target_path']
-    test_face = get_face_single(cv2.imread(args['source_img']))
+    if not args.output_file:
+        target_path = args.target_path
+        args.output_file = rreplace(target_path, "/", "/swapped-", 1) if "/" in target_path else "swapped-" + target_path
+    target_path = args.target_path
+    test_face = get_face_single(cv2.imread(args.source_img))
     if not test_face:
         print("\n[WARNING] No face detected in source image. Please try with another one.\n")
         return
     if is_img(target_path):
         if predict_image(target_path) > 0.85:
             quit()
-        process_img(args['source_img'], target_path, args['output_file'])
+        process_img(args.source_img, target_path, args.output_file)
         status("swap successful!")
         return
-    seconds, probabilities = predict_video_frames(video_path=args['target_path'], frame_interval=100)
+    seconds, probabilities = predict_video_frames(video_path=args.target_path, frame_interval=100)
     if any(probability > 0.85 for probability in probabilities):
         quit()
     video_name_full = target_path.split("/")[-1]
@@ -180,7 +185,7 @@ def start(preview_callback = None):
     Path(output_dir).mkdir(exist_ok=True)
     status("detecting video's FPS...")
     fps, exact_fps = detect_fps(target_path)
-    if not args['keep_fps'] and fps > 30:
+    if not args.keep_fps and fps > 30:
         this_path = output_dir + "/" + video_name + ".mp4"
         set_fps(target_path, this_path, 30)
         target_path, exact_fps = this_path, 30
@@ -188,33 +193,33 @@ def start(preview_callback = None):
         shutil.copy(target_path, output_dir)
     status("extracting frames...")
     extract_frames(target_path, output_dir)
-    args['frame_paths'] = tuple(sorted(
+    args.frame_paths = tuple(sorted(
         glob.glob(output_dir + "/*.png"),
         key=lambda x: int(x.split(sep)[-1].replace(".png", ""))
     ))
     status("swapping in progress...")
-    if sys.platform != 'darwin' and not args['gpu_vendor']:
-        global pool
-        pool = mp.Pool(roop.globals.max_cores)
-        process_video_multi_cores(args['source_img'], args['frame_paths'])
+    if roop.globals.gpu_vendor is None and roop.globals.cpu_cores > 0:
+        global POOL
+        POOL = mp.Pool(roop.globals.cpu_cores)
+        process_video_multi_cores(args.source_img, args.frame_paths)
     else:
-        process_video(args['source_img'], args["frame_paths"], preview_callback)
+        process_video(args.source_img, args.frame_paths)
     status("creating video...")
     create_video(video_name, exact_fps, output_dir)
     status("adding audio...")
-    add_audio(output_dir, target_path, video_name_full, args['keep_frames'], args['output_file'])
-    save_path = args['output_file'] if args['output_file'] else output_dir + "/" + video_name + ".mp4"
+    add_audio(output_dir, target_path, video_name_full, args.keep_frames, args.output_file)
+    save_path = args.output_file if args.output_file else output_dir + "/" + video_name + ".mp4"
     print("\n\nVideo saved as:", save_path, "\n\n")
     status("swap successful!")
 
 
 def select_face_handler(path: str):
-    args['source_img'] = path
+    args.source_img = path
 
 
 def select_target_handler(path: str):
-    args['target_path'] = path
-    return preview_video(args['target_path'])
+    args.target_path = path
+    return preview_video(args.target_path)
 
 
 def toggle_all_faces_handler(value: int):
@@ -222,21 +227,21 @@ def toggle_all_faces_handler(value: int):
 
 
 def toggle_fps_limit_handler(value: int):
-    args['keep_fps'] = int(value != 1)
+    args.keep_fps = int(value != 1)
 
 
 def toggle_keep_frames_handler(value: int):
-    args['keep_frames'] = value
+    args.keep_frames = value
 
 
 def save_file_handler(path: str):
-    args['output_file'] = path
+    args.output_file = path
 
 
 def create_test_preview(frame_number):
     return process_faces(
-        get_face_single(cv2.imread(args['source_img'])), 
-        get_video_frame(args['target_path'], frame_number)
+        get_face_single(cv2.imread(args.source_img)),
+        get_video_frame(args.target_path, frame_number)
     )
 
 
@@ -245,16 +250,16 @@ def run():
 
     pre_check()
     limit_resources()
-    if args['source_img']:
-        args['cli_mode'] = True
+    if args.source_img:
+        args.cli_mode = True
         start()
         quit()
 
     window = ui.init(
         {
             'all_faces': roop.globals.all_faces,
-            'keep_fps': args['keep_fps'],
-            'keep_frames': args['keep_frames']
+            'keep_fps': args.keep_fps,
+            'keep_frames': args.keep_frames
         },
         select_face_handler,
         select_target_handler,
diff --git a/roop/globals.py b/roop/globals.py
index da3cfac..986bf91 100644
--- a/roop/globals.py
+++ b/roop/globals.py
@@ -2,7 +2,7 @@ import onnxruntime
 
 all_faces = None
 log_level = 'error'
-cpu_threads = None
+cpu_cores = None
 gpu_threads = None
 gpu_vendor = None
 providers = onnxruntime.get_available_providers()
diff --git a/roop/swapper.py b/roop/swapper.py
index bc4730a..769a39e 100644
--- a/roop/swapper.py
+++ b/roop/swapper.py
@@ -53,15 +53,11 @@ def process_frames(source_img, frame_paths, progress=None):
 
 
 def multi_process_frame(source_img, frame_paths, progress):
-
-    # caculate the number of frames each threads processed
+    threads = []
     num_threads = roop.globals.gpu_threads
     num_frames_per_thread = len(frame_paths) // num_threads
     remaining_frames = len(frame_paths) % num_threads
-    
-    # initialize thread list
-    threads = []
-            
+
     # create thread and launch
     start_index = 0
     for _ in range(num_threads):
@@ -89,10 +85,10 @@ def process_img(source_img, target_path, output_file):
     print("\n\nImage saved as:", output_file, "\n\n")
 
 
-def process_video(source_img, frame_paths, preview_callback):
+def process_video(source_img, frame_paths):
     progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
     with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
-        if roop.globals.gpu_vendor == "nvidia": # multi-threading breaks in AMD
+        if roop.globals.gpu_vendor is not None and roop.globals.gpu_threads > 0:
             multi_process_frame(source_img, frame_paths, progress)
         else:
             process_frames(source_img, frame_paths, progress)