Added subsampling

Upgraded gradio
C0untFloyd committed Jul 14, 2024
1 parent 4236385 commit 126fd69
Showing 16 changed files with 109 additions and 50 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -59,6 +59,12 @@ Additional commandline arguments are currently unsupported and settings should b

### Changelog

**14.07.2024** v4.1.0

- Added subsample upscaling to increase swap resolution
- Upgraded gradio


**22.04.2024** v3.9.0

- Bugfix: Face detection bounding box corrupt values at weird angles
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/cu118

numpy==1.26.4
-gradio==4.32.1
+gradio==4.38.1
opencv-python==4.9.0.80
onnx==1.16.0
insightface==0.7.3
75 changes: 61 additions & 14 deletions roop/ProcessMgr.py
@@ -6,7 +6,7 @@
from enum import Enum
from roop.ProcessOptions import ProcessOptions

-from roop.face_util import get_first_face, get_all_faces, rotate_image_180, rotate_anticlockwise, rotate_clockwise, clamp_cut_values
+from roop.face_util import get_first_face, get_all_faces, rotate_anticlockwise, rotate_clockwise, clamp_cut_values
from roop.utilities import compute_cosine_distance, get_device, str_to_class
import roop.vr_util as vr

@@ -20,6 +20,7 @@
import roop.globals



# Poor man's enum to be able to compare to int
class eNoFaceAction():
USE_ORIGINAL_FRAME = 0
@@ -44,6 +45,7 @@ def pick_queue(queue: Queue[str], queue_per_future: int) -> List[str]:
return queues



class ProcessMgr():
input_face_datas = []
target_face_datas = []
@@ -317,11 +319,6 @@ def update_progress(self, progress: Any = None) -> None:
self.progress_gradio((progress.n, self.total_frames), desc='Processing', total=self.total_frames, unit='frames')


-# https://github.com/deepinsight/insightface#third-party-re-implementation-of-arcface
-# https://github.com/deepinsight/insightface/blob/master/alignment/coordinate_reg/image_infer.py
-# https://github.com/deepinsight/insightface/issues/1350
-# https://github.com/linghu8812/tensorrt_inference


def process_frame(self, frame:Frame):
if len(self.input_face_datas) < 1 and not self.options.show_face_masking:
@@ -541,17 +538,30 @@ def process_face(self,face_index, target_face:Face, frame:Frame):

# img = vr.GetPerspective(frame, 90, theta, phi, 1280, 1280) # Generate perspective image

-fake_frame = None
-aligned_img, M = align_crop(frame, target_face.kps, 128)
+""" Code ported/adapted from FaceFusion, which borrowed the idea from Rope:
+the aligned face crop is subsampled into model-sized slices, each slice is swapped
+individually, and the slices are reassembled at the desired output resolution.
+This works around the swapper's native resolution limit without using enhancers.
+"""
+model_output_size = 128
+subsample_size = self.options.subsample_size
+subsample_total = subsample_size // model_output_size
+aligned_img, M = align_crop(frame, target_face.kps, subsample_size)
+
+fake_frame = aligned_img
swap_frame = aligned_img
target_face.matrix = M

for p in self.processors:
if p.type == 'swap':
if inputface is not None:
+swap_result_frames = []
+subsample_frames = self.implode_pixel_boost(aligned_img, model_output_size, subsample_total)
+for sliced_frame in subsample_frames:
for _ in range(0,self.options.num_swap_steps):
-swap_frame = p.Run(inputface, target_face, swap_frame)
-fake_frame = swap_frame
+sliced_frame = self.prepare_crop_frame(sliced_frame)
+sliced_frame = p.Run(inputface, target_face, sliced_frame)
+sliced_frame = self.normalize_swap_frame(sliced_frame)
+swap_result_frames.append(sliced_frame)
+fake_frame = self.explode_pixel_boost(swap_result_frames, model_output_size, subsample_total, subsample_size)
scale_factor = 0.0
elif p.type == 'mask':
fake_frame = self.process_mask(p, aligned_img, fake_frame)
@@ -560,8 +570,8 @@ def process_face(self,face_index, target_face:Face, frame:Frame):

upscale = 512
orig_width = fake_frame.shape[1]

-fake_frame = cv2.resize(fake_frame, (upscale, upscale), cv2.INTER_CUBIC)
+if orig_width != upscale:
+fake_frame = cv2.resize(fake_frame, (upscale, upscale), cv2.INTER_CUBIC)
mask_offsets = (0,0,0,0,1,20) if inputface is None else inputface.mask_offsets


@@ -673,6 +683,43 @@ def blur_area(self, img_matte, num_erosion_iterations, blur_amount):
return cv2.GaussianBlur(img_matte, blur_size, 0)


def prepare_crop_frame(self, swap_frame):
model_type = 'inswapper'
model_mean = [0.0, 0.0, 0.0]
model_standard_deviation = [1.0, 1.0, 1.0]

if model_type == 'ghost':
swap_frame = swap_frame[:, :, ::-1] / 127.5 - 1
else:
swap_frame = swap_frame[:, :, ::-1] / 255.0
swap_frame = (swap_frame - model_mean) / model_standard_deviation
swap_frame = swap_frame.transpose(2, 0, 1)
swap_frame = np.expand_dims(swap_frame, axis = 0).astype(np.float32)
return swap_frame


def normalize_swap_frame(self, swap_frame):
model_type = 'inswapper'
swap_frame = swap_frame.transpose(1, 2, 0)

if model_type == 'ghost':
swap_frame = (swap_frame * 127.5 + 127.5).round()
else:
swap_frame = (swap_frame * 255.0).round()
swap_frame = swap_frame[:, :, ::-1]
return swap_frame

def implode_pixel_boost(self, aligned_face_frame, model_size, pixel_boost_total : int):
subsample_frame = aligned_face_frame.reshape(model_size, pixel_boost_total, model_size, pixel_boost_total, 3)
subsample_frame = subsample_frame.transpose(1, 3, 0, 2, 4).reshape(pixel_boost_total ** 2, model_size, model_size, 3)
return subsample_frame


def explode_pixel_boost(self, subsample_frame, model_size, pixel_boost_total, pixel_boost_size):
final_frame = np.stack(subsample_frame, axis = 0).reshape(pixel_boost_total, pixel_boost_total, model_size, model_size, 3)
final_frame = final_frame.transpose(2, 0, 3, 1, 4).reshape(pixel_boost_size, pixel_boost_size, 3)
return final_frame

def process_mask(self, processor, frame:Frame, target:Frame):
img_mask = processor.Run(frame, self.options.masking_text)
img_mask = cv2.resize(img_mask, (target.shape[1], target.shape[0]))
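Editor's note: the subsample ("pixel boost") helpers above are easiest to verify in isolation. Below is a minimal sketch that replays implode_pixel_boost → prepare_crop_frame → normalize_swap_frame → explode_pixel_boost with an identity stand-in for the swap model; the 4x4 slice count and random image are illustrative only, not part of the commit.

```python
import numpy as np

model_size, total = 128, 4                      # 4x4 slices ~ a subsample_size of 512
size = model_size * total
aligned = np.random.randint(0, 256, (size, size, 3), dtype=np.uint8)

# implode_pixel_boost: slice k = j*total + l is the strided subsample aligned[j::total, l::total],
# so every slice is a full-face image at the model's native 128px resolution
tiles = aligned.reshape(model_size, total, model_size, total, 3)
tiles = tiles.transpose(1, 3, 0, 2, 4).reshape(total ** 2, model_size, model_size, 3)
assert np.array_equal(tiles[0], aligned[0::total, 0::total])

swapped = []
for tile in tiles:
    # prepare_crop_frame: BGR->RGB, scale to [0,1], HWC -> NCHW float32
    blob = np.expand_dims((tile[:, :, ::-1] / 255.0).transpose(2, 0, 1), 0).astype(np.float32)
    out = blob[0]                               # identity stand-in for p.Run(...)
    # normalize_swap_frame: CHW -> HWC, back to [0,255], RGB->BGR
    swapped.append((out.transpose(1, 2, 0) * 255.0).round()[:, :, ::-1])

# explode_pixel_boost: the exact inverse interleave, back to full resolution
final = np.stack(swapped, axis=0).reshape(total, total, model_size, model_size, 3)
final = final.transpose(2, 0, 3, 1, 4).reshape(size, size, 3)
assert np.array_equal(final, aligned)           # lossless round trip with a no-op swap
```

With a real swapper in place of the identity step, each 128px slice is swapped at full model quality and the reassembled face keeps the subsample resolution, which is what lets the swap exceed 128px without an enhancer pass.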
5 changes: 3 additions & 2 deletions roop/ProcessOptions.py
@@ -1,6 +1,6 @@
class ProcessOptions:

-def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode, selected_index, masking_text, imagemask, num_steps, show_face_area, show_mask=False):
+def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode, selected_index, masking_text, imagemask, num_steps, subsample_size, show_face_area, show_mask=False):
self.processors = processordefines
self.face_distance_threshold = face_distance
self.blend_ratio = blend_ratio
@@ -10,4 +10,5 @@ def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode
self.imagemask = imagemask
self.num_swap_steps = num_steps
self.show_face_area_overlay = show_face_area
-self.show_face_masking = show_mask
+self.show_face_masking = show_mask
+self.subsample_size = subsample_size
4 changes: 3 additions & 1 deletion roop/core.py
@@ -214,7 +214,9 @@ def batch_process_regular(files:list[ProcessEntry], masking_engine:str, new_clip
mask = imagemask["layers"][0] if imagemask is not None else None
if len(roop.globals.INPUT_FACESETS) <= selected_index:
selected_index = 0
-options = ProcessOptions(get_processing_plugins(masking_engine), roop.globals.distance_threshold, roop.globals.blend_ratio, roop.globals.face_swap_mode, selected_index, new_clip_text, mask, num_swap_steps, False)
+options = ProcessOptions(get_processing_plugins(masking_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
+roop.globals.face_swap_mode, selected_index, new_clip_text, mask, num_swap_steps,
+roop.globals.subsample_size, False)
process_mgr.initialize(roop.globals.INPUT_FACESETS, roop.globals.TARGET_FACES, options)
batch_process(files, use_new_method)
return
11 changes: 7 additions & 4 deletions roop/face_util.py
@@ -210,15 +210,18 @@ def rotate_image_180(image):
)


-def estimate_norm(lmk, image_size=112, mode="arcface"):
+def estimate_norm(lmk, image_size=112):
assert lmk.shape == (5, 2)
assert image_size % 112 == 0 or image_size % 128 == 0
if image_size % 112 == 0:
ratio = float(image_size) / 112.0
diff_x = 0
-else:
+elif image_size % 128 == 0:
ratio = float(image_size) / 128.0
diff_x = 8.0 * ratio
+elif image_size % 512 == 0:
+ratio = float(image_size) / 512.0
+diff_x = 32.0 * ratio

dst = arcface_dst * ratio
dst[:, 0] += diff_x
tform = trans.SimilarityTransform()
@@ -230,7 +233,7 @@ def estimate_norm(lmk, image_size=112, mode="arcface"):

# aligned, M = norm_crop2(f[1], face.kps, 512)
def align_crop(img, landmark, image_size=112, mode="arcface"):
-M = estimate_norm(landmark, image_size, mode)
+M = estimate_norm(landmark, image_size)
warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
return warped, M

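Editor's note: a self-contained sketch of the alignment path above — estimate_norm fits a least-squares similarity transform from the detected 5-point landmarks onto a scaled ArcFace template, and align_crop warps the frame with it. The landmark coordinates and the blank stand-in frame are made up for illustration; arcface_dst holds insightface's canonical 112px template points.

```python
import cv2
import numpy as np
from skimage import transform as trans

# insightface's canonical 5-point ArcFace template for a 112x112 crop
arcface_dst = np.array([[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
                        [41.5493, 92.3655], [70.7299, 92.2041]], dtype=np.float32)

# hypothetical detected landmarks (eyes, nose tip, mouth corners) in frame pixels
lmk = np.array([[460, 300], [540, 295], [500, 350], [470, 400], [535, 398]], dtype=np.float32)

image_size = 512            # a 512px subsample takes the %128 branch (512 % 128 == 0),
ratio = image_size / 128.0  # which also leaves the later %512 branch unreachable as ordered
dst = arcface_dst * ratio
dst[:, 0] += 8.0 * ratio    # diff_x: horizontal shift of the 128-based template

tform = trans.SimilarityTransform()
tform.estimate(lmk, dst)                 # least-squares fit of rotation/scale/translation
M = tform.params[0:2, :]                 # 2x3 matrix, as estimate_norm returns it

frame = np.zeros((720, 1280, 3), dtype=np.uint8)   # stand-in video frame
aligned = cv2.warpAffine(frame, M, (image_size, image_size), borderValue=0.0)
M_inv = cv2.invertAffineTransform(M)     # later used to paste the swapped face back
```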
1 change: 1 addition & 0 deletions roop/globals.py
@@ -26,6 +26,7 @@
headless = None
log_level = 'error'
selected_enhancer = None
subsample_size = 128
face_swap_mode = None
blend_ratio = 0.5
distance_threshold = 0.65
2 changes: 1 addition & 1 deletion roop/metadata.py
@@ -1,2 +1,2 @@
name = 'roop unleashed'
-version = '4.0.0'
+version = '4.1.0'
14 changes: 3 additions & 11 deletions roop/processors/FaceSwapInsightFace.py
@@ -36,26 +36,18 @@ def Initialize(self, plugin_options:dict):
self.model_swap_insightface = onnxruntime.InferenceSession(model_path, sess_options, providers=roop.globals.execution_providers)



def Run(self, source_face: Face, target_face: Face, temp_frame: Frame) -> Frame:
-blob = cv2.dnn.blobFromImage(temp_frame, 1.0 / self.input_std, (128, 128),
-(self.input_mean, self.input_mean, self.input_mean), swapRB=True)
latent = source_face.normed_embedding.reshape((1,-1))
latent = np.dot(latent, self.emap)
latent /= np.linalg.norm(latent)
io_binding = self.model_swap_insightface.io_binding()
-io_binding.bind_cpu_input("target", blob)
+io_binding.bind_cpu_input("target", temp_frame)
io_binding.bind_cpu_input("source", latent)
io_binding.bind_output("output", self.devicename)
self.model_swap_insightface.run_with_iobinding(io_binding)
ort_outs = io_binding.copy_outputs_to_cpu()[0]
-img_fake = ort_outs.transpose((0,2,3,1))[0]
-return np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1]
-
-img_fake, M = self.model_swap_insightface.get(temp_frame, target_face, source_face, paste_back=False)
-# target_face.matrix = M
-# return img_fake
+return ort_outs[0]


def Release(self):
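Editor's note: after this change Run() no longer builds a blob itself — it binds the tensor produced by ProcessMgr.prepare_crop_frame directly and hands the raw NCHW output back for normalize_swap_frame to decode. A hedged sketch of that io_binding flow; the model path and provider here are illustrative assumptions:

```python
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("models/inswapper_128.onnx",
                                       providers=["CPUExecutionProvider"])

blob = np.zeros((1, 3, 128, 128), dtype=np.float32)   # output of prepare_crop_frame
latent = np.zeros((1, 512), dtype=np.float32)         # emap-projected, L2-normalized embedding

io_binding = session.io_binding()
io_binding.bind_cpu_input("target", blob)             # pre-normalized slice, no blobFromImage
io_binding.bind_cpu_input("source", latent)
io_binding.bind_output("output", "cpu")
session.run_with_iobinding(io_binding)
swapped = io_binding.copy_outputs_to_cpu()[0][0]      # CHW float32, denormalized by the caller
```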
3 changes: 2 additions & 1 deletion roop/virtualcam.py
@@ -44,10 +44,11 @@ def virtualcamera(streamobs, cam_num,width,height):
print(f'Using {cam.native_fmt}')
else:
print(f'Not streaming to virtual camera!')
subsample_size = roop.globals.subsample_size

# always use xseg masking
options = ProcessOptions(get_processing_plugins("mask_xseg"), roop.globals.distance_threshold, roop.globals.blend_ratio,
"all", 0, None, None, 1, False)
"all", 0, None, None, 1, subsample_size, False)
while cam_active:
ret, frame = cap.read()
if not ret:
1 change: 1 addition & 0 deletions ui/globals.py
@@ -4,6 +4,7 @@
ui_SELECTED_INPUT_FACE_INDEX = 0

ui_selected_enhancer = None
ui_upscale = None
ui_blend_ratio = None
ui_input_thumbs = []
ui_target_thumbs = []
2 changes: 1 addition & 1 deletion ui/main.py
@@ -57,7 +57,7 @@ def run():
if server_port <= 0:
server_port = None
ssl_verify = False if server_name == '0.0.0.0' else True
-with gr.Blocks(title=f'{roop.metadata.name} {roop.metadata.version}', theme=roop.globals.CFG.selected_theme, css=mycss) as ui:
+with gr.Blocks(title=f'{roop.metadata.name} {roop.metadata.version}', theme=roop.globals.CFG.selected_theme, css=mycss, delete_cache=(60, 86400)) as ui:
with gr.Row(variant='compact'):
gr.Markdown(f"### [{roop.metadata.name} {roop.metadata.version}](https://github.com/C0untFloyd/roop-unleashed)")
gr.HTML(util.create_version_html(), elem_id="versions")
2 changes: 1 addition & 1 deletion ui/tabs/extras_tab.py
@@ -175,7 +175,7 @@ def on_frame_process(files, filterselection, upscaleselection):
filter = next((x for x in frame_upscalers_map.keys() if x == upscaleselection), None)
if filter is not None:
processoroptions.update(frame_upscalers_map[filter])
-options = ProcessOptions(processoroptions, 0, 0, "all", 0, None, None, None, False)
+options = ProcessOptions(processoroptions, 0, 0, "all", 0, None, None, 0, 128, False)
batch_process_with_options(list_files_process, options, None)
outdir = pathlib.Path(roop.globals.output_path)
outfiles = [str(item) for item in outdir.rglob("*") if item.is_file()]
19 changes: 12 additions & 7 deletions ui/tabs/faceswap_tab.py
@@ -99,13 +99,15 @@ def faceswap_tab():
with gr.Column(scale=1):
selected_face_detection = gr.Dropdown(["First found", "All female", "All male", "All faces", "Selected face"], value="First found", label="Specify face selection for swapping")
with gr.Column(scale=1):
-num_swap_steps = gr.Slider(1, 5, value=1, step=1.0, label="Number of swapping steps", info="More steps may increase likeness")
with gr.Column(scale=2):
ui.globals.ui_selected_enhancer = gr.Dropdown(["None", "Codeformer", "DMDNet", "GFPGAN", "GPEN", "Restoreformer++"], value="None", label="Select post-processing")

with gr.Row(variant='panel'):
with gr.Column(scale=1):
max_face_distance = gr.Slider(0.01, 1.0, value=0.65, label="Max Face Similarity Threshold", info="0.0 = identical 1.0 = no similarity")
with gr.Column(scale=1):
+num_swap_steps = gr.Slider(1, 5, value=1, step=1.0, label="Number of swapping steps", info="More steps can increase likeness")
+ui.globals.ui_upscale = gr.Dropdown(["128px", "256px", "512px"], value="128px", label="Subsample upscale to", interactive=True)
with gr.Column(scale=2):
ui.globals.ui_blend_ratio = gr.Slider(0.0, 1.0, value=0.65, label="Original/Enhanced image blend ratio", info="Only used with active post-processing")

@@ -140,7 +142,7 @@ def faceswap_tab():
resultvideo = gr.Video(label='Final Video', interactive=False, visible=False)

previewinputs = [preview_frame_num, bt_destfiles, fake_preview, ui.globals.ui_selected_enhancer, selected_face_detection,
max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text, no_face_action, vr_mode, autorotate, maskimage, chk_showmaskoffsets, num_swap_steps]
max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text, no_face_action, vr_mode, autorotate, maskimage, chk_showmaskoffsets, num_swap_steps, ui.globals.ui_upscale]
previewoutputs = [previewimage, maskimage, preview_frame_num]
input_faces.select(on_select_input_face, None, None).then(fn=on_preview_frame_changed, inputs=previewinputs, outputs=previewoutputs)
bt_remove_selected_input_face.click(fn=remove_selected_input_face, outputs=[input_faces])
@@ -176,7 +178,7 @@

start_event = bt_start.click(fn=start_swap,
inputs=[ui.globals.ui_selected_enhancer, selected_face_detection, roop.globals.keep_frames, roop.globals.wait_after_extraction,
-roop.globals.skip_audio, max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text,video_swapping_method, no_face_action, vr_mode, autorotate, num_swap_steps, maskimage],
+roop.globals.skip_audio, max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text,video_swapping_method, no_face_action, vr_mode, autorotate, num_swap_steps, ui.globals.ui_upscale, maskimage],
outputs=[bt_start, bt_stop, resultfiles], show_progress='full')
after_swap_event = start_event.then(fn=on_resultfiles_finished, inputs=[resultfiles], outputs=[resultimage, resultvideo])

@@ -407,7 +409,7 @@ def on_end_face_selection():


def on_preview_frame_changed(frame_num, files, fake_preview, enhancer, detection, face_distance, blend_ratio,
-selected_mask_engine, clip_text, no_face_action, vr_mode, auto_rotate, maskimage, show_face_area, num_steps):
+selected_mask_engine, clip_text, no_face_action, vr_mode, auto_rotate, maskimage, show_face_area, num_steps, upsample):
global SELECTED_INPUT_FACE_INDEX, manual_masking, current_video_fps

from roop.core import live_swap, get_processing_plugins
@@ -454,6 +456,8 @@ def on_preview_frame_changed(frame_num, files, fake_preview, enhancer, detection
roop.globals.no_face_action = index_of_no_face_action(no_face_action)
roop.globals.vr_mode = vr_mode
roop.globals.autorotate_faces = auto_rotate
roop.globals.subsample_size = int(upsample[:3])


mask_engine = map_mask_engine(selected_mask_engine, clip_text)

@@ -464,7 +468,7 @@
face_index = 0

options = ProcessOptions(get_processing_plugins(mask_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
-roop.globals.face_swap_mode, face_index, clip_text, maskimage, num_steps, show_face_area)
+roop.globals.face_swap_mode, face_index, clip_text, maskimage, num_steps, roop.globals.subsample_size, show_face_area)

current_frame = live_swap(current_frame, options)
if current_frame is None:
@@ -538,7 +542,7 @@ def on_preview_mask(frame_num, files, clip_text, mask_engine):
elif mask_engine == "DFL XSeg":
mask_engine = "mask_xseg"
options = ProcessOptions(get_processing_plugins(mask_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
"all", 0, clip_text, None, 0, False, True)
"all", 0, clip_text, None, 0, 128, False, True)

current_frame = live_swap(current_frame, options)
return util.convert_to_gradio(current_frame)
@@ -576,7 +580,7 @@ def translate_swap_mode(dropdown_text):


def start_swap( enhancer, detection, keep_frames, wait_after_extraction, skip_audio, face_distance, blend_ratio,
-selected_mask_engine, clip_text, processing_method, no_face_action, vr_mode, autorotate, num_swap_steps, imagemask, progress=gr.Progress()):
+selected_mask_engine, clip_text, processing_method, no_face_action, vr_mode, autorotate, num_swap_steps, upsample, imagemask, progress=gr.Progress()):
from ui.main import prepare_environment
from roop.core import batch_process_regular
global is_processing, list_files_process
@@ -604,6 +608,7 @@ def start_swap( enhancer, detection, keep_frames, wait_after_extraction, skip_au
roop.globals.no_face_action = index_of_no_face_action(no_face_action)
roop.globals.vr_mode = vr_mode
roop.globals.autorotate_faces = autorotate
roop.globals.subsample_size = int(upsample[:3])
mask_engine = map_mask_engine(selected_mask_engine, clip_text)

if roop.globals.face_swap_mode == 'selected':