diff --git a/README.md b/README.md
index d49580a79..f7ab9d4c4 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,12 @@ Additional commandline arguments are currently unsupported and settings should b
 
 ### Changelog
 
+**14.07.2024** v4.1.0
+
+- Added subsample upscaling to increase swap resolution
+- Upgraded gradio
+
+
 **22.04.2024** v3.9.0
 
 - Bugfix: Face detection bounding box corrupt values at weird angles
diff --git a/requirements.txt b/requirements.txt
index a54390985..adec4abff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 
 numpy==1.26.4
-gradio==4.32.1
+gradio==4.38.1
 opencv-python==4.9.0.80
 onnx==1.16.0
 insightface==0.7.3
diff --git a/roop/ProcessMgr.py b/roop/ProcessMgr.py
index 373d42997..7aae03f06 100644
--- a/roop/ProcessMgr.py
+++ b/roop/ProcessMgr.py
@@ -6,7 +6,7 @@
 from enum import Enum
 
 from roop.ProcessOptions import ProcessOptions
 
-from roop.face_util import get_first_face, get_all_faces, rotate_image_180, rotate_anticlockwise, rotate_clockwise, clamp_cut_values
+from roop.face_util import get_first_face, get_all_faces, rotate_anticlockwise, rotate_clockwise, clamp_cut_values
 from roop.utilities import compute_cosine_distance, get_device, str_to_class
 import roop.vr_util as vr
@@ -20,6 +20,7 @@
 
 import roop.globals
 
+
 # Poor man's enum to be able to compare to int
 class eNoFaceAction():
     USE_ORIGINAL_FRAME = 0
@@ -44,6 +45,7 @@ def pick_queue(queue: Queue[str], queue_per_future: int) -> List[str]:
         return queues
 
 
+
 class ProcessMgr():
     input_face_datas = []
     target_face_datas = []
@@ -317,11 +319,6 @@ def update_progress(self, progress: Any = None) -> None:
             self.progress_gradio((progress.n, self.total_frames), desc='Processing', total=self.total_frames, unit='frames')
 
 
-# https://github.com/deepinsight/insightface#third-party-re-implementation-of-arcface
-# https://github.com/deepinsight/insightface/blob/master/alignment/coordinate_reg/image_infer.py
-# https://github.com/deepinsight/insightface/issues/1350
-# https://github.com/linghu8812/tensorrt_inference
-
     def process_frame(self, frame:Frame):
         if len(self.input_face_datas) < 1 and not self.options.show_face_masking:
@@ -541,17 +538,30 @@ def process_face(self,face_index, target_face:Face, frame:Frame):
             # img = vr.GetPerspective(frame, 90, theta, phi, 1280, 1280)         # Generate perspective image
 
-        fake_frame = None
-        aligned_img, M = align_crop(frame, target_face.kps, 128)
+
+        """ Code ported/adapted from FaceFusion, which borrowed the idea from Rope:
+            the cropped and aligned face is subsampled into model-sized slices, each slice
+            is swapped on its own, and the slices are recombined at the desired output
+            resolution. This works around the swapper's resolution limit without using enhancers.
+        """
+        model_output_size = 128
+        subsample_size = self.options.subsample_size
+        subsample_total = subsample_size // model_output_size
+        aligned_img, M = align_crop(frame, target_face.kps, subsample_size)
+        fake_frame = aligned_img
-        swap_frame = aligned_img
         target_face.matrix = M
+
        for p in self.processors:
            if p.type == 'swap':
-                if inputface is not None:
+                swap_result_frames = []
+                subsample_frames = self.implode_pixel_boost(aligned_img, model_output_size, subsample_total)
+                for sliced_frame in subsample_frames:
                    for _ in range(0,self.options.num_swap_steps):
-                        swap_frame = p.Run(inputface, target_face, swap_frame)
-                    fake_frame = swap_frame
+                        sliced_frame = self.prepare_crop_frame(sliced_frame)
+                        sliced_frame = p.Run(inputface, target_face, sliced_frame)
+                        sliced_frame = self.normalize_swap_frame(sliced_frame)
+                    swap_result_frames.append(sliced_frame)
+                fake_frame = self.explode_pixel_boost(swap_result_frames, model_output_size, subsample_total, subsample_size)
                scale_factor = 0.0
            elif p.type == 'mask':
                fake_frame = self.process_mask(p, aligned_img, fake_frame)
@@ -560,8 +570,8 @@ def process_face(self,face_index, target_face:Face, frame:Frame):
 
         upscale = 512
         orig_width = fake_frame.shape[1]
-
-        fake_frame = cv2.resize(fake_frame, (upscale, upscale), cv2.INTER_CUBIC)
+        if orig_width != upscale:
+            fake_frame = cv2.resize(fake_frame, (upscale, upscale), interpolation=cv2.INTER_CUBIC)
 
         mask_offsets = (0,0,0,0,1,20) if inputface is None else inputface.mask_offsets
@@ -673,6 +683,43 @@ def blur_area(self, img_matte, num_erosion_iterations, blur_amount):
         return cv2.GaussianBlur(img_matte, blur_size, 0)
 
 
+    def prepare_crop_frame(self, swap_frame):
+        # BGR uint8 crop -> normalized RGB NCHW float32 blob for the swap model
+        model_type = 'inswapper'
+        model_mean = [0.0, 0.0, 0.0]
+        model_standard_deviation = [1.0, 1.0, 1.0]
+
+        if model_type == 'ghost':
+            swap_frame = swap_frame[:, :, ::-1] / 127.5 - 1
+        else:
+            swap_frame = swap_frame[:, :, ::-1] / 255.0
+        swap_frame = (swap_frame - model_mean) / model_standard_deviation
+        swap_frame = swap_frame.transpose(2, 0, 1)
+        swap_frame = np.expand_dims(swap_frame, axis = 0).astype(np.float32)
+        return swap_frame
+
+
+    def normalize_swap_frame(self, swap_frame):
+        # model output (3, H, W) -> BGR image in [0, 255]
+        model_type = 'inswapper'
+        swap_frame = swap_frame.transpose(1, 2, 0)
+
+        if model_type == 'ghost':
+            swap_frame = (swap_frame * 127.5 + 127.5).round()
+        else:
+            swap_frame = (swap_frame * 255.0).round()
+        swap_frame = swap_frame[:, :, ::-1]
+        return swap_frame
+
+
+    def implode_pixel_boost(self, aligned_face_frame, model_size, pixel_boost_total : int):
+        # rearrange the aligned crop into pixel_boost_total^2 strided subsamples of model_size x model_size
+        subsample_frame = aligned_face_frame.reshape(model_size, pixel_boost_total, model_size, pixel_boost_total, 3)
+        subsample_frame = subsample_frame.transpose(1, 3, 0, 2, 4).reshape(pixel_boost_total ** 2, model_size, model_size, 3)
+        return subsample_frame
+
+
+    def explode_pixel_boost(self, subsample_frame, model_size, pixel_boost_total, pixel_boost_size):
+        # interleave the swapped subsamples back into one pixel_boost_size x pixel_boost_size face
+        final_frame = np.stack(subsample_frame, axis = 0).reshape(pixel_boost_total, pixel_boost_total, model_size, model_size, 3)
+        final_frame = final_frame.transpose(2, 0, 3, 1, 4).reshape(pixel_boost_size, pixel_boost_size, 3)
+        return final_frame
+
+
     def process_mask(self, processor, frame:Frame, target:Frame):
         img_mask = processor.Run(frame, self.options.masking_text)
         img_mask = cv2.resize(img_mask, (target.shape[1], target.shape[0]))
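The implode/explode pair above is not spatial tiling: the reshape/transpose splits the aligned crop into subsample_total squared interleaved subsamples, each of which is the whole face decimated by that stride, so every slice is still a complete 128px face the swapper can process, and reassembly interleaves the swapped pixels back without seams. A standalone numpy sketch of that round-trip (the two function bodies mirror the patch; the 512px value is just an example):

import numpy as np

MODEL_SIZE = 128                        # inswapper output resolution
SUBSAMPLE_SIZE = 512                    # selected "Subsample upscale to" value
TOTAL = SUBSAMPLE_SIZE // MODEL_SIZE    # stride 4 -> 16 slices

def implode_pixel_boost(aligned, model_size, total):
    # split rows/cols into (position, phase) pairs, then group by phase:
    # slice (i, j) holds every total-th pixel starting at offset (i, j)
    frame = aligned.reshape(model_size, total, model_size, total, 3)
    return frame.transpose(1, 3, 0, 2, 4).reshape(total ** 2, model_size, model_size, 3)

def explode_pixel_boost(slices, model_size, total, size):
    # inverse: interleave the slices back into one size x size image
    frame = np.stack(slices, axis=0).reshape(total, total, model_size, model_size, 3)
    return frame.transpose(2, 0, 3, 1, 4).reshape(size, size, 3)

crop = np.random.randint(0, 256, (SUBSAMPLE_SIZE, SUBSAMPLE_SIZE, 3), dtype=np.uint8)
slices = implode_pixel_boost(crop, MODEL_SIZE, TOTAL)
restored = explode_pixel_boost(list(slices), MODEL_SIZE, TOTAL, SUBSAMPLE_SIZE)
assert np.array_equal(crop, restored)   # lossless when no swap is applied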
diff --git a/roop/ProcessOptions.py b/roop/ProcessOptions.py
index 296e8b243..4f5e2b99c 100644
--- a/roop/ProcessOptions.py
+++ b/roop/ProcessOptions.py
@@ -1,6 +1,6 @@
 class ProcessOptions:
 
-    def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode, selected_index, masking_text, imagemask, num_steps, show_face_area, show_mask=False):
+    def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode, selected_index, masking_text, imagemask, num_steps, subsample_size, show_face_area, show_mask=False):
         self.processors = processordefines
         self.face_distance_threshold = face_distance
         self.blend_ratio = blend_ratio
@@ -10,4 +10,5 @@ def __init__(self, processordefines:dict, face_distance, blend_ratio, swap_mode
         self.imagemask = imagemask
         self.num_swap_steps = num_steps
         self.show_face_area_overlay = show_face_area
-        self.show_face_masking = show_mask
\ No newline at end of file
+        self.show_face_masking = show_mask
+        self.subsample_size = subsample_size
\ No newline at end of file
diff --git a/roop/core.py b/roop/core.py
index 7cc86d7bd..e5945f842 100755
--- a/roop/core.py
+++ b/roop/core.py
@@ -214,7 +214,9 @@ def batch_process_regular(files:list[ProcessEntry], masking_engine:str, new_clip
     mask = imagemask["layers"][0] if imagemask is not None else None
     if len(roop.globals.INPUT_FACESETS) <= selected_index:
         selected_index = 0
-    options = ProcessOptions(get_processing_plugins(masking_engine), roop.globals.distance_threshold, roop.globals.blend_ratio, roop.globals.face_swap_mode, selected_index, new_clip_text, mask, num_swap_steps, False)
+    options = ProcessOptions(get_processing_plugins(masking_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
+                              roop.globals.face_swap_mode, selected_index, new_clip_text, mask, num_swap_steps,
+                              roop.globals.subsample_size, False)
     process_mgr.initialize(roop.globals.INPUT_FACESETS, roop.globals.TARGET_FACES, options)
     batch_process(files, use_new_method)
     return
diff --git a/roop/face_util.py b/roop/face_util.py
index 6abd712d5..91c5a3e83 100644
--- a/roop/face_util.py
+++ b/roop/face_util.py
@@ -210,15 +210,18 @@ def rotate_image_180(image):
     )
 
 
-def estimate_norm(lmk, image_size=112, mode="arcface"):
+def estimate_norm(lmk, image_size=112):
     assert lmk.shape == (5, 2)
-    assert image_size % 112 == 0 or image_size % 128 == 0
     if image_size % 112 == 0:
         ratio = float(image_size) / 112.0
         diff_x = 0
-    else:
+    elif image_size % 128 == 0:
         ratio = float(image_size) / 128.0
         diff_x = 8.0 * ratio
+    elif image_size % 512 == 0:
+        ratio = float(image_size) / 512.0
+        diff_x = 32.0 * ratio
+
     dst = arcface_dst * ratio
     dst[:, 0] += diff_x
     tform = trans.SimilarityTransform()
@@ -230,7 +233,7 @@ def estimate_norm(lmk, image_size=112):
 
 # aligned, M = norm_crop2(f[1], face.kps, 512)
 def align_crop(img, landmark, image_size=112, mode="arcface"):
-    M = estimate_norm(landmark, image_size, mode)
+    M = estimate_norm(landmark, image_size)
     warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0)
     return warped, M
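One note on the new estimate_norm branch: every multiple of 512 is also a multiple of 128, so a 512px crop already resolves through the 128 branch (ratio 4.0, diff_x 32.0) and the % 512 test is never actually reached. A standalone sketch of the effective computation, assuming the standard five-point ArcFace template from insightface and a crop size that is a multiple of 112 or 128, which the UI guarantees (128/256/512):

import numpy as np
from skimage import transform as trans

# five-point ArcFace landmark template for a 112x112 crop (insightface)
arcface_dst = np.array([
    [38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
    [41.5493, 92.3655], [70.7299, 92.2041]], dtype=np.float32)

def estimate_norm(lmk, image_size=112):
    if image_size % 112 == 0:
        ratio, diff_x = image_size / 112.0, 0.0
    else:                                # covers 128, 256 and 512
        ratio = image_size / 128.0
        diff_x = 8.0 * ratio             # 128px models shift the template right
    dst = arcface_dst * ratio
    dst[:, 0] += diff_x
    tform = trans.SimilarityTransform()
    tform.estimate(lmk, dst)             # least-squares similarity: lmk -> dst
    return tform.params[0:2, :]          # 2x3 matrix for cv2.warpAffine

# landmarks that already sit on the template map to scale 4, x-shift 32
M = estimate_norm(arcface_dst.copy(), image_size=512)
print(np.round(M, 2))    # [[ 4.  0. 32.] [ 0.  4.  0.]]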
diff --git a/roop/globals.py b/roop/globals.py
index b1228e3d0..d259f611a 100644
--- a/roop/globals.py
+++ b/roop/globals.py
@@ -26,6 +26,7 @@
 headless = None
 log_level = 'error'
 selected_enhancer = None
+subsample_size = 128
 face_swap_mode = None
 blend_ratio = 0.5
 distance_threshold = 0.65
diff --git a/roop/metadata.py b/roop/metadata.py
index 469e3990c..890c155c9 100644
--- a/roop/metadata.py
+++ b/roop/metadata.py
@@ -1,2 +1,2 @@
 name = 'roop unleashed'
-version = '4.0.0'
+version = '4.1.0'
diff --git a/roop/processors/FaceSwapInsightFace.py b/roop/processors/FaceSwapInsightFace.py
index 34290899f..807db22cf 100644
--- a/roop/processors/FaceSwapInsightFace.py
+++ b/roop/processors/FaceSwapInsightFace.py
@@ -36,26 +36,18 @@ def Initialize(self, plugin_options:dict):
             self.model_swap_insightface = onnxruntime.InferenceSession(model_path, sess_options, providers=roop.globals.execution_providers)
 
-        
+
     def Run(self, source_face: Face, target_face: Face, temp_frame: Frame) -> Frame:
-        blob = cv2.dnn.blobFromImage(temp_frame, 1.0 / self.input_std, (128, 128),
-                                     (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
         latent = source_face.normed_embedding.reshape((1,-1))
         latent = np.dot(latent, self.emap)
         latent /= np.linalg.norm(latent)
         io_binding = self.model_swap_insightface.io_binding()
-        io_binding.bind_cpu_input("target", blob)
+        io_binding.bind_cpu_input("target", temp_frame)
         io_binding.bind_cpu_input("source", latent)
         io_binding.bind_output("output", self.devicename)
         self.model_swap_insightface.run_with_iobinding(io_binding)
         ort_outs = io_binding.copy_outputs_to_cpu()[0]
-        img_fake = ort_outs.transpose((0,2,3,1))[0]
-        return np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1]
-
-
-        img_fake, M = self.model_swap_insightface.get(temp_frame, target_face, source_face, paste_back=False)
-        # target_face.matrix = M
-        # return img_fake
+        return ort_outs[0]
 
     def Release(self):
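Run now binds the already normalized NCHW blob produced by prepare_crop_frame instead of rebuilding one with cv2.dnn.blobFromImage, and it returns the raw (3, 128, 128) output so normalize_swap_frame can decode it. A minimal sketch of that ONNX Runtime IO-binding pattern, assuming a local inswapper_128.onnx and skipping the emap projection of the source embedding:

import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("inswapper_128.onnx",
                                       providers=["CPUExecutionProvider"])

target = np.zeros((1, 3, 128, 128), dtype=np.float32)  # prepare_crop_frame output: RGB, [0, 1]
source = np.zeros((1, 512), dtype=np.float32)          # normed identity embedding (after emap)

binding = session.io_binding()
binding.bind_cpu_input("target", target)   # input names match the inswapper graph
binding.bind_cpu_input("source", source)
binding.bind_output("output", "cpu")       # or "cuda" to keep the result on the GPU
session.run_with_iobinding(binding)
swapped = binding.copy_outputs_to_cpu()[0][0]   # (3, 128, 128) float, fed to normalize_swap_frame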
diff --git a/roop/virtualcam.py b/roop/virtualcam.py
index d429851bb..490dacba8 100644
--- a/roop/virtualcam.py
+++ b/roop/virtualcam.py
@@ -44,10 +44,11 @@ def virtualcamera(streamobs, cam_num,width,height):
             print(f'Using {cam.native_fmt}')
         else:
             print(f'Not streaming to virtual camera!')
+        subsample_size = roop.globals.subsample_size
 
         # always use xseg masking
         options = ProcessOptions(get_processing_plugins("mask_xseg"), roop.globals.distance_threshold, roop.globals.blend_ratio,
-                                  "all", 0, None, None, 1, False)
+                                  "all", 0, None, None, 1, subsample_size, False)
         while cam_active:
             ret, frame = cap.read()
             if not ret:
diff --git a/ui/globals.py b/ui/globals.py
index 5514a63d6..3c6d2d1c9 100644
--- a/ui/globals.py
+++ b/ui/globals.py
@@ -4,6 +4,7 @@
 ui_SELECTED_INPUT_FACE_INDEX = 0
 
 ui_selected_enhancer = None
+ui_upscale = None
 ui_blend_ratio = None
 ui_input_thumbs = []
 ui_target_thumbs = []
diff --git a/ui/main.py b/ui/main.py
index 847fbe83e..cb13cc0dd 100644
--- a/ui/main.py
+++ b/ui/main.py
@@ -57,7 +57,7 @@ def run():
     if server_port <= 0:
         server_port = None
     ssl_verify = False if server_name == '0.0.0.0' else True
-    with gr.Blocks(title=f'{roop.metadata.name} {roop.metadata.version}', theme=roop.globals.CFG.selected_theme, css=mycss) as ui:
+    with gr.Blocks(title=f'{roop.metadata.name} {roop.metadata.version}', theme=roop.globals.CFG.selected_theme, css=mycss, delete_cache=(60, 86400)) as ui:
         with gr.Row(variant='compact'):
             gr.Markdown(f"### [{roop.metadata.name} {roop.metadata.version}](https://github.com/C0untFloyd/roop-unleashed)")
             gr.HTML(util.create_version_html(), elem_id="versions")
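The new delete_cache=(60, 86400) argument is what the gradio bump to 4.38.1 enables: as documented for Gradio 4.x, the Blocks app then checks every 60 seconds and removes temporary files it created more than 86400 seconds (one day) ago. A minimal, hypothetical app with the same setting:

import gradio as gr

# sweep the app's temp-file cache every minute, dropping files older than a day
with gr.Blocks(title="demo", delete_cache=(60, 86400)) as ui:
    gr.Markdown("cache-managed app")

ui.launch()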
diff --git a/ui/tabs/extras_tab.py b/ui/tabs/extras_tab.py
index 768654264..c519ae0cf 100644
--- a/ui/tabs/extras_tab.py
+++ b/ui/tabs/extras_tab.py
@@ -175,7 +175,7 @@ def on_frame_process(files, filterselection, upscaleselection):
         filter = next((x for x in frame_upscalers_map.keys() if x == upscaleselection), None)
         if filter is not None:
             processoroptions.update(frame_upscalers_map[filter])
-    options = ProcessOptions(processoroptions, 0, 0, "all", 0, None, None, None, False)
+    options = ProcessOptions(processoroptions, 0, 0, "all", 0, None, None, 0, 128, False)
     batch_process_with_options(list_files_process, options, None)
     outdir = pathlib.Path(roop.globals.output_path)
     outfiles = [str(item) for item in outdir.rglob("*") if item.is_file()]
diff --git a/ui/tabs/faceswap_tab.py b/ui/tabs/faceswap_tab.py
index 97ec1fab4..3d08dd448 100644
--- a/ui/tabs/faceswap_tab.py
+++ b/ui/tabs/faceswap_tab.py
@@ -99,13 +99,15 @@ def faceswap_tab():
                 with gr.Column(scale=1):
                     selected_face_detection = gr.Dropdown(["First found", "All female", "All male", "All faces", "Selected face"], value="First found", label="Specify face selection for swapping")
                 with gr.Column(scale=1):
+                    num_swap_steps = gr.Slider(1, 5, value=1, step=1.0, label="Number of swapping steps", info="More steps may increase likeness")
+                with gr.Column(scale=2):
                     ui.globals.ui_selected_enhancer = gr.Dropdown(["None", "Codeformer", "DMDNet", "GFPGAN", "GPEN", "Restoreformer++"], value="None", label="Select post-processing")
 
             with gr.Row(variant='panel'):
                 with gr.Column(scale=1):
                     max_face_distance = gr.Slider(0.01, 1.0, value=0.65, label="Max Face Similarity Threshold", info="0.0 = identical 1.0 = no similarity")
                 with gr.Column(scale=1):
-                    num_swap_steps = gr.Slider(1, 5, value=1, step=1.0, label="Number of swapping steps", info="More steps can increase likeness")
+                    ui.globals.ui_upscale = gr.Dropdown(["128px", "256px", "512px"], value="128px", label="Subsample upscale to", interactive=True)
                 with gr.Column(scale=2):
                     ui.globals.ui_blend_ratio = gr.Slider(0.0, 1.0, value=0.65, label="Original/Enhanced image blend ratio", info="Only used with active post-processing")
@@ -140,7 +142,7 @@ def faceswap_tab():
                     resultvideo = gr.Video(label='Final Video', interactive=False, visible=False)
 
             previewinputs = [preview_frame_num, bt_destfiles, fake_preview, ui.globals.ui_selected_enhancer, selected_face_detection,
-                             max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text, no_face_action, vr_mode, autorotate, maskimage, chk_showmaskoffsets, num_swap_steps]
+                             max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text, no_face_action, vr_mode, autorotate, maskimage, chk_showmaskoffsets, num_swap_steps, ui.globals.ui_upscale]
             previewoutputs = [previewimage, maskimage, preview_frame_num]
             input_faces.select(on_select_input_face, None, None).then(fn=on_preview_frame_changed, inputs=previewinputs, outputs=previewoutputs)
             bt_remove_selected_input_face.click(fn=remove_selected_input_face, outputs=[input_faces])
@@ -176,7 +178,7 @@ def faceswap_tab():
 
             start_event = bt_start.click(fn=start_swap,
                 inputs=[ui.globals.ui_selected_enhancer, selected_face_detection, roop.globals.keep_frames, roop.globals.wait_after_extraction,
-                        roop.globals.skip_audio, max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text, video_swapping_method, no_face_action, vr_mode, autorotate, num_swap_steps, maskimage],
+                        roop.globals.skip_audio, max_face_distance, ui.globals.ui_blend_ratio, selected_mask_engine, clip_text, video_swapping_method, no_face_action, vr_mode, autorotate, num_swap_steps, ui.globals.ui_upscale, maskimage],
                 outputs=[bt_start, bt_stop, resultfiles], show_progress='full')
             after_swap_event = start_event.then(fn=on_resultfiles_finished, inputs=[resultfiles], outputs=[resultimage, resultvideo])
@@ -407,7 +409,7 @@ def on_end_face_selection():
 
 def on_preview_frame_changed(frame_num, files, fake_preview, enhancer, detection, face_distance, blend_ratio,
-                             selected_mask_engine, clip_text, no_face_action, vr_mode, auto_rotate, maskimage, show_face_area, num_steps):
+                             selected_mask_engine, clip_text, no_face_action, vr_mode, auto_rotate, maskimage, show_face_area, num_steps, upsample):
     global SELECTED_INPUT_FACE_INDEX, manual_masking, current_video_fps
 
     from roop.core import live_swap, get_processing_plugins
@@ -454,6 +456,8 @@ def on_preview_frame_changed(frame_num, files, fake_preview, enhancer, detection
     roop.globals.no_face_action = index_of_no_face_action(no_face_action)
     roop.globals.vr_mode = vr_mode
     roop.globals.autorotate_faces = auto_rotate
+    roop.globals.subsample_size = int(upsample[:3])
+
     mask_engine = map_mask_engine(selected_mask_engine, clip_text)
@@ -464,7 +468,7 @@ def on_preview_frame_changed(frame_num, files, fake_preview, enhancer, detection
         face_index = 0
 
     options = ProcessOptions(get_processing_plugins(mask_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
-                              roop.globals.face_swap_mode, face_index, clip_text, maskimage, num_steps, show_face_area)
+                              roop.globals.face_swap_mode, face_index, clip_text, maskimage, num_steps, roop.globals.subsample_size, show_face_area)
     current_frame = live_swap(current_frame, options)
     if current_frame is None:
@@ -538,7 +542,7 @@ def on_preview_mask(frame_num, files, clip_text, mask_engine):
     elif mask_engine == "DFL XSeg":
         mask_engine = "mask_xseg"
     options = ProcessOptions(get_processing_plugins(mask_engine), roop.globals.distance_threshold, roop.globals.blend_ratio,
-                              "all", 0, clip_text, None, 0, False, True)
+                              "all", 0, clip_text, None, 0, 128, False, True)
     current_frame = live_swap(current_frame, options)
     return util.convert_to_gradio(current_frame)
@@ -576,7 +580,7 @@ def translate_swap_mode(dropdown_text):
 
 def start_swap( enhancer, detection, keep_frames, wait_after_extraction, skip_audio, face_distance, blend_ratio,
-                selected_mask_engine, clip_text, processing_method, no_face_action, vr_mode, autorotate, num_swap_steps, imagemask, progress=gr.Progress()):
+                selected_mask_engine, clip_text, processing_method, no_face_action, vr_mode, autorotate, num_swap_steps, upsample, imagemask, progress=gr.Progress()):
     from ui.main import prepare_environment
     from roop.core import batch_process_regular
     global is_processing, list_files_process
@@ -604,6 +608,7 @@ def start_swap( enhancer, detection, keep_frames, wait_after_extraction, skip_au
     roop.globals.no_face_action = index_of_no_face_action(no_face_action)
     roop.globals.vr_mode = vr_mode
     roop.globals.autorotate_faces = autorotate
+    roop.globals.subsample_size = int(upsample[:3])
     mask_engine = map_mask_engine(selected_mask_engine, clip_text)
 
     if roop.globals.face_swap_mode == 'selected':
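The preview and batch handlers turn the dropdown string into a pixel count with int(value[:3]), and the livecam tab below does the same; this is safe only while every choice has a three-digit prefix ("128px", "256px", "512px"). A quick illustration, plus a slightly more robust variant in case a four-digit size is ever added:

choices = ["128px", "256px", "512px"]
print([int(c[:3]) for c in choices])                 # [128, 256, 512]

# stripping the unit instead of slicing also handles e.g. "1024px"
print([int(c.removesuffix("px")) for c in choices])  # Python 3.9+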
diff --git a/ui/tabs/livecam_tab.py b/ui/tabs/livecam_tab.py
index a5b5a228f..9115d39f2 100644
--- a/ui/tabs/livecam_tab.py
+++ b/ui/tabs/livecam_tab.py
@@ -29,17 +29,18 @@ def livecam_tab():
     with gr.Row():
         fake_cam_image = gr.Image(label='Fake Camera Output', interactive=False)
 
-    start_event = bt_start.click(fn=start_cam, inputs=[cb_obs, camera_num, dd_reso, ui.globals.ui_selected_enhancer, ui.globals.ui_blend_ratio],outputs=[bt_start, bt_stop,fake_cam_image])
+    start_event = bt_start.click(fn=start_cam, inputs=[cb_obs, camera_num, dd_reso, ui.globals.ui_selected_enhancer, ui.globals.ui_blend_ratio, ui.globals.ui_upscale],outputs=[bt_start, bt_stop,fake_cam_image])
     bt_stop.click(fn=stop_swap, cancels=[start_event], outputs=[bt_start, bt_stop], queue=False)
 
 
-def start_cam(stream_to_obs, cam, reso, enhancer, blend_ratio):
+def start_cam(stream_to_obs, cam, reso, enhancer, blend_ratio, upscale):
     from roop.virtualcam import start_virtual_cam
     from roop.utilities import convert_to_gradio
     start_virtual_cam(stream_to_obs, cam, reso)
     roop.globals.selected_enhancer = enhancer
     roop.globals.blend_ratio = blend_ratio
+    roop.globals.subsample_size = int(upscale[:3])
     while True:
         yield gr.Button(interactive=False), gr.Button(interactive=True), convert_to_gradio(ui.globals.ui_camera_frame)
diff --git a/ui/tabs/settings_tab.py b/ui/tabs/settings_tab.py
index f1b34e91a..ac82076d3 100644
--- a/ui/tabs/settings_tab.py
+++ b/ui/tabs/settings_tab.py
@@ -102,14 +102,13 @@ def on_settings_changed(evt: gr.SelectData):
 
 def clean_temp():
     from ui.main import prepare_environment
-    if not roop.globals.CFG.use_os_temp_folder:
-        shutil.rmtree(os.environ["TEMP"])
-    prepare_environment()
-
     ui.globals.ui_input_thumbs.clear()
     roop.globals.INPUT_FACESETS.clear()
     roop.globals.TARGET_FACES.clear()
     ui.globals.ui_target_thumbs = []
+    if not roop.globals.CFG.use_os_temp_folder:
+        shutil.rmtree(os.environ["TEMP"])
+    prepare_environment()
     gr.Info('Temp Files removed')
     return None,None,None,None