# gloss2pose.py

import os
import re
from typing import List, Tuple
import numpy as np
import scipy.signal
from scipy.spatial.distance import cdist
from pose_format import Pose
from pose_format.utils.generic import reduce_holistic, correct_wrists, pose_normalization_info
from pose_format.numpy import NumPyPoseBody
from num2words import num2words


# concatenate


def normalize_pose(pose: Pose) -> Pose:
    """Normalize ``pose`` with the normalization info derived from its header.

    Args:
        pose: The pose to normalize.

    Returns:
        Whatever ``pose.normalize`` returns for the info computed by
        ``pose_normalization_info(pose.header)``.
    """
    normalization_info = pose_normalization_info(pose.header)
    normalized = pose.normalize(normalization_info)
    return normalized


def trim_pose(pose: Pose, start=True, end=True):
    """Trim leading and/or trailing frames in which neither wrist is detected.

    A frame counts as "hand visible" when the summed confidence of the left
    and right hand ``WRIST`` points of person 0 is greater than zero.

    Args:
        pose: Pose to trim. It is modified in place (``body.data`` and
            ``body.confidence`` are replaced by slices) and also returned.
        start: When true, drop the frames before the first hand-visible frame.
        end: When true, drop the frames after the last hand-visible frame.

    Returns:
        The same ``pose`` object. It is returned untouched when it has no
        frames or when no frame has a visible hand.
    """
    if len(pose.body.data) == 0:
        return pose

    wrist_indexes = [
        pose.header._get_point_index('LEFT_HAND_LANDMARKS', 'WRIST'),
        pose.header._get_point_index('RIGHT_HAND_LANDMARKS', 'WRIST')
    ]
    either_hand = pose.body.confidence[:, 0, wrist_indexes].sum(axis=1) > 0

    if not either_hand.any():
        # np.argmax on an all-False mask reports index 0, which would not
        # describe a real hand-visible frame; there is nothing to trim by.
        return pose

    first_visible = int(np.argmax(either_hand)) if start else 0
    if end:
        # argmax on the reversed mask gives the distance of the last visible
        # frame from the end. Slice ends are exclusive, so the stop index is
        # one past that frame (hence no "- 1" here).
        stop = len(either_hand) - int(np.argmax(either_hand[::-1]))
    else:
        stop = len(either_hand)

    pose.body.data = pose.body.data[first_visible:stop]
    pose.body.confidence = pose.body.confidence[first_visible:stop]
    return pose


def concatenate_poses(poses: List[Pose]) -> Pose:
    """Join several poses into one rescaled pose.

    The stages run in this order, each over the whole list before the next
    one starts: ``reduce_holistic``, ``normalize_pose``, ``trim_pose``. The
    trimmed poses are then joined with ``smooth_concatenate_poses`` and passed
    through ``correct_wrists``.

    Args:
        poses: The poses to join, in playback order.

    Returns:
        The joined pose, with coordinates shifted by 1.25 and multiplied by
        512, and with header width and height both set to 1280.
    """
    reduced = list(map(reduce_holistic, poses))
    normalized = list(map(normalize_pose, reduced))

    # Keep only the hand-visible part of each pose, but never trim the start
    # of the first pose or the end of the last one.
    final_position = len(normalized) - 1
    trimmed = [
        trim_pose(item, position > 0, position < final_position)
        for position, item in enumerate(normalized)
    ]

    # Wrist correction has to come after smoothing.
    merged = correct_wrists(smooth_concatenate_poses(trimmed))

    # Shift every axis by the same offset, then scale up to the target size.
    target_width = 512
    offset = 1.25
    axis_count = merged.body.data.shape[-1]
    offset_per_axis = np.full(
        shape=(axis_count), fill_value=offset, dtype=np.float32)
    merged.body.data = (merged.body.data + offset_per_axis) * target_width

    canvas_size = int(target_width * offset * 2)
    merged.header.dimensions.height = merged.header.dimensions.width = canvas_size

    return merged


# lookup


class PoseLookup:
    """Resolve glosses to poses stored as ``.pose`` files on disk.

    The vocabulary is read from ``<directory>/words.txt`` (one entry per
    line). Pose files are read from ``<directory>/<language>/<entry>.pose``.
    """

    def __init__(self, directory: str, language: str):
        """Load the vocabulary and remember where the pose files live.

        Args:
            directory: Folder containing ``words.txt`` and the language
                sub-folders.
            language: Name of the sub-folder the pose files are read from.
        """
        vocabulary_path = os.path.join(directory, 'words.txt')
        with open(vocabulary_path, mode='r', encoding='utf-8') as f:
            # Entries are stored without their newline characters.
            self.glosses = {line.replace("\n", "") for line in f.readlines()}

        self.directory = directory
        self.language = language

    def read_pose(self, pose_path: str):
        """Read ``<directory>/<language>/<pose_path>.pose`` and parse it."""
        file_name = pose_path + ".pose"
        full_path = os.path.join(self.directory, self.language, file_name)
        with open(full_path, "rb") as f:
            raw = f.read()
        return Pose.read(raw)

    def lookup(self, word: str) -> Pose:
        """Return the pose for ``word``, or ``None`` if it is not in the vocabulary.

        The word is lower-cased and stripped of surrounding whitespace before
        it is compared with the vocabulary and used as the file name.
        """
        key = word.lower().strip()
        if key not in self.glosses:
            return None
        return self.read_pose(key)

    def lookup_sequence(self, glosses: List[str]) -> Tuple[List[Pose], List[str]]:
        """Look up every gloss, falling back to its individual characters.

        A gloss with no pose of its own is looked up one character at a
        time; characters without a pose are skipped.

        Returns:
            Two parallel lists: the poses found, and the gloss or character
            (as given, not normalized) each pose was found for.
        """
        poses: List[Pose] = []
        words: List[str] = []

        def collect(token: str) -> bool:
            # Record the pose for ``token`` if there is one; report success.
            found = self.lookup(token)
            if not found:
                return False
            poses.append(found)
            words.append(token)
            return True

        for gloss in glosses:
            if collect(gloss):
                continue
            for char in gloss:
                collect(char)

        return poses, words

    def gloss_to_pose(self, glosses: List[str]) -> Tuple[Pose, List[str]]:
        """Turn a list of glosses into one concatenated pose.

        Returns:
            ``(pose, words)`` where ``words`` lists what was actually found,
            or ``(None, None)`` when no pose was found at all.
        """
        poses, words = self.lookup_sequence(glosses)
        if not poses:
            return None, None

        # Merge the individual poses into a single one.
        return concatenate_poses(poses), words


# smoothing


def pose_savgol_filter(pose: Pose):
    """Smooth every non-face point of person 0 over time, in place.

    Each coordinate of each point is filtered independently along the frame
    axis with a Savitzky-Golay filter (window length 3, polynomial order 1).
    Smoothing the face does not give a good result, so every point of the
    ``FACE_LANDMARKS`` component, first to last inclusive, is left untouched.

    If we want this to be faster, here is a possible solution:
    https://stackoverflow.com/questions/75221888/fast-savgol-filter-on-3d-tensor/75406720#75406720

    Args:
        pose: Pose whose ``body.data`` is modified in place. Its header must
            contain exactly one component named ``FACE_LANDMARKS``.

    Returns:
        The same ``pose`` object. It is returned unchanged when it has fewer
        frames than the filter window.

    Raises:
        ValueError: If the header does not contain exactly one
            ``FACE_LANDMARKS`` component.
    """
    window_length = 3
    polyorder = 1

    [face_component] = [c for c in pose.header.components if c.name == 'FACE_LANDMARKS']
    first_face = pose.header._get_point_index(
        'FACE_LANDMARKS', face_component.points[0])
    last_face = pose.header._get_point_index(
        'FACE_LANDMARKS', face_component.points[-1])
    # range() excludes its stop value, so add 1 to keep the last face point
    # out of the smoothing as well.
    face_range = range(first_face, last_face + 1)

    frames, _, points, dims = pose.body.data.shape
    if frames < window_length:
        # savgol_filter rejects a window longer than the signal in its
        # default mode, and there is too little data to smooth anyway.
        return pose

    for p in range(points):
        if p in face_range:
            continue
        for d in range(dims):
            pose.body.data[:, 0, p, d] = scipy.signal.savgol_filter(
                pose.body.data[:, 0, p, d], window_length, polyorder)
    return pose


def create_padding(time: float, example: Pose) -> NumPyPoseBody:
    """Build an all-zero pose body to insert between two poses.

    Args:
        time: Multiplied by the example's fps and truncated to an integer to
            get the number of padding frames.
        example: Pose whose fps and data shape (axes 1 to 3) the padding
            copies.

    Returns:
        A ``NumPyPoseBody`` with zero-filled data and zero-filled confidence.
    """
    fps = example.body.fps
    frame_count = int(time * fps)
    people, points, dims = example.body.data.shape[1:4]

    zero_data = np.zeros(shape=(frame_count, people, points, dims))
    zero_confidence = np.zeros(shape=(frame_count, people, points))
    return NumPyPoseBody(fps=fps, data=zero_data, confidence=zero_confidence)


def s_concatenate_poses(poses: List[Pose], padding: NumPyPoseBody, interpolation='linear') -> Pose:
    """Join poses along the frame axis, with padding between neighbours.

    Every pose except the last is modified in place: ``padding`` is appended
    to its data and confidence. The joined body is then passed through
    ``NumPyPoseBody.interpolate`` (presumably to fill the zero-confidence
    padding frames; confirm against pose_format).

    Args:
        poses: Poses to join, in order.
        padding: Body whose data and confidence are appended after each pose
            but the last.
        interpolation: Passed as ``kind`` to ``interpolate``.

    Returns:
        A new ``Pose`` using the first pose's header and fps.
    """
    # Every pose but the last one gets the padding appended.
    for item in poses[:-1]:
        body = item.body
        body.data = np.concatenate((body.data, padding.data))
        body.confidence = np.concatenate((body.confidence, padding.confidence))

    # Stack everything into a single body.
    stacked_data = np.concatenate([item.body.data for item in poses])
    stacked_confidence = np.concatenate([item.body.confidence for item in poses])

    reference = poses[0]
    merged_body = NumPyPoseBody(
        fps=reference.body.fps, data=stacked_data, confidence=stacked_confidence)
    merged_body = merged_body.interpolate(kind=interpolation)
    return Pose(header=reference.header, body=merged_body)


def find_best_connection_point(pose1: Pose, pose2: Pose, window=0.3):
    """Find the closest pair of frames between the end of one pose and the start of the next.

    The last ``window`` fraction of ``pose1`` is compared with the first
    ``window`` fraction of ``pose2``. Each frame is flattened to a vector and
    the pair with the smallest Euclidean distance is chosen.

    Args:
        pose1: The pose that comes first.
        pose2: The pose that follows.
        window: Fraction of each pose's frames to search. The resulting
            window is clamped to between one frame and the whole pose, so
            short poses still yield a connection point.

    Returns:
        ``(index_in_pose1, index_in_pose2)`` of the best-matching frames,
        both as absolute frame indices within their own pose.

    Raises:
        ValueError: If either pose has no frames.
    """
    frames1 = len(pose1.body.data)
    frames2 = len(pose2.body.data)
    if frames1 == 0 or frames2 == 0:
        raise ValueError("Cannot find a connection point for an empty pose")

    # int() truncates to 0 for poses shorter than 1 / window frames, which
    # would leave nothing to compare; a fraction above 1 would overshoot.
    p1_size = min(frames1, max(1, int(frames1 * window)))
    p2_size = min(frames2, max(1, int(frames2 * window)))

    last_data = pose1.body.data[frames1 - p1_size:]
    first_data = pose2.body.data[:p2_size]

    # One row per frame, all people/points/dimensions flattened.
    last_vectors = last_data.reshape(len(last_data), -1)
    first_vectors = first_data.reshape(len(first_data), -1)

    distances_matrix = cdist(last_vectors, first_vectors, 'euclidean')
    row, col = np.unravel_index(
        np.argmin(distances_matrix, axis=None), distances_matrix.shape)

    # Convert the row back from window-relative to an index into pose1.
    last_index = frames1 - p1_size + int(row)
    return last_index, int(col)


def smooth_concatenate_poses(poses: List[Pose], padding=0.20) -> Pose:
    """Cut neighbouring poses at their best connection points, join and smooth them.

    Each pose's body is replaced in place by the slice between the
    connection point with its predecessor and the one with its successor.
    The first pose starts at frame 0 and the last one runs to its end.

    Args:
        poses: Poses to join, in order. A single pose is returned as is,
            without any smoothing.
        padding: Passed to ``create_padding`` as the time of the gap inserted
            between consecutive poses.

    Returns:
        The joined pose after ``pose_savgol_filter``.
    """
    if len(poses) == 1:
        return poses[0]

    final_position = len(poses) - 1
    cut_from = 0
    for position, current in enumerate(poses):
        if position == final_position:
            # Nothing follows the last pose, so keep it up to its end.
            cut_to = len(current.body.data)
            upcoming_start = None
        else:
            cut_to, upcoming_start = find_best_connection_point(
                current, poses[position + 1])

        current.body = current.body[cut_from:cut_to]
        cut_from = upcoming_start

    gap = create_padding(padding, poses[0])
    joined = s_concatenate_poses(poses, gap)
    return pose_savgol_filter(joined)


# utils

def scale_down(pose: Pose, value: int = 256):
    """Rescale ``pose`` in place so that its header width becomes ``value``.

    The header width, header height and all coordinates are divided by
    ``width / value``. The header dimensions are truncated to integers, and
    ``body.data`` is replaced by a new, divided array.

    Args:
        pose: Pose to rescale; modified in place.
        value: Target width.
    """
    dimensions = pose.header.dimensions
    factor = dimensions.width / value

    dimensions.width = int(dimensions.width / factor)
    dimensions.height = int(dimensions.height / factor)
    pose.body.data = pose.body.data / factor


def scale_up(pose: Pose, value: int = 2):
    """Multiply the coordinates and header dimensions of ``pose`` by ``value``, in place.

    Args:
        pose: Pose to enlarge; its ``body.data`` is multiplied in place.
        value: Scale factor.
    """
    body = pose.body
    body.data *= value

    dimensions = pose.header.dimensions
    dimensions.width *= value
    dimensions.height *= value


def prepare_glosses(sentence: str) -> List[str]:
    """Split a sentence into lower-case glosses, spelling out numbers.

    The sentence is lower-cased and split into runs of ASCII letters and
    digits. A token made only of digits is replaced by the words that
    ``num2words`` produces for it.

    Args:
        sentence: Free text to tokenize.

    Returns:
        The list of glosses, in sentence order.
    """
    glosses: List[str] = []
    for word in re.findall(r'\b[a-zA-Z0-9]+\b', sentence.lower()):
        if not word.isdigit():
            glosses.append(word)
            continue
        # num2words joins words with hyphens and commas (e.g. "twenty-one",
        # "one thousand, ..."); keep only the letter runs so that every
        # number word is a clean gloss of its own.
        spelled_out = num2words(int(word)).lower()
        glosses.extend(re.findall(r'[a-z]+', spelled_out))

    return glosses