# align.py

#!/usr/bin/env python
# coding=utf-8

import sys
import os
import json
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
from aeneas.textfile import TextFile
from aeneas.language import Language
from aeneas.textfile import TextFragment
from json_to_lines import get_par_lines
from audioread import audio_open
from word_time_distribution import distribute_words
from read_aeneas_json import parse_aeneas

def total_FA(soundfile, mylines, myhead, mytail, config=None):
    """Run an Aeneas forced-alignment task in-process and return its fragments.

    This function isn't in use, currently, as we haven't managed to get
    reliable results when running Aeneas as a library.

    soundfile -- path assigned to the task as its absolute audio file path.
    mylines   -- iterable of (identifier, lines) pairs; each pair becomes one
                 TextFragment whose lines and filtered lines are the same
                 object.
    myhead    -- value interpolated into is_audio_file_head_length.
    mytail    -- value interpolated into is_audio_file_tail_length.
    config    -- optional extra configuration, appended to the config string
                 after a "|" separator when it is not None.

    Returns task.sync_map.fragments as produced by ExecuteTask.
    """
    # Build the format template and its arguments step by step so the base
    # configuration is spelled out only once.
    template = u"task_language=nor|is_text_type=plain|os_task_file_format=json|is_audio_file_head_length=%s|is_audio_file_tail_length=%s"
    values = (myhead, mytail)
    if config is not None:
        template += u"|%s"
        values += (config,)
    config_string = template % values
    print(config_string)

    # Create the task and point it at the audio file.
    task = Task(config_string=config_string)
    print(task)
    task.audio_file_path_absolute = soundfile

    # Assemble the text to align, one fragment per input pair.
    text = TextFile()
    print(text)
    for frag_id, frag_lines in mylines:
        fragment = TextFragment(frag_id, Language.NOR, frag_lines, frag_lines)
        text.add_fragment(fragment)
    task.text_file = text
    print(len(task.text_file))

    ExecuteTask(task).execute()
    return task.sync_map.fragments


def compute_alignments(soundfile, asr_dict, config=None):
    """Create a list of dicts with paragraph ids and timecodes by running
    Aeneas as a library.

    Since we haven't managed to run Aeneas as a library, this function is not
    in use. In the alternative pipeline where Aeneas is run in the terminal,
    this function is replaced by read_aeneas_json.parse_aeneas.

    soundfile -- path of the audio file; opened with audio_open to read its
                 duration and then passed on to total_FA.
    asr_dict  -- transcription dict handed to get_par_lines, whose result is
                 indexed here by the keys 'id', 'string', 'start' and 'end'.
    config    -- optional extra Aeneas configuration, forwarded to total_FA.

    Returns a list of {'id', 'start', 'end'} dicts, one per aligned speech
    fragment, with 'start' and 'end' scaled by 1e9 and truncated to int.
    Returns an empty list when there are no paragraphs to align.
    """
    with audio_open(soundfile) as sf:
        duration = sf.duration
    # Paragraph times are divided by this factor so they are comparable with
    # the audio duration (presumably nanoseconds -> seconds; confirm against
    # get_par_lines).
    bil = 1000000000
    # Margin kept around the speech when computing head and tail lengths.
    margin = 3
    paragraphs = get_par_lines(asr_dict)
    if not paragraphs:
        # Nothing to align; avoids an IndexError on paragraphs[0] below.
        return []
    speechstart = paragraphs[0]['start'] / bil
    speechend = paragraphs[-1]['end'] / bil
    allines = [(x['id'], [x['string']]) for x in paragraphs]
    # Clamp at zero: when speech begins (or ends) within the margin of the
    # file boundary the subtraction goes negative, and a negative head/tail
    # length is meaningless as an Aeneas setting.
    head = max(0, speechstart - margin)
    tail = max(0, duration - speechend - margin)
    align = total_FA(soundfile, allines, head, tail, config)
    # Drop the first and last fragments (presumably the head and tail
    # fragments reported by Aeneas -- TODO confirm).
    speech_fragments = align[1:-1]
    return [
        {
            'id': paragraphs[n]['id'],
            'start': int(fragment.begin * bil),
            'end': int(fragment.end * bil),
        }
        for n, fragment in enumerate(speech_fragments)
    ]


def realign_json(googledict, aeneasdict):
    """Combine a Google Cloud StT transcription with Aeneas timecodes.

    Takes a Google Cloud StT transcription and the same transcription
    forcefully aligned with Aeneas, both as loaded json. Returns a dict with a
    single 'paragraphs' list that mirrors the Google paragraphs ('id',
    'speaker', 'words'), where each paragraph's 'startTime', the 'startTime'
    of its first word and the 'endTime' of its last word come from the Aeneas
    alignment. The words are then passed through
    word_time_distribution.distribute_words, which supplies the remaining
    word timecodes.

    Note that the first and last word dicts of every paragraph in googledict
    are modified in place.
    """
    alignments = parse_aeneas(googledict, aeneasdict)
    realigned = []
    for index, paragraph in enumerate(googledict['paragraphs']):
        entry = {
            'id': paragraph['id'],
            'speaker': paragraph['speaker'],
            'startTime': alignments[index]['start'],
        }
        # Pin the outer word boundaries to the Aeneas timecodes before the
        # inner word times are distributed.
        words = paragraph['words']
        words[0]['startTime'] = alignments[index]['start']
        words[-1]['endTime'] = alignments[index]['end']
        entry['words'] = distribute_words(words)
        realigned.append(entry)
    return {'paragraphs': realigned}

    
if __name__ == "__main__":
    # Command-line entry point: read the Google and Aeneas json files,
    # realign them, and write the result to the output file.
    # Three filenames are required after the script name; extra arguments
    # are ignored.
    if len(sys.argv) < 4:
        sys.exit("Please provide filenames: python align.py googlejson aeneasjson outfile")
    googlejson, aeneasjson, outfile = sys.argv[1:4]

    with open(googlejson, 'r') as google:
        googledict = json.load(google)
    with open(aeneasjson, 'r') as aeneas:
        aeneasdict = json.load(aeneas)

    newdict = realign_json(googledict, aeneasdict)

    # ensure_ascii=False keeps non-ASCII characters unescaped in the output.
    with open(outfile, 'w') as out:
        json.dump(newdict, out, ensure_ascii=False)