-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalignment_pipeline_distributed.sh
executable file
·119 lines (101 loc) · 4.28 KB
/
alignment_pipeline_distributed.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
if test $# -ne 2
then
echo "Usage:$0 <date> <id>"
exit
fi
date=$1
id=$2
now=$(/bin/date +%Y-%m-%d-%H-%M)
audiofile=audio/${date}.flac
if [ ! -e $audiofile ]
then
echo "audiofile not found"
exit
fi
key=$(cat key.txt)
echo "Downloading transcript:"
curl -X GET https://api-dot-spraklabben.appspot.com/transcripts/${id}/paragraphs -H 'Content-Type: application/json' -H 'Postman-Token: 2ae3e85b-519a-42ce-b42a-dc0dcd8e223f' -H 'cache-control: no-cache' --user $key > intermediary/GET_${date}_${now}.json
transcription=intermediary/GET_${date}_${now}.json
cp $transcription archive/GET_${date}_${now}.json
if [ -e $transcription ]
then
cp $transcription non-aligned
else
echo "Transcription not found"
exit
fi
echo "Finding head and tail of audio file"
head_tail=$(python get_head_tail.py $audiofile $transcription)
head=$(echo $head_tail | grep -oP "^\d*\.\d*")
tail=$(echo $head_tail | grep -oP "\d*\.\d*$")
echo "head: $head; tail: $tail"
echo "Converting transcription to lines"
python json_to_lines.py $transcription
lines=$transcription.lines.txt
cp $lines archive/GET_${date}_${now}.json.lines.txt
echo "Running forced alignment"
#if multilevel
#python -m aeneas.tools.execute_task $audiofile $lines -r="mfcc_window_length=0.15|mfcc_window_shift=0.05|mfcc_size=25" "task_language=nor|is_text_type=mplain|os_task_file_format=json|is_audio_file_head_length=$head|is_audio_file_tail_length=$tail|task_adjust_boundary_algorithm=beforenext|task_adjust_boundary_beforenext_value=0.050" intermediary/AENEAS_${date}_${now}.json
#sentence only
python -m aeneas.tools.execute_task $audiofile $lines \
-r="mfcc_window_length=0.15|mfcc_window_shift=0.05|mfcc_size=25" \
"task_language=nor|is_text_type=plain|os_task_file_format=json|is_audio_file_head_length=$head|is_audio_file_tail_length=$tail|task_adjust_boundary_algorithm=beforenext|task_adjust_boundary_beforenext_value=0.050" \
intermediary/AENEAS_${date}_${now}.json
aeneas=intermediary/AENEAS_${date}_${now}.json
cp $aeneas archive/AENEAS_${date}_${now}.json
if [ -e $aeneas ]
then
echo "Creating sent-aligned transcription"
python align.py $transcription $aeneas intermediary/SENT_ALIGNED_${date}_${now}.json
new_transcription=intermediary/SENT_ALIGNED_${date}_${now}.json
else
echo "Alignmentfile not found"
exit
fi
if [ -e $new_transcription ]
then
echo "Splitting audiofile"
python chopper.py $new_transcription $audiofile
else
echo "Transcription not found"
exit
fi
slices=$(ls intermediary | grep -Po "(?<=SENT_ALIGNED_${date}_${now}.json.lines.)\d*_\d*" | sort)
filelist=''
for elem in $slices
do
audioslice=intermediary/SENT_ALIGNED_${date}_${now}.json.sound.${elem}.flac
lineslice=intermediary/SENT_ALIGNED_${date}_${now}.json.lines.${elem}.txt
if [ -e $audioslice ] && [ -e $lineslice ]
then
echo "Running forced alignment on $audioslice"
python -m aeneas.tools.execute_task $audioslice $lineslice -r="mfcc_window_length=0.15|mfcc_window_shift=0.05|mfcc_size=25" \
"task_language=nor|is_text_type=mplain|os_task_file_format=json|task_adjust_boundary_algorithm=beforenext|task_adjust_boundary_beforenext_value=0.050" \
intermediary/WORD_ALIGNED_${date}_${now}_${elem}_aeneas.json
currentfile=intermediary/WORD_ALIGNED_${date}_${now}_${elem}_aeneas.json
if [ -e $currentfile ]
then
filelist+="$currentfile,"
else
echo "Fail at $currentfile"
exit
fi
else
echo "$audioslice or $lineslice does not exist"
exit
fi
done
echo "Creating word-aligned transcription"
python merge_transcriptions.py $new_transcription $filelist aligned/WORD_ALIGNED_${date}_${now}.json
word_aligned=aligned/WORD_ALIGNED_${date}_${now}.json
if [ -e $word_aligned ]
then
cp $word_aligned archive/
python view_diffs.py $new_transcription $word_aligned | more
#echo "Pushing aligned transcription to Språklabben"
#curl -X PUT https://api-dot-spraklabben.appspot.com/transcripts/${id}/paragraphs -H 'Content-Type: application/json' -H 'Postman-Token: 5e8937eb-9f2d-4af4-8d53-9ccc073fb8a2' -H 'cache-control: no-cache' --user $key --data-binary "@${new_transcription}" > curl_output/PUT_word_aligned_${date}_${now}.json
else
echo "$word_aligned does not exist"
exit
fi