Building the DHBB corpus.
Run UDPipe on every raw file that has a corresponding JSON in the ner folder (UDPipe substitutes {} in --outfile with the base name of each input file):
for f in ner/*.json; do echo raw/$(basename $f .json).raw; done | xargs ~/work/udpipe-1.2.0/bin-osx/udpipe --outfile=udp/{}.conllu --tokenizer="normalized_spaces;ranges" --tag --parse ~/work/udpipe-1.2.0/models/portuguese-bosque-ud-2.5-191206.udpipe
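Optional sanity check that every ner JSON produced a parse (a minimal sketch, assuming the {} expansion above yields udp/<base>.conllu):
for f in ner/*.json; do test -f udp/$(basename $f .json).conllu || echo "missing: $f"; done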
Better to use ner/nlu.py for this.
For a sample of files in raw/ (the glob below matches only single-character file names):
for f in raw/?.raw; do
curl -G --header "Content-Type: application/json" -u SECRET --data-urlencode "text@$f" "https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/a9eda6db-309d-4e9f-8454-0464bbbf7575/v1/analyze?version=2020-08-01&features=entities,relations&entities.model=073dab23-dd1e-4ded-badf-f502eb06372c&entities.mentions=true&return_analyzed_text=true" > ner/$(basename $f .raw).json;
done
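To spot-check the returned entities (assuming the standard NLU response shape, with mentions included as requested above; ner/1.json is just an example file):
jq -r '.entities[] | [.type, .text, (.mentions | length)] | @tsv' ner/1.json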
For each document in wks:
for f in `cat wks/documents.json| jq -r '.[] | .name'` ; do
IN=$(basename $f .txt);
echo Processing $IN;
curl -G --header "Content-Type: application/json" -u $KEY --data-urlencode "text@../raw/$IN.raw" "$URL/v1/analyze?version=2020-08-01&features=entities,relations&entities.model=$MODEL&entities.mentions=true&return_analyzed_text=true" > $IN.json;
done
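The loop assumes KEY, URL and MODEL are set; taking the values from the sample call above (replace SECRET with the service API key):
KEY=SECRET
URL=https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/a9eda6db-309d-4e9f-8454-0464bbbf7575
MODEL=073dab23-dd1e-4ded-badf-f502eb06372c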
To compare ner/*.json with the corresponding gold annotations in ner/wks/gt/*.json:
cat wks/documents.json| jq -r '.[] | "echo Processing " + .name + "\ntest-ner -c " + .name + ".json wks/gt/" + .id + ".json"' > temp.sh
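Then run the generated script:
sh temp.sh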
To import new files into WKS:
jq -r '.[] | .name | "raw/" + rtrimstr(".txt") + ".raw"' ner/wks/documents.json > a.list   # raw files already in WKS
wc -l raw/*.raw | awk '$1 < 7 && $1 > 2 {print $2}' > b.list   # raw files with 3 to 6 lines
diff --new-line-format="" --unchanged-line-format="" <(sort b.list) <(sort a.list) | shuf | head -50 > n.list   # 50 random short files not yet in WKS
for f in `cat n.list`; do cp $f tmp/`basename $f .raw`.txt; done   # stage them as .txt for the WKS upload
rm a.list b.list n.list
To parse the pre-segmented sentence files in sents/ (one sentence per line):
for f in sents/*.sent; do echo $f; done | xargs ~/work/udpipe-1.2.0/bin-osx/udpipe --outfile=udp-mini/{}.conllu --tokenizer="normalized_spaces;presegmented" --tag --parse ~/work/udpipe-1.2.0/models/portuguese-bosque-ud-2.5-191206.udpipe