-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathP1getstarted.py
71 lines (52 loc) · 1.99 KB
/
P1getstarted.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from io import open
from nltk import FreqDist
from nltk import WittenBellProbDist
from nltk.util import ngrams
from conllu import parse_incr, TokenList
import pandas as pd
from collections import Counter
# Map ISO language codes to their Universal Dependencies treebank path prefixes.
corpora = {
    'en': 'UD_English-EWT/en_ewt',
    'es': 'UD_Spanish-GSD/es_gsd',
    'nl': 'UD_Dutch-Alpino/nl_alpino',
}
def train_corpus(lang):
    """Return the CoNLL-U training-split path for language code *lang*."""
    prefix = corpora[lang]
    return prefix + '-ud-train.conllu'
def test_corpus(lang):
    """Return the CoNLL-U test-split path for language code *lang*."""
    prefix = corpora[lang]
    return prefix + '-ud-test.conllu'
# Remove contractions such as "isn't".
def prune_sentence(sent):
    """Keep only the tokens whose 'id' is a plain int.

    Multi-word contraction tokens in CoNLL-U carry a non-int id
    (a range tuple), so this filters them out.
    """
    kept = []
    for token in sent:
        # type(...) is int (not isinstance) so bool ids would not slip through.
        if type(token['id']) is int:
            kept.append(token)
    return kept
def conllu_corpus(path):
    """Parse a CoNLL-U file into a list of pruned sentences.

    Opens *path* as UTF-8, parses all sentences with conllu.parse_incr,
    and removes multi-word contraction tokens via prune_sentence.

    Args:
        path: filesystem path to a .conllu file.

    Returns:
        list of sentences, each a list of token dicts with int ids.
    """
    # Fix: use a context manager so the file handle is always closed
    # (the original opened the file and never closed it).
    with open(path, 'r', encoding='utf-8') as data_file:
        sents = list(parse_incr(data_file))
    return [prune_sentence(sent) for sent in sents]
def get_wordTagList_total_tags_total_tokens(data_set):
    """Flatten a tagged corpus into (word, tag) pairs and parallel lists.

    Args:
        data_set: iterable of sentences, each a list of token dicts
            exposing 'form' (the word) and 'upos' (the POS tag).

    Returns:
        A 3-tuple (words_and_tags, tokens_list, tags_list):
        words_and_tags is a list of [form, upos] pairs; tokens_list and
        tags_list are the corresponding flat lists of forms and tags.
    """
    words_and_tags = []
    tokens_list = []
    tags_list = []
    # Bug fix: iterate over the data_set argument instead of the global
    # train_sents, so the function works for any corpus passed in
    # (test sets, other languages) and outside the __main__ block.
    for sent in data_set:
        for token in sent:
            words_and_tags.append([token['form'], token['upos']])
            tokens_list.append(token['form'])
            tags_list.append(token['upos'])
    return words_and_tags, tokens_list, tags_list
if __name__ == '__main__':
    # Choose language: one of the corpora keys ('en', 'es', 'nl').
    lang = 'en'
    # Limit length of sentences to avoid underflow.
    # (Long sentences multiply many small probabilities downstream.)
    max_len = 100
    # Load and prune the training and test splits for the chosen language.
    train_sents = conllu_corpus(train_corpus(lang))
    test_sents = conllu_corpus(test_corpus(lang))
    # Drop over-long test sentences per the max_len cap above.
    test_sents = [sent for sent in test_sents if len(sent) <= max_len]
    print(len(train_sents), 'training sentences')
    print(len(test_sents), 'test sentences')
    # Illustration how to access the word and the part-of-speech of tokens.
    # for sent in train_sents:
    #     for token in sent:
    #         print(token['form'], '->', token['upos'], sep='', end=' ')
    #     print()
    # first sentence in the train
    # NOTE(review): get_wordTagList_total_tags_total_tokens iterates the
    # global train_sents rather than its argument, so passing train_sents
    # here only coincidentally matches — verify/fix before reusing with
    # any other corpus.
    pos_tags, tokens, tags = get_wordTagList_total_tags_total_tokens(train_sents)
    # Vocabulary of distinct POS tags and word forms in the training data.
    unique_tags = list(set(tags))
    unique_words = list(set(tokens))