#!/usr/bin/python3
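"""Tokenize a text file with the Stanford tokenizer, one sentence per line.

Usage: wikiTokenize.py <input-file>

Each input line is tokenized, lowercased, and printed on its own line, with
URLs and "@user" mentions replaced by the placeholder tokens <url> and <user>.
"""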
import re
import sys

from nltk.tokenize import StanfordTokenizer


def tokenize(tknzr, sentence, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentence: a string to be tokenized
        - to_lower: whether to lowercase the tokenized sentence
    """
    sentence = sentence.strip()
    sentence = ' '.join([format_token(x) for x in tknzr.tokenize(sentence)])
    if to_lower:
        sentence = sentence.lower()
    # Replace URLs (www.* or http(s)://*) by the placeholder token <url>
    sentence = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '<url>', sentence)
    # Replace mentions such as "@ user268" (the tokenizer splits off the "@")
    # by the placeholder token <user>
    sentence = re.sub(r'(@ [^\s]+)', '<user>', sentence)
    # Keep only tokens without internal whitespace
    sentence = ' '.join(filter(lambda word: ' ' not in word, sentence.split()))
    return sentence
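

# For example, the two substitutions above map a tokenized string such as
#   'check www.example.com and @ user268'
# to
#   'check <url> and <user>'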


def format_token(token):
    """Map Penn Treebank bracket codes (-LRB-, -RRB-, ...) back to their
    literal characters."""
    if token == '-LRB-':
        token = '('
    elif token == '-RRB-':
        token = ')'
    elif token == '-RSB-':
        token = ']'
    elif token == '-LSB-':
        token = '['
    elif token == '-LCB-':
        token = '{'
    elif token == '-RCB-':
        token = '}'
    return token


def tokenize_sentences(tknzr, sentences, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentences: a list of sentences
        - to_lower: whether to lowercase the tokenized sentences
    """
    return [tokenize(tknzr, s, to_lower) for s in sentences]
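

# A minimal sketch of exercising these helpers without the Stanford jar,
# assuming NLTK's pure-Python TreebankWordTokenizer as a stand-in (it
# implements the same .tokenize() interface):
#
#     from nltk.tokenize import TreebankWordTokenizer
#     print(tokenize(TreebankWordTokenizer(), 'Hello (world)!'))
#     # -> hello ( world ) !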


# Path to the Stanford POS tagger jar, which bundles the tokenizer; adjust
# this to your local installation.
SNLP_TAGGER_JAR = "/home/pgupta/stanford-postagger.jar"

if __name__ == '__main__':
    fileName = sys.argv[1]
    sentences = []
    with open(fileName, 'r') as fileinput:
        for line in fileinput:
            sentences.append(line)
    # Join all lines with a sentinel so the Stanford tokenizer (a Java
    # process) is started only once, then split the result back apart.
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
    tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
    for sentence in tokenized_sentences_SNLP:
        print(sentence)