texttoconll.py
# -*- coding: utf-8 -*-
"""Convert plain text into CoNLL-style token/label lines, pre-labeling API mentions."""
import sys
import re
import os
from io import StringIO  # cStringIO is Python 2 only; io.StringIO works on Python 3

# Local helpers (sentence splitting) live in ./mylib.
sys.path.append(os.path.join(os.path.dirname(__file__), 'mylib'))
sys.path.append('.')
from sentencesplit import sentencebreaks_to_newlines
# import mytokenizer
def regex_or(*items):
    r = '|'.join(items)
    r = '(' + r + ')'
    return r
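# For example (illustrative), regex_or(r'foo', r'bar') returns the string '(foo|bar)'.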
# Oct23: require at least two characters per dotted segment so tokens
# like "i.e" are not matched as API names.
API_pattern = re.compile(
    regex_or(r'^(?:[a-zA-Z_][a-zA-Z_]+\.)+[a-zA-Z_][a-zA-Z_]+\(\)$',
             r'^[a-zA-Z\.\_][a-zA-Z\.\_]+\(\)$',
             r'^(?:[a-zA-Z_][a-zA-Z_]+\.)+[a-zA-Z_][a-zA-Z_]+$',
             r'^(?:[A-Za-z]+)+[A-Z][a-z]+$')
)
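# Illustrative examples (not from the original) of tokens this pattern matches:
# dotted calls such as "os.path.join()", bare calls such as "substring()",
# dotted names such as "java.util.List", and CamelCase names such as "ArrayList".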
# TOKENIZATION_REGEX = re.compile(API)
NEWLINE_TERM_REGEX = re.compile(r'(.*?\n)')
# Gazetteer of known API mentions, filled in (lower-cased) by build_list().
api_list = []
def text_to_conll(f):
    """Convert plain text into CoNLL format (one "token<TAB>label" line per token)."""
    sentences = []
    for l in f:
        l = sentencebreaks_to_newlines(l)
        sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])
    lines = []
    for s in sentences:
        nonspace_token_seen = False
        tokens = [t for t in s.split() if t]
        for i, t in enumerate(tokens):
            if not t.isspace():
                # pre-label rules designed by Deheng
                #if API_pattern.match(t) is not None:
                #    lines.append([t, 'B-API'])
                # Join the previous, current and next token so multi-token
                # gazetteer entries can match too (guard i > 0 so the first
                # token does not wrap around to tokens[-1]).
                if 0 < i < len(tokens) - 2:
                    comp = tokens[i-1] + t + tokens[i+1]
                    comp = comp.lower()
                else:
                    comp = ""
                if t.endswith("()"):
                    # print(t)
                    # compare against the gazetteer without the trailing "()"
                    t_nobracket = t[:-2]
                    if t_nobracket.lower() in api_list:
                        lines.append([t, 'B-API'])
                    else:
                        lines.append([t, 'O'])
                elif t.lower() in api_list:
                    # print(t)
                    lines.append([t, 'B-API'])
                elif comp in api_list:
                    print(comp)
                    lines.append([t, 'B-API'])
                else:
                    lines.append([t, 'O'])
                nonspace_token_seen = True
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])
    lines = [[l[0], l[1]] if l else l for l in lines]
    return StringIO('\n'.join('\t'.join(l) for l in lines))
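# Illustrative output (assuming "arraylist.add" appears in the gazetteer):
#   Use<TAB>O
#   ArrayList.add()<TAB>B-API
#   it<TAB>O
# with an empty line separating sentences.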
def build_list():
    #f = open('./apidoc/all-remove.txt', 'r')
    #for line in f:
    #    api = line.strip()
    #    api_list.append(api)
    #return api_list
    with open('apidoc/all-remove.txt', 'r') as gaz:
        for line in gaz:
            line = str(line.strip())
            line = line.lower()
            api_list.append(line)
    # Ambiguous API names are stored wrapped in backticks, so a plain-text
    # token only matches them when it appears backtick-quoted.
    with open('apidoc/ambiguousAPI.txt', 'r') as gaz2:
        for line in gaz2:
            line = str(line.strip())
            line = "`" + line.lower() + "`"
            api_list.append(line)
    with open('apidoc/real_amb.txt', 'r') as gaz3:
        for line in gaz3:
            line = str(line.strip())
            line = line.lower()
            api_list.append(line)
    return api_list
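# The gazetteer files under apidoc/ are assumed to hold one API name per line;
# every entry is lower-cased before it is added to api_list.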
def main(arg1, arg2):
    # build_list() also fills the module-level api_list used by text_to_conll()
    api_list = build_list()
    '''
    if arg1.endswith('.txt'):
        filebase = '.'.join(arg1.split('.')[:-1]) if '.' in arg1 else arg1
        tokenfile = str(filebase) + '.tk'
        mytokenizer.tokenize(arg1, tokenfile)
        f = open(tokenfile, 'r')
    '''
    f = open(arg1, 'r')
    lines = text_to_conll(f)
    with open(arg2, 'wt') as of:
        of.write(''.join(lines))
        of.write('\n')
if __name__ == '__main__':
    main(*sys.argv[1:])
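# Usage (illustrative file names):
#   python texttoconll.py input.txt output.conll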