-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaspell.py
72 lines (60 loc) · 2.55 KB
/
aspell.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import logging
import subprocess
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Aspell:
def __init__(self):
self.aspell = self._spawn()
def _spawn(self):
aspell = subprocess.Popen(["aspell", "-a"], stdout=subprocess.PIPE,
stdin=subprocess.PIPE,
#stderr=subprocess.STDOUT,
bufsize=1,
#restore_signals=False,
encoding="iso-8859-1")
#universal_newlines=True)
aspell.stdout.readline()
return aspell
def _recycle(self):
if self.aspell.poll() is not None:
self.aspell.terminate()
del self.aspell
self.aspell = self._spawn()
def _write(self, text):
return self.aspell.stdin.write( ("^" + text.rstrip() + "\n") )
def _readline(self):
return self.aspell.stdout.readline()
def _parse_correction(self, response):
first, second = response[2:].split(":")
orig, count, offset = first.split(" ")
correction = second.split(",")[0][1:]
return orig, count, offset, correction
def _replace(self, string, response):
orig, count, offset, correction = self._parse_correction(response)
return self.replace_by_slice(string, slice(int(offset), int(offset) + len(orig)), correction)
def replace_by_slice(self, string, slyce, sub):
return string[:slyce.start - 1] + sub + string[slyce.stop - 1:]
def correct(self, text):
try:
self._write(text)
except BrokenPipeError:
self._recycle()
self._write(text)
while True:
response = self._readline()
if response == "\n": break # finished with this sentence
elif response[0] == "*": continue # no spelling error for the current word
elif response[0] == "&": text = self._replace(text, response) # suggestions available to correct current word
elif response[0] == "#": continue # no suggestions, but the word is not recognized
else: logger.error("unknown response, received: {}".format(response))
return text
if __name__ == "__main__":
import preproc
import pandas as pd
speller = Aspell()
df = pd.read_csv("valid5000.csv")
for i in range(5):
print(df['text'][i])
df = preproc.preproc(df, lower=True, spelling=True)
for i in range(5):
print(df['text'][i])