-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapachecker.py
168 lines (147 loc) · 5.68 KB
/
apachecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Copyright 2009, Tarmo Toikkanen, [email protected]
#License: GPL v2
#
# This script can be used to check that references in an APA formatted manuscript
# are written correctly and match the list of references.
#
# TODO: Check that et al. is used correctly (only for 3+ authors, and not on first
# occasion unless 6+ authors).
import sys
import re
import locale
import codecs
# Script version; interpolated into the usage banner below.
VERSION = 0.14

# Switch every locale category to the user's environment default.
locale.setlocale(locale.LC_ALL, '')

# The script takes exactly one argument (the manuscript file). With any
# other argument count, print the banner and usage help, then exit with
# status 1.
argument_count = len(sys.argv) - 1
if argument_count != 1:
    print("APAChecker v%.2f by Tarmo Toikkanen\n" % VERSION)
    print("""
Usage:
apachecker.py INPUTFILE.TXT
Save your manuscript in plain text format and supply it as a parameter.
Note that the report may be quite long. Use command
apachecker.py INPUTFILE.TXT >REPORT.TXT
to get a report file that you can examine later.
Known limitations:
1. Reference list is expected to start with heading "References" or "Bibliography",
immediately followed by reference entries.
2. If a single paragraph has several citations, and at least one is
formatted correctly, others may not be spotted. Usually, though, they are detected with
partial author information, which will show up in the report.
3. The references list is only checked to contain names and years in correct notation.
Other information is ignored.
4. Exact details on citation formatting may vary from journal to journal. This script
detects only a small subset of those. If you get notices of unrecognized citations, you
may need to tweak the regexps in this script.
""")
    sys.exit(1)
# Surname as written in running text: optional particles ("de la ", "De ",
# "Mc"/"Mac"), one capitalised word (Nordic letters allowed) and an optional
# literal " Inc." suffix. The dot is escaped so that only a real period is
# accepted after "Inc".
NAME = u'(?:de la )?(?:De )?(?:Ma?c)?[A-ZÅÄÖ][a-zäöå]+(?: Inc\\.)?'

# In-text citation patterns. In every pattern the LAST group is the
# four-digit year; the preceding groups hold author names (or "et al.").
# The order of the tuple is significant to code that iterates it.
re_cite = (
    # Smith, Jones, and Brown (2009)
    re.compile(r'((?:%s, ){2,6})and (%s) \(([0-9]{4})\)' % (NAME, NAME)),
    # Smith and Jones (2009)
    re.compile(r'(%s) and (%s) \(([0-9]{4})\)' % (NAME, NAME)),
    # Smith (2009)
    re.compile(r'(?:(%s) )+\(([0-9]{4})\)' % NAME),
    # Smith et al. (2009)
    re.compile(r'(%s) (et al\.) \(([0-9]{4})\)' % NAME),
    # Smith's (2009)
    re.compile(r"(%s)'s \(([0-9]{4})\)" % NAME),
    # Smith, Jones, & Brown, 2009
    re.compile(r'((?:%s, ){2,5})& (%s), ([0-9]{4})' % (NAME, NAME)),
    # Smith, Jones, and Brown, 2009
    re.compile(r'((?:%s, ){2,5})and (%s), ([0-9]{4})' % (NAME, NAME)),
    # Smith and Jones, 2009
    re.compile(r'(%s) and (%s), ([0-9]{4})' % (NAME, NAME)),
    # Smith & Jones, 2009
    re.compile(r'(%s) & (%s), ([0-9]{4})' % (NAME, NAME)),
    # Smith, 2009
    re.compile(r'(%s), ([0-9]{4})' % NAME),
    # Smith et al., 2009
    re.compile(r'(%s) (et al\.), ([0-9]{4})' % NAME),
)

# Heading that ends the body text and starts the reference list. Trailing
# whitespace (including the "\r" of a CRLF line ending) is tolerated.
re_modeswitch = re.compile(r'(References|REFERENCES|Bibliography)\s*$')

# Author as written in a reference entry: surname, comma, then one or more
# initials such as " J." or " J-P.".
NAME_F = NAME + u',(?: [A-ZÄÖÅ](?:-[A-ZÄÖÅ])?\\.)+'

# Reference-list entry patterns, anchored at the start of the line. Groups
# are: author name(s), the year in parentheses, and finally the remainder of
# the entry up to its last period.
re_references = (
    re.compile(r'^((?:%s, ){2,5})& (%s) \(([0-9]{4})\)\. (.*\.)' % (NAME_F, NAME_F)),
    re.compile(r'^((?:%s, ){2,5})and (%s) \(([0-9]{4})\)\. (.*\.)' % (NAME_F, NAME_F)),
    re.compile(r'^(%s) & (%s) \(([0-9]{4})\)\. (.*\.)' % (NAME_F, NAME_F)),
    re.compile(r'^(%s) and (%s) \(([0-9]{4})\)\. (.*\.)' % (NAME_F, NAME_F)),
    re.compile(r'^(%s) \(([0-9]{4})\)\. (.*\.)' % NAME_F),
)

# Any four-digit number starting with 1 or 2, i.e. something year-like.
re_suspect = re.compile(r'([12][0-9]{3})')
# Accumulators filled while the manuscript is scanned: one entry per
# in-text citation and one per reference-list item.
citations, references = [], []
# Scanner state; starts out as 'collecting'.
mode = 'collecting'
def cleanup(res):
    """Split regex groups into a flat list of names plus the year.

    `res` is a sequence of strings whose last item is the year. Every
    earlier item is split on commas; the pieces are stripped of surrounding
    whitespace and empty pieces are dropped.

    Returns a `(names, year)` tuple where `names` is a list.
    """
    year = res[-1]
    pieces = (
        piece.strip()
        for group in res[:-1]
        for piece in group.split(',')
    )
    return [piece for piece in pieces if piece], year
# Scan the manuscript line by line. Lines before the reference-list heading
# are searched for in-text citations ('collecting'); lines after it are
# parsed as reference entries ('checking'). The `with` block guarantees the
# input file is closed even if scanning fails.
with codecs.open(sys.argv[1], 'r', 'utf-8') as fin:
    for line in fin.readlines():
        if re_modeswitch.search(line):
            # Heading line itself carries no citation or reference.
            mode = 'checking'
            continue
        if mode == 'collecting':
            # End offsets of the last group (the year) of every citation
            # already accepted on this line; a later pattern that ends at
            # the same offset is a duplicate of an earlier match.
            seen_ends = set()
            for reg in re_cite:
                for res in reg.finditer(line):
                    pos = res.end(res.lastindex)
                    if pos not in seen_ends:
                        citations.append(cleanup(res.groups()))
                        seen_ends.add(pos)
            # Nothing recognised, yet the line holds a year-like number.
            if not seen_ends and re_suspect.search(line):
                print((u'UNRECOGNIZED POTENTIAL CITATION IN THIS PARAGRAPH:\n%s' % line).encode('utf-8'))
        elif mode == 'checking':
            # Skip blank lines, including ones that only hold "\r\n".
            if not line.strip():
                continue
            matched = 0
            # Named ref_re (not `re`) so the `re` module is not shadowed.
            for ref_re in re_references:
                res = ref_re.match(line)
                if res:
                    matched += 1
                    # Drop the last group (rest of the entry); only the
                    # names and the year are kept.
                    references.append(cleanup(res.groups()[:-1]))
                    print(u'LOCATED REFERENCE ITEM: %s' % repr(res.groups()))
            if not matched:
                print((u'UNRECOGNIZED REFERENCE ITEM:\n%s' % line).encode('utf-8'))
# Check every collected citation against the reference list. A citation is
# satisfied by the first reference that has the same year and contains each
# cited name (the 'et al.' placeholder is ignored).
cok = 0
cfail = 0
for cited_names, cited_year in citations:
    is_listed = any(
        cited_year == listed_year
        and all(who == 'et al.' or who in listed_names for who in cited_names)
        for listed_names, listed_year in references
    )
    if is_listed:
        print("OK: Citation %s, %s FOUND in references!" % (cited_names, cited_year))
        cok += 1
    else:
        cfail += 1
        print("PROBLEM: No reference information for citation %s, %s" % (cited_names, cited_year))
# Check every reference-list item against the collected citations. A
# reference counts as cited when some citation has the same year and each of
# its names (ignoring the 'et al.' placeholder) appears in the reference.
rok = 0
rfail = 0
for listed_names, listed_year in references:
    is_cited = any(
        listed_year == cited_year
        and all(who == 'et al.' or who in listed_names for who in cited_names)
        for cited_names, cited_year in citations
    )
    if is_cited:
        print("OK: Reference item %s, %s CITED in manuscript!" % (listed_names, listed_year))
        rok += 1
    else:
        rfail += 1
        print("PROBLEM: No citation for reference %s, %s" % (listed_names, listed_year))
# Final summary. When no citations (or no references) were collected the
# total is zero; report 0% in that case instead of raising
# ZeroDivisionError. `//` keeps the integer percentage on any Python version.
ctotal = cok + cfail
rtotal = rok + rfail
cpercent = (100 * cok // ctotal) if ctotal else 0
rpercent = (100 * rok // rtotal) if rtotal else 0
print("\nCITATIONS: %d OK, %d FAIL; %d%% SUCCESS" % (cok, cfail, cpercent))
print("REFERENCES: %d OK, %d FAIL; %d%% SUCCESS" % (rok, rfail, rpercent))