forked from npryce/code-words
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode-to-words
executable file
·52 lines (42 loc) · 1.1 KB
/
code-to-words
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/bin/bash
#
# Command-line handling for the word-extraction pipeline defined below.
#
# Options:
#   -k <keywords-file>   file of patterns handed to the keyword filter
#   -s <stopwords-file>  file of patterns handed to the stop-word filter
#   -h                   print the synopsis and exit
# -k and -s may each be given any number of times.
set -e

# Directory containing this script.
basedir=$(dirname "$0")

# grep arguments collected from the command line: every -k / -s contributes
# a "-f <file>" pair to the matching array.
keyword_opts=()
stopword_opts=()

# Print the command-line synopsis on standard output.
usage() {
  echo "$0 [[-k <keywords-file>]|[-s <stopwords-file>]]* [<input-file>]*"
}

# Parse the options found in the given argument list.
# Globals:   keyword_opts, stopword_opts (appended to); OPTIND (left pointing
#            at the first non-option argument, as getopts defines it)
# Arguments: the script's command-line arguments
# Exits:     0 after printing the synopsis for -h;
#            2 on an unknown option or a missing option argument
parse_options() {
  local opt
  # getopts keeps its position in OPTIND; start from the first argument.
  OPTIND=1
  while getopts k:s:h opt; do
    case "$opt" in
      k)
        keyword_opts+=("-f" "$OPTARG")
        ;;
      s)
        stopword_opts+=("-f" "$OPTARG")
        ;;
      h)
        usage
        exit
        ;;
      *)
        # getopts has already described the problem on stderr; stop here
        # instead of silently running with a partly understood command line.
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_options "$@"
# Reduce source text on standard input to one candidate identifier per line.
# Hexadecimal literals (0x...) are deleted first, then every run of characters
# that is neither a letter nor an underscore becomes a line break, and finally
# the empty lines this leaves behind are dropped.
extract_identifiers() {
  local strip_hex_literals='s/0[xX][[:alnum:]]\+//g'
  local break_on_non_identifier_chars='s/[^[:alpha:]_]\+/\n/g'

  sed -e "$strip_hex_literals" -e "$break_on_non_identifier_chars" \
    | grep -v '^$'
}
# Break each identifier on standard input into its component words, one word
# per output line.
#
# Words are separated at:
#   - a lowercase letter followed by an uppercase one       (fooBar -> foo, Bar)
#   - the end of an all-caps abbreviation such as XML or JPEG that is followed
#     by a capitalised word                          (XMLParser -> XML, Parser)
#   - any run of underscores                        (foo__bar   -> foo, bar)
#
# Underscores at the start or end of an identifier are removed before
# splitting, and identifiers made only of underscores are dropped, so names
# such as __init__ or _tmp do not produce empty output lines.
split_words() {
  sed -e 's/^_\+//' \
      -e 's/_\+$//' \
      -e '/^$/d' \
      -e 's/\([[:lower:]]\)\([[:upper:]]\)/\1\n\2/g' \
      -e 's/\([[:upper:]]\+\)\([[:upper:]][[:lower:]]\)/\1\n\2/g' \
      -e 's/_\+/\n/g'
}
# Filter standard input, discarding every line in which a pattern from the
# keyword files matches as a whole word. The extra "^$" pattern is passed to
# the same inverted, whole-word grep alongside the keyword patterns.
# Globals: keyword_opts (read) - "-f <file>" pairs collected from -k options
ignore_keywords() {
  local -a grep_args=(-v -w)
  grep_args+=("${keyword_opts[@]}")
  grep_args+=(-e "^$")

  grep "${grep_args[@]}"
}
# Filter standard input, discarding every line in which a pattern from the
# stop-word files matches as a whole word. The built-in patterns "." and ".."
# additionally discard words that are only one or two characters long.
# Globals: stopword_opts (read) - "-f <file>" pairs collected from -s options
ignore_stopwords() {
  local -a grep_args=(-v -w)
  grep_args+=("${stopword_opts[@]}")
  grep_args+=(-e . -e ..)

  grep "${grep_args[@]}"
}
# Copy standard input to standard output with uppercase letters converted to
# lowercase.
#
# The character classes are quoted so that tr receives them verbatim: left
# unquoted, [:upper:] and [:lower:] are shell glob patterns, and a
# single-character file name in the working directory (for example "e") would
# be substituted for them.
lowercase() {
  tr '[:upper:]' '[:lower:]'
}
# Main pipeline: source text in, sorted lowercase words out (one per line).
# Words are not stemmed yet; a stemmer (Snowball, for example) could be added
# as a further stage.
#
# Skip the options getopts has already consumed so that "$@" holds only the
# input files named in the usage message. With no files left, cat copies
# standard input, so the script still works as a plain filter.
shift $((OPTIND - 1))
cat -- "$@" \
  | extract_identifiers \
  | ignore_keywords \
  | split_words \
  | lowercase \
  | ignore_stopwords \
  | sort