-
Notifications
You must be signed in to change notification settings - Fork 0
/
generateIndex.java
180 lines (140 loc) · 7.14 KB
/
generateIndex.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds a Lucene index over a directory of TREC-text files and prints
 * collection statistics for the TEXT field.
 *
 * Documents are delimited by &lt;DOC&gt;...&lt;/DOC&gt;; each contributes the fields
 * DOCNO (un-analyzed StringField) and HEAD/BYLINE/DATELINE/TEXT (analyzed
 * TextFields). All fields are stored.
 */
public class generateIndex {
    // Directory containing the *.trectext corpus files.
    final String corpurspath = "corpus";
    // Directory where the Lucene index is written.
    final String resultpath = "result";

    // Created by the constructor; closed (always) by dataLoader().
    IndexWriter writer;

    /**
     * Opens an IndexWriter over {@code resultpath} with the supplied analyzer.
     * OpenMode.CREATE discards any previous index so each run starts fresh.
     *
     * @param chosenAnalyzer analyzer used for all analyzed fields
     * @throws Exception if the index directory cannot be opened
     */
    public generateIndex(Analyzer chosenAnalyzer) throws Exception {
        Directory dir = FSDirectory.open(Paths.get(resultpath));
        IndexWriterConfig iwc = new IndexWriterConfig(chosenAnalyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        writer = new IndexWriter(dir, iwc);
    }

    /**
     * Walks the corpus directory, parses every file with a "trectext"
     * extension into the index, then merges to one segment, commits, and
     * closes the writer.
     *
     * @throws Exception on I/O or indexing failure; the writer is closed
     *         (releasing the index lock) even when parsing fails part-way
     */
    public void dataLoader() throws Exception {
        final File fileCollect = new File(corpurspath);
        final File[] entries = fileCollect.listFiles();
        // listFiles() returns null when the directory is missing or
        // unreadable; the original dereferenced it unconditionally (NPE).
        if (entries == null) {
            throw new java.io.FileNotFoundException(
                    "corpus directory not found or unreadable: " + corpurspath);
        }
        try {
            for (final File fileEntry : entries) {
                // fileEntry already IS corpus/<name>; no need to rebuild the path.
                if (FilenameUtils.getExtension(fileEntry.getName()).equals("trectext")) {
                    dataParser(fileEntry);
                }
            }
            writer.forceMerge(1);
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Parses one TREC-text file: splits it into individual documents on the
     * "&lt;/DOC&gt;\n&lt;DOC&gt;" boundary and adds one Lucene Document per entry.
     *
     * @param file the *.trectext file to parse
     * @throws Exception on read or indexing failure
     */
    public void dataParser(File file) throws Exception {
        // Read with an explicit charset; the original passed (String) null,
        // i.e. the platform default, making indexing machine-dependent.
        String fileToString = FileUtils.readFileToString(file, StandardCharsets.UTF_8);
        String lineDoc = System.getProperty("line.separator");
        String[] individuleDocument = fileToString.split("</DOC>" + lineDoc + "<DOC>");
        for (String rawDoc : individuleDocument) {
            Document luceneDoc = new Document();
            // DOCNO is an identifier: index it verbatim (not analyzed).
            luceneDoc.add(new StringField("DOCNO", tagContent(rawDoc, "DOCNO"), Field.Store.YES));
            luceneDoc.add(new TextField("HEAD", tagContent(rawDoc, "HEAD"), Field.Store.YES));
            luceneDoc.add(new TextField("BYLINE", tagContent(rawDoc, "BYLINE"), Field.Store.YES));
            luceneDoc.add(new TextField("DATELINE", tagContent(rawDoc, "DATELINE"), Field.Store.YES));
            luceneDoc.add(new TextField("TEXT", collectTextSections(rawDoc), Field.Store.YES));
            writer.addDocument(luceneDoc);
        }
    }

    /**
     * Extracts the content of {@code <tag>...</tag>} from {@code doc},
     * returning "" when the tag is absent. substringBetween returns null for
     * a missing tag; the original appended that null to a StringBuffer and
     * indexed the literal string "null".
     */
    private static String tagContent(String doc, String tag) {
        String value = StringUtils.substringBetween(doc, "<" + tag + ">", "</" + tag + ">");
        return value == null ? "" : value;
    }

    /**
     * Concatenates every {@code <TEXT>...</TEXT>} section of a document,
     * separated by a single space. Fixes two defects in the original loop:
     * it appended the literal characters "\s+" (a regex typed as a string)
     * between sections, and it sliced past indexOf("</TEXT>")+7 before
     * checking the tag existed at all.
     */
    private static String collectTextSections(String doc) {
        final String closeTag = "</TEXT>";
        StringBuilder container = new StringBuilder();
        String rest = doc;
        int end;
        while ((end = rest.indexOf(closeTag)) != -1) {
            String text = StringUtils.substringBetween(rest, "<TEXT>", closeTag);
            if (text != null) {
                if (container.length() > 0) {
                    container.append(' ');
                }
                container.append(text);
            }
            rest = rest.substring(end + closeTag.length());
        }
        return container.toString();
    }

    /**
     * Opens the index at {@code path} and prints the required statistics:
     * total documents, df and total term frequency of "new" in TEXT,
     * vocabulary size, doc count, token count and posting count for TEXT.
     *
     * @param path directory holding the index to report on (the original
     *             ignored this parameter and always read {@code resultpath})
     * @throws Exception if the index cannot be opened or read
     */
    public void printResandStats(String path) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(path)));
        try {
            System.out.println("Total number of documents in the corpus:" + reader.maxDoc());
            System.out.println("Number of documents containing the term \"new\" for field \"TEXT\": "
                    + reader.docFreq(new Term("TEXT", "new")));
            System.out.println("Number of occurences of \"new\" in the field \"TEXT\": "
                    + reader.totalTermFreq(new Term("TEXT", "new")));
            // MultiFields merges per-segment term dictionaries into one view.
            Terms vocabulary = MultiFields.getTerms(reader, "TEXT");
            System.out.println("Size of the vocabulary for this field: " + vocabulary.size());
            System.out.println("Number of documents that have at least one term for this field: "
                    + vocabulary.getDocCount());
            System.out.println("Number of tokens for this field: " + vocabulary.getSumTotalTermFreq());
            System.out.println("Number of postings for this field: " + vocabulary.getSumDocFreq());
        } finally {
            reader.close();
        }
    }

    /**
     * Indexes the corpus with a StandardAnalyzer and prints the statistics.
     */
    public static void main(String[] args) throws Exception {
        generateIndex indexer = new generateIndex(new StandardAnalyzer());
        System.out.println("load the data......");
        indexer.dataLoader();
        System.out.println("Indexing......");
        indexer.printResandStats("result");
        System.out.println("Success for StandardAnalyzer!");
    }
}