-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataReceived.java
84 lines (75 loc) · 2.46 KB
/
DataReceived.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import java.util.ArrayList;
import java.util.TreeMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the paragraph ({@code <p>}) text of a web page and counts how often
 * each non-stop word occurs in it.
 */
public class DataReceived {

    /** Punctuation characters that are treated as word separators. */
    private static final String PUNCTUATION_REGEX = "[,.!?:;()-]";

    /**
     * Counts how many times {@code searchWord} occurs in the paragraphs of one page.
     * Intended to be called once per input link.
     *
     * @param link       URL of the page to download with jsoup
     * @param searchWord word to look up; it is matched against the words exactly
     *                   as they appear on the page (original casing)
     * @return the count reported by {@code PageRanking.getCount} for
     *         {@code searchWord}, or 0 if the page could not be fetched or processed
     */
    public Integer getData(String link, String searchWord) {
        try {
            TreeMap<String, Integer> frequencyData = new TreeMap<String, Integer>();
            countWords(link, frequencyData);
            int occurrences = PageRanking.getCount(searchWord, frequencyData);
            return occurrences;
        } catch (Exception e) {
            // Best effort: a page that cannot be read contributes no matches
            // instead of aborting the caller's loop over all links.
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Overload that accumulates the frequency of every non-stop word of one page
     * into a caller-supplied map.
     *
     * @param link          URL of the page to download with jsoup
     * @param frequencyData map from word (original casing) to occurrence count;
     *                      updated in place, so counts add up across calls that
     *                      share the same map
     */
    public void getData(String link, TreeMap<String, Integer> frequencyData) {
        try {
            countWords(link, frequencyData);
        } catch (Exception e) {
            // Best effort: keep whatever was counted before the failure.
            e.printStackTrace();
        }
    }

    /**
     * Downloads {@code link}, tokenizes the text of its paragraph elements and
     * adds one to {@code frequencyData} for every word that is not a stop word.
     */
    private void countWords(String link, TreeMap<String, Integer> frequencyData)
            throws java.io.IOException {
        // NOTE(review): the trie is filled but never queried in this class (the
        // only search call was commented out); kept in case it is still wanted.
        Trie myTrie = new Trie();

        // Only the paragraph tags of the page are read.
        Document document = Jsoup.connect(link).get();
        Elements paragraphs = document.select("p");

        for (Element para : paragraphs) {
            // The replacement must be a literal space: in a replacement string
            // "\\s" is an escaped 's', which glued the letter s onto every word
            // that was followed by punctuation.
            String text = para.text().replaceAll(PUNCTUATION_REGEX, " ");

            // Split on runs of whitespace so that "word, word" does not yield
            // empty tokens.
            for (String word : text.split("\\s+")) {
                if (word.isEmpty()) {
                    continue; // leading separator produces one empty token
                }
                String lowerCased = word.toLowerCase();
                if (StopWord.is(lowerCased)) {
                    continue;
                }
                myTrie.insert(lowerCased);
                int count = PageRanking.getCount(word, frequencyData) + 1;
                frequencyData.put(word, count);
            }
        }
    }
}