-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcomb.rb
61 lines (45 loc) · 1.6 KB
/
comb.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
require './common.rb'
# Script preamble: open the 'crunchbase_data' database (connection settings are
# whatever Mongo::Connection.new uses when called without arguments) and report
# how many documents the 'person' collection currently holds.
DB = Mongo::Connection.new.db('crunchbase_data')
people = DB.collection('person')
person_total = people.size
puts "We have #{person_total} in our Person's Database"
puts 'Inspecting each person for web links (linkedin, facebook, etc)'
# Field name to set on a person document (key) => pattern a URL must match to
# be stored under that field (value).
#
# Matching is unanchored (the pattern may occur anywhere in the URL) and
# case-insensitive, because host names are not case-sensitive
# ("Twitter.com/foo" is the same site as "twitter.com/foo").
# The LinkedIn pattern only accepts URLs whose path starts with "/in".
#
# Frozen so the shared lookup table cannot be modified by accident at runtime.
URL_PATTERNS = {
  'twitter_url'  => %r{twitter\.com}i,
  'facebook_url' => %r{facebook\.com}i,
  'linkedin_url' => %r{linkedin\.com/in}i
}.freeze
# Extracts the web links stored on each matching person document, saves them
# back on the document and tags the ones that match URL_PATTERNS.
#
# For every person returned by +query+:
# * all non-blank 'homepage_url' values and all 'external_url' values found in
#   the person's 'crunch_profile' entries are collected;
# * people without any URL are skipped and left untouched;
# * otherwise the list is stored under 'extracted_urls', each URL_PATTERNS key
#   is set to the (last) matching URL, and the document is written back.
#
# Finally prints, for each URL_PATTERNS key, how many people carry that field.
#
# Uses the file-level DB handle instead of opening another connection.
#
# @param query [Hash] selector handed unchanged to the collection's #find
# @return [Hash] URL_PATTERNS (result of the final iteration, as before)
def extract_links_from_people(query)
  people = DB.collection('person')
  # Earlier variant, kept for reference (restricts the run to untagged people):
  # peoples = people.find( { "$and" => [ query, { tag => { "$exists" => 0 } } ] } )
  matches = people.find(query)
  puts "Tagging #{matches.count} profiles"

  matches.each do |person|
    urls = collect_profile_urls(person)
    next if urls.empty?

    # At a minimum we save the person with its extracted URLs; in the best
    # case it also gains one field per recognised site.
    person['extracted_urls'] = urls
    classify_urls(urls).each { |url_name, url| person[url_name] = url }
    people.update({ '_id' => person['_id'] }, person)
  end

  URL_PATTERNS.each_key do |url_name|
    puts "People with #{url_name}"
    # $exists expects a boolean; the string "true" only worked by being truthy.
    puts people.find({ url_name => { '$exists' => true } }).count
  end
end

# Gathers every URL mentioned in a person's 'crunch_profile' entries, with nil
# entries removed. A missing 'crunch_profile' or 'web_presences' counts as empty.
# Relies on #blank? being available (presumably loaded through common.rb).
def collect_profile_urls(person)
  profiles = person['crunch_profile'] || []
  urls = profiles.flat_map do |profile|
    homepage = profile['homepage_url']
    presences = profile['web_presences'] || []
    presence_urls = presences.map { |presence| presence['external_url'] }
    (homepage.blank? ? [] : [homepage]) + presence_urls
  end
  urls.compact
end

# Maps each URL_PATTERNS key to the last URL in +urls+ matching its pattern;
# keys without a matching URL are absent from the result.
def classify_urls(urls)
  urls.each_with_object({}) do |url, tags|
    URL_PATTERNS.each do |url_name, pattern|
      tags[url_name] = url if url =~ pattern
    end
  end
end
# Script entry point: run the extraction with an empty selector, i.e. without
# applying any filter to the person collection.
extract_links_from_people({})