-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_named_entities.rb
46 lines (41 loc) · 962 Bytes
/
get_named_entities.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
require 'rubygems'
require 'nokogiri'
require 'hpricot'
require 'socket'
require 'open-uri'
require 'active_support/inflector'
# Tag names to report on, read one per line from the ENTITIES file in the
# current working directory. File.readlines closes the file handle itself,
# which the previous File.open(...).read never did.
# Blank lines are dropped so they do not turn into empty search expressions
# later on; the list is frozen because it is only ever read.
ENTITIES = File.readlines('ENTITIES').map(&:strip).reject(&:empty?).freeze
# Use the sole command-line argument as the page to analyse; with any other
# argument count fall back to a random Wikipedia article.
url = ARGV.length == 1 ? ARGV[0] : "https://en.wikipedia.org/wiki/Special:Random"
# Downloads the page at +url+ and returns the concatenated text of all of its
# <p> and <li> elements.
#
# URI.open is used instead of the bare Kernel#open: open-uri no longer patches
# Kernel#open on Ruby 3.0+, and Kernel#open would run a subprocess for an
# argument starting with "|". The block form closes the downloaded stream as
# soon as the document has been parsed.
#
# @param url [String] address of the page to fetch
# @return [String] text content of the matched elements
def retrieve_data(url)
  doc = URI.open(url) { |page| Nokogiri::HTML(page) }
  doc.css('p', 'li').text
end
# Sends the text of the page at +url+ to the server listening on
# localhost:8080 (presumably a named-entity tagger that answers with
# XML-style inline tags -- confirm against the server in use) and prints, for
# every tag name in ENTITIES, the text of each matching element found in the
# response.
#
# @param url [String] address of the page to analyse
# @return [Array<String>] ENTITIES, as returned by the final +each+
def get_named_entities(url)
  client = TCPSocket.open('localhost', 8080)
  begin
    # Collapse every whitespace run so the document is sent as a single line.
    client.puts(retrieve_data(url).gsub(/\s+/, ' '))
    # Read until the server closes its side of the connection.
    ner_response = client.read
  ensure
    # Fully close the socket (close_read only shut down the read half),
    # including when fetching the page or talking to the server fails.
    client.close
  end

  # Parse the response once; it does not change between entity types.
  tagged_entities = Hpricot(ner_response)

  ENTITIES.each do |type|
    output = (tagged_entities / type).map(&:inner_text)
    next if output.empty?

    puts "#{type.to_s.pluralize.swapcase}:"
    output.each { |e| puts "\t#{e}" }
  end
end
get_named_entities(url)