handle extracting text content of pages within a corpus matching a specific keyword (cf RISCOVAC)
boogheta committed Aug 3, 2018
1 parent c1146fe commit 6ed6fb1
Showing 1 changed file with 72 additions and 5 deletions.
77 changes: 72 additions & 5 deletions bin/textExtractor/extract_webentities_text.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, os, re
import pymongo
import jsonrpclib
from md5 import md5
@@ -34,6 +34,70 @@ def process_all(hyphe_core, mongo_pages_coll, corpus, status_to_extract=["IN"]):
print("collected a total of %s pages for %s webentities" % \
(total_pages, total_wes))

def process_pages_matching_keyword(hyphe_core, mongo_pages_coll, corpus, keyword, content_types=["text/plain", "text/html"]):
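    """Dump as CSV all crawled pages of the corpus whose body contains the
    keyword, in three variants: raw html and two text extractions."""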
    query = {
        "status": 200,
        "content_type": {"$in": content_types},
        "body": {"$exists": True}
    }
    print("TOTAL valid pages: %s" % mongo_pages_coll.count(query))

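    # One CSV file per extraction variant, each row carrying the page's
    # url and the webentity it resolves to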
    headers = ["url", "webentity_id", "webentity_name"]
    files = {}
    for typ in ["html", "text", "canola"]:
        files[typ] = open("%s-%s-%s.csv" % (corpus, keyword, typ), "w")
        print >> files[typ], ",".join([k.encode("utf-8") for k in headers + [typ]])
    match = 0
    total = 0
    for page in mongo_pages_coll.find(query):
        total += 1
        if not total % 100:
            print("%s / %s" % (match, total))

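        # Page bodies are stored zlib-compressed in MongoDB; Python 2's
        # "zip" codec inflates them back to raw bytes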
body = page["body"].decode('zip')
if keyword not in body:
continue
match +=1

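        # Decode using the charset detected at crawl time, falling back to
        # UTF-8 with replacement characters when it is missing or fails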
        encoding = page.get("encoding", "")
        try:
            body = body.decode(encoding)
        except Exception:
            body = body.decode("UTF8", "replace")
            encoding = "UTF8-replace"

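        # Ask the Hyphe core API which webentity owns this page's LRU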
        we = hyphe_core.store.get_webentity_for_url_as_lru(page["lru"], corpus)
        try:
            assert we["code"] == "success"
            page["webentity_id"] = we["result"]["id"]
            page["webentity_name"] = we["result"]["name"]
        except Exception:
            print("WARNING! Could not resolve WebEntity for url %s" % page["url"])

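        # Build the three variants: the raw html plus two text extractions
        # (textify's default extractor and CanolaExtractor)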
page["html"] = body
page["text"] = textify(body, encoding=encoding)
page["canola"] = textify(body, extractor="CanolaExtractor", encoding=encoding)

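        # Only write the row to the files whose variant still contains
        # the keyword after extraction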
        for typ in ["html", "text", "canola"]:
            if keyword not in page[typ]:
                continue
            print >> files[typ], ",".join([format_for_csv(page.get(k, "")) for k in headers + [typ]])

    for typ in ["html", "text", "canola"]:
        files[typ].close()

    print('FOUND %s pages matching "%s"' % (match, keyword))

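# Minimal ad-hoc CSV escaping: collapse newlines into a "↲" marker and
# double-quote fields containing commas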
def format_for_csv(v):
    if not v:
        return ""
    if not isinstance(v, unicode):
        v = unicode(v)
    v = re.sub(ur"\s*[\r\n]+\s*", u" ↲ ", v).strip(u" ↲")
    if "," in v:
        v = '"%s"' % v.replace('"', '""')
    return v.encode("utf-8")

def get_status_webentities(hyphe_core, status, corpus):
print("Retrieving %s web entities" % status)
res = hyphe_core.store.get_webentities_by_status(status, None, \
@@ -157,15 +221,17 @@ def process_page(page, wename, corpus, \
mongoport = 27017
mongodb = ""
corpus = ""
password = ""
keyword = ""

# Initiate Hyphe API connection and ensure corpus started
try:
    hyphe = jsonrpclib.Server(api, version=1)
except Exception as e:
    exit('Could not initiate connection to hyphe core')
start = hyphe.start_corpus(corpus, password)
if start['code'] == 'fail':
    exit(start['message'])

# Initiate MongoDB connection and build index on pages
try:
@@ -176,4 +242,5 @@ def process_page(page, wename, corpus, \
ensure_index_on_pages(dbpages)

# Run!
#process_all(hyphe, dbpages, corpus)
process_pages_matching_keyword(hyphe, dbpages, corpus, keyword)
