-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#2 Initial nutch/hbase/elasticsearch scripts.
- Loading branch information
Dafydd James
committed
Jun 13, 2014
1 parent
c8158d4
commit 2f48885
Showing
14 changed files
with
284 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash
# Build nutch on CentOS 6 using apache ant with OpenJDK-1.7.0.
# Prerequisites:
# Must have openjdk RPM installed, and nutch and ant extracted into /opt
#
# Environment (all optional):
#   JAVA_HOME  - defaults to /etc/alternatives/java_sdk
#   NUTCH_HOME - nutch source tree, defaults to /opt/nutch
#   ANT_HOME   - ant installation, defaults to /opt/ant
# SRCROOT is expected to be provided by setenv.bash, which is sourced from the
# directory this script lives in.

# Timestamp used to give the runtime backup a unique name.
ts=$(date +%Y%m%d%H%M%S)

. "$(dirname "$0")/setenv.bash" || exit 1

if [ -z "$JAVA_HOME" ]; then
  export JAVA_HOME="/etc/alternatives/java_sdk"
fi

if [ -z "$NUTCH_HOME" ]; then
  NUTCH_HOME="/opt/nutch"
  echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME"
fi

if [ -z "$ANT_HOME" ]; then
  ANT_HOME="/opt/ant"
  echo "!!! ANT_HOME not set, defaulting to $ANT_HOME"
fi

# Everything below uses paths relative to the nutch tree, so stop if we cannot
# get there rather than moving/patching files in the caller's directory.
cd "$NUTCH_HOME" || { echo "!!! Cannot cd to $NUTCH_HOME" >&2; exit 1; }
if [ -d runtime ]; then
  echo "Backing up previous runtime directory"
  mv runtime "runtime.bak-$ts"
fi

# -N makes patch skip changes that look already applied, so re-running this
# script is safe. patch may report a non-zero status in that case, so its exit
# status is deliberately not treated as fatal.
patch -N src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java < "$SRCROOT/patches/ElasticWriter.java.patch"
patch -N ivy/ivy.xml < "$SRCROOT/patches/ivy.xml.patch"

# build nutch
"$ANT_HOME/bin/ant" runtime || { echo "!!! ant build failed" >&2; exit 1; }

# Install the custom config file.
cd runtime/local || { echo "!!! runtime/local missing after build" >&2; exit 1; }
cp "$SRCROOT/conf/nutch-site.xml" conf/nutch-site.xml || exit 1
# -p: do not skip the copy just because the urls directory already exists.
mkdir -p urls && cp "$SRCROOT/conf/urls.txt" urls

echo "Build complete, now run the crawler (see the README for details)."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#!/bin/bash
# index-url.bash - index a file of URLs
#
# Usage: index-url.bash <URL file>
#
# Copies the URL file into the nutch seed directory, runs one
# inject/generate/fetch/parse/updatedb round and then indexes the result
# into elasticsearch.
#
# Environment (optional):
#   NUTCH_HOME - defaults to /opt/nutch
#   JAVA_HOME  - defaults to /etc/alternatives/java_sdk

# Value passed to "nutch generate -topN".
TOPN="10"

if [ $# -ne 1 ]; then
  echo "Error: insufficient parameters" >&2
  echo "Usage: $0 <URL file>" >&2
  exit 1
fi

if [ -z "$NUTCH_HOME" ]; then
  NUTCH_HOME="/opt/nutch"
  echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME"
fi

if [ -z "$JAVA_HOME" ]; then
  export JAVA_HOME="/etc/alternatives/java_sdk"
fi

# Resolve to an absolute path now: the working directory changes below.
urlfile="$(readlink -f "$1")"
urldir="urls"
ts=$(date +%Y%m%d%H%M%S)

# Check the input before touching the existing seed directory.
if [ ! -f "$urlfile" ]; then
  echo "Error: URL file '$1' not found" >&2
  exit 1
fi

cd "$NUTCH_HOME/runtime/local" \
  || { echo "Error: cannot cd to $NUTCH_HOME/runtime/local" >&2; exit 1; }

# Back up whatever is currently at the seed location (directory, or a plain
# file left behind by an earlier run).
if [ -e "$urldir" ]; then
  mv "$urldir" "$urldir.bak-$ts" || exit 1
fi

echo "*** Copying $urlfile to $urldir"
# Recreate the seed directory so the file is copied *into* it; without this
# the copy would produce a plain file named "urls".
mkdir -p "$urldir" || exit 1
cp "$urlfile" "$urldir" || exit 1

echo "*** Running nutch crawl processes..."
echo "*** inject"
bin/nutch inject "$urldir/" || exit 1
echo "*** generate"
bin/nutch generate -topN "$TOPN" || exit 1
echo "*** fetch"
bin/nutch fetch -all || exit 1
echo "*** parse"
bin/nutch parse -all || exit 1
echo "*** updatedb"
bin/nutch updatedb || exit 1

echo "*** indexing into elasticsearch"
bin/nutch elasticindex elasticsearch -all || exit 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# setenv.bash - shared settings, sourced by the other scripts in this directory.
#
# Each version and root below is a default: a non-empty value already present
# in the environment is kept, so a different release or location can be
# selected without editing this file.
HBASE_VER="${HBASE_VER:-0.90.4}"
ANT_VER="${ANT_VER:-1.9.4}"
NUTCH_VER="${NUTCH_VER:-2.2.1}"

# Archive file names, derived from the versions above.
HBASE_ARCHIVE="hbase-$HBASE_VER.tar.gz"
ANT_ARCHIVE="apache-ant-$ANT_VER-bin.tar.gz"
NUTCH_ARCHIVE="apache-nutch-$NUTCH_VER-src.tar.gz"
ARCHIVES="$HBASE_ARCHIVE $ANT_ARCHIVE $NUTCH_ARCHIVE"

# SRCROOT: root holding this project's conf/, patches/ and downloads/.
# APPROOT: directory the dependencies are extracted into.
SRCROOT="${SRCROOT:-/vagrant}"
APPROOT="${APPROOT:-/opt}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#!/bin/bash
# setup.bash:
# - extract ant, nutch and hbase to /opt
#
# The versions, archive names, SRCROOT and APPROOT come from setenv.bash,
# which is sourced from the directory this script lives in.

. "$(dirname "$0")/setenv.bash" || exit 1

DOWNLOADS="$SRCROOT/downloads"

cd "$APPROOT" || { echo "Cannot cd to $APPROOT" >&2; exit 1; }

#######################################
# Extract one dependency into $APPROOT unless its versioned directory already
# exists, then (re)point the version-independent symlink at it.
# Globals:   DOWNLOADS, APPROOT (read)
# Arguments: $1 - label for the "<label> missing" message
#            $2 - short name; also the symlink name under $APPROOT
#            $3 - versioned install directory
#            $4 - archive file name under $DOWNLOADS
# Exits 1 if the archive is absent or cannot be extracted.
#######################################
install_dep() {
  local label=$1 name=$2 home=$3 archive=$4

  if [ ! -d "$home" ]; then
    echo "$label missing"
    if [ -f "$DOWNLOADS/$archive" ]; then
      echo "Extracting $name archive $archive..."
      # Stop on a failed extraction instead of printing "done." and linking
      # to a directory that is missing or incomplete.
      tar -xzf "$DOWNLOADS/$archive" -C "$APPROOT" \
        || { echo "Failed to extract $archive" >&2; exit 1; }
      echo "done."
    else
      echo "Need to download $name, run wget-deps.bash"
      exit 1
    fi
  fi
  ln -nsf "$home" "$APPROOT/$name"
}

NUTCH_HOME="$APPROOT/apache-nutch-$NUTCH_VER"
install_dep "Nutch" "nutch" "$NUTCH_HOME" "$NUTCH_ARCHIVE"

ANT_HOME="$APPROOT/apache-ant-$ANT_VER"
install_dep "Ant" "ant" "$ANT_HOME" "$ANT_ARCHIVE"

HBASE_HOME="$APPROOT/hbase-$HBASE_VER"
install_dep "hbase" "hbase" "$HBASE_HOME" "$HBASE_ARCHIVE"

echo "Now:"
echo " 1. Start hbase: $APPROOT/hbase/bin/start-hbase.sh"
echo " 2. Build nutch: $SRCROOT/bin/build-nutch.bash"
echo " 3. Index a file of URLs: $SRCROOT/bin/index-url.bash $SRCROOT/conf/urls.txt"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/usr/bin/env python
# test_environment.py
# Smoke test: GET the URL in esBaseUrl and pretty-print the JSON it returns.
import requests

try:
    import simplejson as json
except ImportError:
    # simplejson is optional; the standard-library module offers the same
    # dumps() call used below.
    import json

esBaseUrl = "http://localhost:9200"

response = requests.get(esBaseUrl).json()
# The parenthesised single-argument form is valid on both Python 2 and 3;
# the bare "print x" statement is a syntax error on Python 3.
print(json.dumps(response, indent=2))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash
# wget-deps.bash - download dependencies
#
# Downloads the hbase, ant and nutch archives into $ROOTDIR/downloads, skipping
# any archive that is already there. Versions and archive names come from
# setenv.bash, which is sourced from the directory this script lives in.
ROOTDIR="/vagrant"

. "$(dirname "$0")/setenv.bash" || exit 1

HBASE_URL="http://archive.apache.org/dist/hbase/hbase-$HBASE_VER/$HBASE_ARCHIVE"
ANT_URL="http://mirror.catn.com/pub/apache/ant/binaries/$ANT_ARCHIVE"
# The directory component follows NUTCH_VER so it stays in step with the
# archive name instead of being pinned to one release.
NUTCH_URL="http://mirror.gopotato.co.uk/apache/nutch/$NUTCH_VER/$NUTCH_ARCHIVE"

# wget saves into the current directory, so do not continue if the downloads
# directory cannot be entered.
cd "$ROOTDIR/downloads" \
  || { echo "Cannot cd to $ROOTDIR/downloads" >&2; exit 1; }

#######################################
# Download one archive unless a file of that name is already present.
# Arguments: $1 - archive file name (the last path component of $2)
#            $2 - URL to fetch
# Exits 1 if the download fails.
#######################################
fetch() {
  local archive=$1 url=$2

  if [ ! -f "$archive" ]; then
    if ! wget "$url"; then
      echo "Failed to download $url" >&2
      # Remove any partial file so the next run does not mistake it for a
      # complete archive.
      rm -f -- "$archive"
      exit 1
    fi
  fi
}

fetch "$HBASE_ARCHIVE" "$HBASE_URL"
fetch "$ANT_ARCHIVE" "$ANT_URL"
fetch "$NUTCH_ARCHIVE" "$NUTCH_URL"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
This is the `conf` directory, containing configuration files. | ||
|
||
Files | ||
----- | ||
|
||
* `urls.txt` - a sample file containing URLs to be crawled by nutch. | ||
* `nutch-site.xml` - XML configuration for nutch. Copied into nutch at build time. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
<?xml version="1.0"?> | ||
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | ||
<!-- Put site-specific property overrides in this file. --> | ||
<configuration> | ||
<property> | ||
<name>storage.data.store.class</name> | ||
<value>org.apache.gora.hbase.store.HBaseStore</value> | ||
<description>Default class for storing data</description> | ||
</property> | ||
<property> | ||
<name>http.agent.name</name> | ||
<value>Kusiri</value> | ||
<description/> | ||
</property> | ||
<!-- Elasticsearch properties --> | ||
<property> | ||
<name>elastic.cluster</name> | ||
<value>elasticsearch</value> | ||
<description>The cluster name to discover. Either host and port must be defined | ||
or cluster.</description> | ||
</property> | ||
<property> | ||
<name>elastic.index</name> | ||
<value>nutch</value> | ||
<description>Default index to send documents to.</description> | ||
</property> | ||
<property> | ||
<name>elastic.max.bulk.docs</name> | ||
<value>250</value> | ||
<description>Maximum size of the bulk in number of documents.</description> | ||
</property> | ||
<property> | ||
<name>elastic.max.bulk.size</name> | ||
<value>2500500</value> | ||
<description>Maximum size of the bulk in bytes.</description> | ||
</property> | ||
</configuration> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
http://www.example.com |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
* | ||
!README.md | ||
!.gitignore | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Directory for holding downloaded archives. Do not commit anything in here. See ../README.md for this project's README file. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
--- src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java.orig 2014-06-09 19:48:24.008385551 +0000 | ||
+++ src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java 2014-06-09 19:48:40.792385600 +0000 | ||
@@ -101,7 +101,7 @@ | ||
BulkResponse actionGet = execute.actionGet(); | ||
if (actionGet.hasFailures()) { | ||
for (BulkItemResponse item : actionGet) { | ||
- if (item.failed()) { | ||
+ if (item.isFailed()) { | ||
throw new RuntimeException("First failure in bulk: " | ||
+ item.getFailureMessage()); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
--- ivy/ivy.xml.orig 2014-06-09 19:49:30.280385707 +0000 | ||
+++ ivy/ivy.xml 2014-06-09 20:24:07.973383296 +0000 | ||
@@ -32,7 +32,7 @@ | ||
</publications> | ||
|
||
<dependencies> | ||
- <dependency org="org.elasticsearch" name="elasticsearch" rev="0.19.4" | ||
+ <dependency org="org.elasticsearch" name="elasticsearch" rev="1.1.1" | ||
conf="*->default"/> | ||
|
||
<dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0" | ||
@@ -111,9 +111,7 @@ | ||
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/> | ||
--> | ||
<!-- Uncomment this to use HBase as Gora backend. --> | ||
- <!-- | ||
<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" /> | ||
- --> | ||
<!-- Uncomment this to use Accumulo as Gora backend. --> | ||
<!-- | ||
<dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" /> |