Skip to content

Commit

Permalink
#2 Initial nutch/hbase/elasticsearch scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
Dafydd James committed Jun 13, 2014
1 parent c8158d4 commit 2f48885
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 11 deletions.
22 changes: 11 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,17 @@ Troubleshooting

### ClusterBlockException

[vagrant@localhost local]$ bin/nutch elasticindex elasticsearch -all
Exception in thread "elasticsearch[Caiera][generic][T#2]" org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];[SERVICE_UNAVAILABLE/2/no master];
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedException(ClusterBlocks.java:138)
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedRaiseException(ClusterBlocks.java:128)
at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:197)
at org.elasticsearch.action.bulk.TransportBulkAction.access$000(TransportBulkAction.java:65)
at org.elasticsearch.action.bulk.TransportBulkAction$1.onFailure(TransportBulkAction.java:143)
at org.elasticsearch.action.support.TransportAction$ThreadedActionListener$2.run(TransportAction.java:117)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
[vagrant@localhost local]$ bin/nutch elasticindex elasticsearch -all
Exception in thread "elasticsearch[Caiera][generic][T#2]" org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];[SERVICE_UNAVAILABLE/2/no master];
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedException(ClusterBlocks.java:138)
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedRaiseException(ClusterBlocks.java:128)
at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:197)
at org.elasticsearch.action.bulk.TransportBulkAction.access$000(TransportBulkAction.java:65)
at org.elasticsearch.action.bulk.TransportBulkAction$1.onFailure(TransportBulkAction.java:143)
at org.elasticsearch.action.support.TransportAction$ThreadedActionListener$2.run(TransportAction.java:117)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)

Check the following:
* Your elasticsearch configuration is correct.
Expand Down
40 changes: 40 additions & 0 deletions bin/build-nutch.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# build-nutch.bash - build nutch on CentOS 6 using apache ant with OpenJDK-1.7.0.
# Prerequisites:
#   Must have openjdk RPM installed, and nutch and ant extracted into /opt
#   (see setup.bash). setenv.bash supplies SRCROOT and the version variables.
ts=$(date +%Y%m%d%H%M%S)

. "$(dirname "$0")/setenv.bash" || { echo "Cannot source setenv.bash" >&2; exit 1; }
: "${SRCROOT:?setenv.bash did not define SRCROOT}"

# Fall back to sensible defaults for anything the environment did not set.
# JAVA_HOME is exported so ant/javac pick it up.
if [ -z "${JAVA_HOME:-}" ]; then
  export JAVA_HOME="/etc/alternatives/java_sdk"
fi

if [ -z "${NUTCH_HOME:-}" ]; then
  NUTCH_HOME="/opt/nutch"
  echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME"
fi

if [ -z "${ANT_HOME:-}" ]; then
  ANT_HOME="/opt/ant"
  echo "!!! ANT_HOME not set, defaulting to $ANT_HOME"
fi

cd "$NUTCH_HOME" || { echo "Cannot cd to $NUTCH_HOME" >&2; exit 1; }

# Keep a timestamped copy of any previous build output instead of clobbering it.
if [ -d runtime ]; then
  echo "Backing up previous runtime directory"
  mv runtime "runtime.bak-$ts"
fi

# Apply local fixes (elasticsearch 1.x API rename, gora-hbase dependency).
# -N skips patches that look already applied, so re-running is safe.
patch -N src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java \
  < "$SRCROOT/patches/ElasticWriter.java.patch"
patch -N ivy/ivy.xml < "$SRCROOT/patches/ivy.xml.patch"

# build nutch
"$ANT_HOME/bin/ant" runtime || { echo "ant build failed" >&2; exit 1; }

# Install the custom config file and the seed URL list.
cd runtime/local || { echo "Build produced no runtime/local directory" >&2; exit 1; }
cp "$SRCROOT/conf/nutch-site.xml" conf/nutch-site.xml
mkdir -p urls && cp "$SRCROOT/conf/urls.txt" urls

echo "Build complete, now run the crawler (see the README for details)."
47 changes: 47 additions & 0 deletions bin/index-url.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
# index-url.bash - index a file of URLs: run the full nutch
# inject/generate/fetch/parse/updatedb cycle, then push into elasticsearch.
# Usage: index-url.bash <URL file>

# Maximum number of URLs selected per generate cycle.
TOPN="10"

if [ $# -ne 1 ]; then
  echo "Error: insufficient parameters" >&2
  echo "Usage: $0 <URL file>" >&2
  exit 1
fi

if [ -z "${NUTCH_HOME:-}" ]; then
  NUTCH_HOME="/opt/nutch"
  echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME"
fi

if [ -z "${JAVA_HOME:-}" ]; then
  export JAVA_HOME="/etc/alternatives/java_sdk"
fi

# Resolve the URL file to an absolute path BEFORE we cd away from the
# caller's working directory, and fail early if it does not exist.
urlfile=$(readlink -f -- "$1") || { echo "Cannot resolve $1" >&2; exit 1; }
[ -f "$urlfile" ] || { echo "Not a file: $urlfile" >&2; exit 1; }
urldir="urls"
ts=$(date +%Y%m%d%H%M%S)

cd "$NUTCH_HOME/runtime/local" || { echo "Cannot cd to $NUTCH_HOME/runtime/local" >&2; exit 1; }

# Preserve any previous seed directory under a timestamped name.
if [ -d "$urldir" ]; then
  mv "$urldir" "$urldir.bak-$ts"
fi

echo "*** Copying $urlfile to $urldir"
# $urldir must be a DIRECTORY: 'nutch inject urls/' takes a seed directory
# (and build-nutch.bash creates it as one). The previous 'cp file urls'
# left a flat file named 'urls' instead.
mkdir -p "$urldir" && cp "$urlfile" "$urldir/"

echo "*** Running nutch crawl processes..."
echo "*** inject"
bin/nutch inject urls/ || exit 1
echo "*** generate"
bin/nutch generate -topN "$TOPN" || exit 1
echo "*** fetch"
bin/nutch fetch -all || exit 1
echo "*** parse"
bin/nutch parse -all || exit 1
echo "*** updatedb"
bin/nutch updatedb || exit 1

echo "*** indexing into elasticsearch"
bin/nutch elasticindex elasticsearch -all || exit 1
12 changes: 12 additions & 0 deletions bin/setenv.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# setenv.bash - shared settings for the bin/ scripts (sourced, not executed).
# Defines component versions, the archive filenames derived from them, and
# the source/install root directories used by the Vagrant provisioning.

# Component versions.
HBASE_VER="0.90.4"
ANT_VER="1.9.4"
NUTCH_VER="2.2.1"

# Tarball names as published on the Apache mirrors.
HBASE_ARCHIVE="hbase-${HBASE_VER}.tar.gz"
ANT_ARCHIVE="apache-ant-${ANT_VER}-bin.tar.gz"
NUTCH_ARCHIVE="apache-nutch-${NUTCH_VER}-src.tar.gz"
ARCHIVES="${HBASE_ARCHIVE} ${ANT_ARCHIVE} ${NUTCH_ARCHIVE}"

# Where the project sources live (Vagrant shared folder) and where the
# extracted applications are installed.
SRCROOT="/vagrant"
APPROOT="/opt"
59 changes: 59 additions & 0 deletions bin/setup.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# setup.bash:
# - extract ant, nutch and hbase from $SRCROOT/downloads into $APPROOT
# - maintain stable version-independent symlinks (/opt/nutch, /opt/ant, /opt/hbase)

. "$(dirname "$0")/setenv.bash" || { echo "Cannot source setenv.bash" >&2; exit 1; }

DOWNLOADS="$SRCROOT/downloads"

# extract_if_missing <Label> <label> <target dir> <archive filename>
# Unpacks <archive> into $APPROOT unless <target dir> already exists.
# <Label>/<label> are the display name in original/lower-case form,
# matching the original per-component messages. Exits on failure.
extract_if_missing() {
  local label="$1" lc="$2" home="$3" archive="$4"
  if [ ! -d "$home" ]; then
    echo "$label missing"
    if [ -f "$DOWNLOADS/$archive" ]; then
      echo "Extracting $lc archive $archive..."
      cd "$APPROOT" || { echo "Cannot cd to $APPROOT" >&2; exit 1; }
      tar -xzf "$DOWNLOADS/$archive" || { echo "Failed to extract $archive" >&2; exit 1; }
      echo "done."
    else
      echo "Need to download $lc, run wget-deps.bash"
      exit 1
    fi
  fi
}

cd "$APPROOT" || { echo "Cannot cd to $APPROOT" >&2; exit 1; }

# Was "/$APPROOT/..." — APPROOT is already absolute, so that produced a
# double-slash path inconsistent with ANT_HOME/HBASE_HOME below.
NUTCH_HOME="$APPROOT/apache-nutch-$NUTCH_VER"
extract_if_missing "Nutch" "nutch" "$NUTCH_HOME" "$NUTCH_ARCHIVE"
ln -nsf "$NUTCH_HOME" "$APPROOT/nutch"

ANT_HOME="$APPROOT/apache-ant-$ANT_VER"
extract_if_missing "Ant" "ant" "$ANT_HOME" "$ANT_ARCHIVE"
ln -nsf "$ANT_HOME" "$APPROOT/ant"

HBASE_HOME="$APPROOT/hbase-$HBASE_VER"
extract_if_missing "hbase" "hbase" "$HBASE_HOME" "$HBASE_ARCHIVE"
ln -nsf "$HBASE_HOME" "$APPROOT/hbase"

echo "Now:"
echo " 1. Start hbase: /opt/hbase/bin/start-hbase.sh"
echo " 2. Build nutch: /vagrant/bin/build-nutch.bash"
echo " 3. Index a file of URLs: /vagrant/bin/index-url.bash /vagrant/conf/urls.txt"
10 changes: 10 additions & 0 deletions bin/test_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python
# test_environment.py - smoke-test the local elasticsearch instance by
# fetching its root endpoint and pretty-printing the JSON response.
# Exits with a requests exception if elasticsearch is not listening.
import json  # stdlib json replaces the third-party simplejson dependency

import requests

ES_BASE_URL = "http://localhost:9200"

response = requests.get(ES_BASE_URL).json()
# print(...) with a single argument is valid under both Python 2 and 3
# (the original bare 'print' statement was Python-2-only).
print(json.dumps(response, indent=2))

23 changes: 23 additions & 0 deletions bin/wget-deps.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# wget-deps.bash - download the hbase, ant and nutch archives into
# $ROOTDIR/downloads. Versions and archive names come from setenv.bash.
ROOTDIR="/vagrant"

. "$(dirname "$0")/setenv.bash" || { echo "Cannot source setenv.bash" >&2; exit 1; }

# Mirror URLs for each archive. The nutch path now uses $NUTCH_VER — it was
# hard-coded to "2.2.1", which silently broke whenever setenv.bash bumped
# the version while the archive filename moved on.
HBASE_URL="http://archive.apache.org/dist/hbase/hbase-$HBASE_VER/$HBASE_ARCHIVE"
ANT_URL="http://mirror.catn.com/pub/apache/ant/binaries/$ANT_ARCHIVE"
NUTCH_URL="http://mirror.gopotato.co.uk/apache/nutch/$NUTCH_VER/$NUTCH_ARCHIVE"

cd "$ROOTDIR/downloads" || { echo "Cannot cd to $ROOTDIR/downloads" >&2; exit 1; }

# Fetch each archive only if it is not already present.
if [ ! -f "$HBASE_ARCHIVE" ]; then
  wget "$HBASE_URL"
fi

if [ ! -f "$ANT_ARCHIVE" ]; then
  wget "$ANT_URL"
fi

if [ ! -f "$NUTCH_ARCHIVE" ]; then
  wget "$NUTCH_URL"
fi
7 changes: 7 additions & 0 deletions conf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
This is the `conf` directory, containing configuration files.

Files
-----

* `urls.txt` - a sample file containing URLs to be crawled by nutch.
* `nutch-site.xml` - XML configuration for nutch. Copied into nutch at build time.
37 changes: 37 additions & 0 deletions conf/nutch-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.hbase.store.HBaseStore</value>
<description>Default class for storing data</description>
</property>
<property>
<name>http.agent.name</name>
<value>Kusiri</value>
<description/>
</property>
<!-- Elasticsearch properties -->
<property>
<name>elastic.cluster</name>
<value>elasticsearch</value>
<description>The cluster name to discover. Either host and port must be defined
or cluster.</description>
</property>
<property>
<name>elastic.index</name>
<value>nutch</value>
<description>Default index to send documents to.</description>
</property>
<property>
<name>elastic.max.bulk.docs</name>
<value>250</value>
<description>Maximum size of the bulk in number of documents.</description>
</property>
<property>
<name>elastic.max.bulk.size</name>
<value>2500500</value>
<description>Maximum size of the bulk in bytes.</description>
</property>
</configuration>
1 change: 1 addition & 0 deletions conf/urls.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://www.example.com
4 changes: 4 additions & 0 deletions downloads/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*
!README.md
!.gitignore

1 change: 1 addition & 0 deletions downloads/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Directory for holding downloaded archives. Do not commit anything in here. See ../README.md for this project's README file.
11 changes: 11 additions & 0 deletions patches/ElasticWriter.java.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java.orig 2014-06-09 19:48:24.008385551 +0000
+++ src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java 2014-06-09 19:48:40.792385600 +0000
@@ -101,7 +101,7 @@
BulkResponse actionGet = execute.actionGet();
if (actionGet.hasFailures()) {
for (BulkItemResponse item : actionGet) {
- if (item.failed()) {
+ if (item.isFailed()) {
throw new RuntimeException("First failure in bulk: "
+ item.getFailureMessage());
}
21 changes: 21 additions & 0 deletions patches/ivy.xml.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
--- ivy/ivy.xml.orig 2014-06-09 19:49:30.280385707 +0000
+++ ivy/ivy.xml 2014-06-09 20:24:07.973383296 +0000
@@ -32,7 +32,7 @@
</publications>

<dependencies>
- <dependency org="org.elasticsearch" name="elasticsearch" rev="0.19.4"
+ <dependency org="org.elasticsearch" name="elasticsearch" rev="1.1.1"
conf="*->default"/>

<dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0"
@@ -111,9 +111,7 @@
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
-->
<!-- Uncomment this to use HBase as Gora backend. -->
- <!--
<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />
- -->
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
<dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" />

0 comments on commit 2f48885

Please sign in to comment.