From 2f488853f9b631016b8fcec38f48ba804da0c100 Mon Sep 17 00:00:00 2001 From: Dafydd James Date: Fri, 13 Jun 2014 22:19:14 +0100 Subject: [PATCH] #2 Initial nutch/hbase/elasticsearch scripts. --- README.md | 22 ++++++------ bin/build-nutch.bash | 40 ++++++++++++++++++++++ bin/index-url.bash | 47 +++++++++++++++++++++++++ bin/setenv.bash | 12 +++++++ bin/setup.bash | 59 ++++++++++++++++++++++++++++++++ bin/test_environment.py | 10 ++++++ bin/wget-deps.bash | 23 +++++++++++++ conf/README.md | 7 ++++ conf/nutch-site.xml | 37 ++++++++++++++++++++ conf/urls.txt | 1 + downloads/.gitignore | 4 +++ downloads/README.md | 1 + patches/ElasticWriter.java.patch | 11 ++++++ patches/ivy.xml.patch | 21 ++++++++++++ 14 files changed, 284 insertions(+), 11 deletions(-) create mode 100755 bin/build-nutch.bash create mode 100755 bin/index-url.bash create mode 100644 bin/setenv.bash create mode 100755 bin/setup.bash create mode 100755 bin/test_environment.py create mode 100755 bin/wget-deps.bash create mode 100644 conf/README.md create mode 100644 conf/nutch-site.xml create mode 100644 conf/urls.txt create mode 100644 downloads/.gitignore create mode 100644 downloads/README.md create mode 100644 patches/ElasticWriter.java.patch create mode 100644 patches/ivy.xml.patch diff --git a/README.md b/README.md index faba3d3..6e19513 100644 --- a/README.md +++ b/README.md @@ -117,17 +117,17 @@ Troubleshooting ### ClusterBlockException -[vagrant@localhost local]$ bin/nutch elasticindex elasticsearch -all -Exception in thread "elasticsearch[Caiera][generic][T#2]" org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];[SERVICE_UNAVAILABLE/2/no master]; - at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedException(ClusterBlocks.java:138) - at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedRaiseException(ClusterBlocks.java:128) - at 
org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:197) - at org.elasticsearch.action.bulk.TransportBulkAction.access$000(TransportBulkAction.java:65) - at org.elasticsearch.action.bulk.TransportBulkAction$1.onFailure(TransportBulkAction.java:143) - at org.elasticsearch.action.support.TransportAction$ThreadedActionListener$2.run(TransportAction.java:117) - at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) - at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) - at java.lang.Thread.run(Thread.java:744) + [vagrant@localhost local]$ bin/nutch elasticindex elasticsearch -all + Exception in thread "elasticsearch[Caiera][generic][T#2]" org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];[SERVICE_UNAVAILABLE/2/no master]; + at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedException(ClusterBlocks.java:138) + at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedRaiseException(ClusterBlocks.java:128) + at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:197) + at org.elasticsearch.action.bulk.TransportBulkAction.access$000(TransportBulkAction.java:65) + at org.elasticsearch.action.bulk.TransportBulkAction$1.onFailure(TransportBulkAction.java:143) + at org.elasticsearch.action.support.TransportAction$ThreadedActionListener$2.run(TransportAction.java:117) + at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) + at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) + at java.lang.Thread.run(Thread.java:744) Check the following: * Your elasticsearch configuration is correct. 
diff --git a/bin/build-nutch.bash b/bin/build-nutch.bash new file mode 100755 index 0000000..7d87ebf --- /dev/null +++ b/bin/build-nutch.bash @@ -0,0 +1,40 @@ +#!/bin/bash +# Build nutch on CentOS 6 using apache ant with OpenJDK-1.7.0. +# Prerequisites: +# Must have openjdk RPM installed, and nutch and ant extracted into /opt +ts=`date +%Y%m%d%H%M%S` + +. `dirname $0`/setenv.bash + +if [ -z "$JAVA_HOME" ]; then + export JAVA_HOME="/etc/alternatives/java_sdk" +fi + +if [ -z "$NUTCH_HOME" ]; then + NUTCH_HOME="/opt/nutch" + echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME" +fi + +if [ -z "$ANT_HOME" ]; then + ANT_HOME="/opt/ant" + echo "!!! ANT_HOME not set, defaulting to $ANT_HOME" +fi + +cd "$NUTCH_HOME" || exit 1 +if [ -d runtime ]; then + echo "Backing up previous runtime directory" + mv runtime runtime.bak-$ts +fi + +patch -N src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java < $SRCROOT/patches/ElasticWriter.java.patch +patch -N ivy/ivy.xml < $SRCROOT/patches/ivy.xml.patch + +# build nutch, stopping on failure so a broken build is never reported as complete +$ANT_HOME/bin/ant runtime || exit 1 + +# Install the custom config file. +cd runtime/local || exit 1 +cp $SRCROOT/conf/nutch-site.xml conf/nutch-site.xml +mkdir -p urls && cp $SRCROOT/conf/urls.txt urls + +echo "Build complete, now run the crawler (see the README for details)." diff --git a/bin/index-url.bash b/bin/index-url.bash new file mode 100755 index 0000000..5943d13 --- /dev/null +++ b/bin/index-url.bash @@ -0,0 +1,47 @@ +#!/bin/bash +# index-url.bash - index a file of URLs + +TOPN="10" + +if [ $# -ne 1 ]; then + echo "Error: insufficient parameters" + echo "Usage: $0 <urlfile>" + exit 1 +fi + +if [ -z "$NUTCH_HOME" ]; then + NUTCH_HOME="/opt/nutch" + echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME" +fi + +if [ -z 
"$JAVA_HOME" ]; then + export JAVA_HOME="/etc/alternatives/java_sdk" +fi + +urlfile="$(readlink -f "$1")" +urldir="urls" +ts=`date +%Y%m%d%H%M%S` + +cd "$NUTCH_HOME/runtime/local" || exit 1 + +if [ -e $urldir ]; then + mv $urldir $urldir.bak-$ts +fi + +echo "*** Copying $urlfile to $urldir" +mkdir -p $urldir && cp "$urlfile" $urldir/ || exit 1 + +echo "*** Running nutch crawl processes..." +echo "*** inject" +bin/nutch inject urls/ || exit 1 +echo "*** generate" +bin/nutch generate -topN $TOPN || exit 1 +echo "*** fetch" +bin/nutch fetch -all || exit 1 +echo "*** parse" +bin/nutch parse -all || exit 1 +echo "*** updatedb" +bin/nutch updatedb || exit 1 + +echo "*** indexing into elasticsearch" +bin/nutch elasticindex elasticsearch -all || exit 1 diff --git a/bin/setenv.bash b/bin/setenv.bash new file mode 100644 index 0000000..18b22df --- /dev/null +++ b/bin/setenv.bash @@ -0,0 +1,12 @@ +#!/bin/bash +HBASE_VER="0.90.4" +ANT_VER="1.9.4" +NUTCH_VER="2.2.1" + +HBASE_ARCHIVE="hbase-$HBASE_VER.tar.gz" +ANT_ARCHIVE="apache-ant-$ANT_VER-bin.tar.gz" +NUTCH_ARCHIVE="apache-nutch-$NUTCH_VER-src.tar.gz" +ARCHIVES="$HBASE_ARCHIVE $ANT_ARCHIVE $NUTCH_ARCHIVE" + +SRCROOT="/vagrant" +APPROOT="/opt" diff --git a/bin/setup.bash b/bin/setup.bash new file mode 100755 index 0000000..35ebf41 --- /dev/null +++ b/bin/setup.bash @@ -0,0 +1,59 @@ +#!/bin/bash +# setup.bash: +# - extract ant, nutch and hbase to /opt + +. `dirname $0`/setenv.bash + +DOWNLOADS="$SRCROOT/downloads" + +cd "$APPROOT" || exit 1 + +NUTCH_HOME="$APPROOT/apache-nutch-$NUTCH_VER" +if [ ! -d $NUTCH_HOME ]; then + echo "Nutch missing" + if [ -f $DOWNLOADS/$NUTCH_ARCHIVE ]; then + echo "Extracting nutch archive $NUTCH_ARCHIVE..." + cd $APPROOT + tar -xzf $DOWNLOADS/$NUTCH_ARCHIVE || exit 1 + echo "done." + else + echo "Need to download nutch, run wget-deps.bash" + exit 1 + fi +fi +ln -nsf $NUTCH_HOME $APPROOT/nutch + +ANT_HOME="$APPROOT/apache-ant-$ANT_VER" +if [ ! 
-d $ANT_HOME ]; then + echo "Ant missing" + if [ -f $DOWNLOADS/$ANT_ARCHIVE ]; then + echo "Extracting ant archive $ANT_ARCHIVE..." + cd $APPROOT + tar -xzf $DOWNLOADS/$ANT_ARCHIVE || exit 1 + echo "done." + else + echo "Need to download ant, run wget-deps.bash" + exit 1 + fi +fi +ln -nsf $ANT_HOME $APPROOT/ant + +HBASE_HOME="$APPROOT/hbase-$HBASE_VER" +if [ ! -d $HBASE_HOME ]; then + echo "hbase missing" + if [ -f $DOWNLOADS/$HBASE_ARCHIVE ]; then + echo "Extracting hbase archive $HBASE_ARCHIVE..." + cd $APPROOT + tar -xzf $DOWNLOADS/$HBASE_ARCHIVE || exit 1 + echo "done." + else + echo "Need to download hbase, run wget-deps.bash" + exit 1 + fi +fi +ln -nsf $HBASE_HOME $APPROOT/hbase + +echo "Now:" +echo " 1. Start hbase: $APPROOT/hbase/bin/start-hbase.sh" +echo " 2. Build nutch: $SRCROOT/bin/build-nutch.bash" +echo " 3. Index a file of URLs: $SRCROOT/bin/index-url.bash $SRCROOT/conf/urls.txt" diff --git a/bin/test_environment.py b/bin/test_environment.py new file mode 100755 index 0000000..58db3c5 --- /dev/null +++ b/bin/test_environment.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# test_environment.py +import requests +import simplejson as json + +esBaseUrl = "http://localhost:9200" + +response = requests.get(esBaseUrl).json() +print(json.dumps(response, indent=2)) + diff --git a/bin/wget-deps.bash b/bin/wget-deps.bash new file mode 100755 index 0000000..5171853 --- /dev/null +++ b/bin/wget-deps.bash @@ -0,0 +1,23 @@ +#!/bin/bash +# wget-deps.bash - download dependencies +ROOTDIR="/vagrant" + +. `dirname $0`/setenv.bash + +HBASE_URL="http://archive.apache.org/dist/hbase/hbase-$HBASE_VER/$HBASE_ARCHIVE" +ANT_URL="http://mirror.catn.com/pub/apache/ant/binaries/$ANT_ARCHIVE" +NUTCH_URL="http://mirror.gopotato.co.uk/apache/nutch/$NUTCH_VER/$NUTCH_ARCHIVE" + +cd $ROOTDIR/downloads || exit 1 + +if [ ! -f $HBASE_ARCHIVE ]; then + wget $HBASE_URL +fi + +if [ ! -f $ANT_ARCHIVE ]; then + wget $ANT_URL +fi + +if [ ! 
-f $NUTCH_ARCHIVE ]; then + wget $NUTCH_URL +fi diff --git a/conf/README.md b/conf/README.md new file mode 100644 index 0000000..c10d0d2 --- /dev/null +++ b/conf/README.md @@ -0,0 +1,7 @@ +This is the `conf` directory, containing configuration files. + +Files +----- + +* `urls.txt` - a sample file containing URLs to be crawled by nutch. +* `nutch-site.xml` - XML configuration for nutch. Copied into nutch at build time. diff --git a/conf/nutch-site.xml b/conf/nutch-site.xml new file mode 100644 index 0000000..fbc160c --- /dev/null +++ b/conf/nutch-site.xml @@ -0,0 +1,37 @@ + + + + + + storage.data.store.class + org.apache.gora.hbase.store.HBaseStore + Default class for storing data + + + http.agent.name + Kusiri + + + + + elastic.cluster + elasticsearch + The cluster name to discover. Either host and port must be defined + or cluster. + + + elastic.index + nutch + Default index to send documents to. + + + elastic.max.bulk.docs + 250 + Maximum size of the bulk in number of documents. + + + elastic.max.bulk.size + 2500500 + Maximum size of the bulk in bytes. + + \ No newline at end of file diff --git a/conf/urls.txt b/conf/urls.txt new file mode 100644 index 0000000..5b04785 --- /dev/null +++ b/conf/urls.txt @@ -0,0 +1 @@ +http://www.example.com diff --git a/downloads/.gitignore b/downloads/.gitignore new file mode 100644 index 0000000..e543ea9 --- /dev/null +++ b/downloads/.gitignore @@ -0,0 +1,4 @@ +* +!README.md +!.gitignore + diff --git a/downloads/README.md b/downloads/README.md new file mode 100644 index 0000000..0cc6549 --- /dev/null +++ b/downloads/README.md @@ -0,0 +1 @@ +Directory for holding downloaded archives. Do not commit anything in here. See ../README.md for this project's README file. 
diff --git a/patches/ElasticWriter.java.patch b/patches/ElasticWriter.java.patch new file mode 100644 index 0000000..823b475 --- /dev/null +++ b/patches/ElasticWriter.java.patch @@ -0,0 +1,11 @@ +--- src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java.orig 2014-06-09 19:48:24.008385551 +0000 ++++ src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java 2014-06-09 19:48:40.792385600 +0000 +@@ -101,7 +101,7 @@ + BulkResponse actionGet = execute.actionGet(); + if (actionGet.hasFailures()) { + for (BulkItemResponse item : actionGet) { +- if (item.failed()) { ++ if (item.isFailed()) { + throw new RuntimeException("First failure in bulk: " + + item.getFailureMessage()); + } diff --git a/patches/ivy.xml.patch b/patches/ivy.xml.patch new file mode 100644 index 0000000..d7c5c9d --- /dev/null +++ b/patches/ivy.xml.patch @@ -0,0 +1,21 @@ +--- ivy/ivy.xml.orig 2014-06-09 19:49:30.280385707 +0000 ++++ ivy/ivy.xml 2014-06-09 20:24:07.973383296 +0000 +@@ -32,7 +32,7 @@ + + + +- + + + --> + +- + +