Skip to content

Commit

Permalink
#2 Initial nutch/hbase/elasticsearch scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
Dafydd James committed Jun 13, 2014
1 parent c8158d4 commit 2f48885
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 11 deletions.
22 changes: 11 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,17 @@ Troubleshooting

### ClusterBlockException

[vagrant@localhost local]$ bin/nutch elasticindex elasticsearch -all
Exception in thread "elasticsearch[Caiera][generic][T#2]" org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];[SERVICE_UNAVAILABLE/2/no master];
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedException(ClusterBlocks.java:138)
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedRaiseException(ClusterBlocks.java:128)
at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:197)
at org.elasticsearch.action.bulk.TransportBulkAction.access$000(TransportBulkAction.java:65)
at org.elasticsearch.action.bulk.TransportBulkAction$1.onFailure(TransportBulkAction.java:143)
at org.elasticsearch.action.support.TransportAction$ThreadedActionListener$2.run(TransportAction.java:117)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
[vagrant@localhost local]$ bin/nutch elasticindex elasticsearch -all
Exception in thread "elasticsearch[Caiera][generic][T#2]" org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];[SERVICE_UNAVAILABLE/2/no master];
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedException(ClusterBlocks.java:138)
at org.elasticsearch.cluster.block.ClusterBlocks.globalBlockedRaiseException(ClusterBlocks.java:128)
at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:197)
at org.elasticsearch.action.bulk.TransportBulkAction.access$000(TransportBulkAction.java:65)
at org.elasticsearch.action.bulk.TransportBulkAction$1.onFailure(TransportBulkAction.java:143)
at org.elasticsearch.action.support.TransportAction$ThreadedActionListener$2.run(TransportAction.java:117)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)

Check the following:
* Your elasticsearch configuration is correct.
Expand Down
40 changes: 40 additions & 0 deletions bin/build-nutch.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# build-nutch.bash - build nutch on CentOS 6 using apache ant with OpenJDK-1.7.0.
# Prerequisites:
#   Must have openjdk RPM installed, and nutch and ant extracted into /opt
#   (see setup.bash). setenv.bash supplies SRCROOT and the version variables.
ts=$(date +%Y%m%d%H%M%S)

. "$(dirname "$0")/setenv.bash" || { echo "Cannot source setenv.bash" >&2; exit 1; }
: "${SRCROOT:?setenv.bash did not define SRCROOT}"

# Fall back to sensible defaults for anything the environment did not set.
# JAVA_HOME is exported so ant/javac pick it up.
if [ -z "${JAVA_HOME:-}" ]; then
  export JAVA_HOME="/etc/alternatives/java_sdk"
fi

if [ -z "${NUTCH_HOME:-}" ]; then
  NUTCH_HOME="/opt/nutch"
  echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME"
fi

if [ -z "${ANT_HOME:-}" ]; then
  ANT_HOME="/opt/ant"
  echo "!!! ANT_HOME not set, defaulting to $ANT_HOME"
fi

cd "$NUTCH_HOME" || { echo "Cannot cd to $NUTCH_HOME" >&2; exit 1; }

# Keep a timestamped copy of any previous build output instead of clobbering it.
if [ -d runtime ]; then
  echo "Backing up previous runtime directory"
  mv runtime "runtime.bak-$ts"
fi

# Apply local fixes (elasticsearch 1.x API rename, gora-hbase dependency).
# -N skips patches that look already applied, so re-running is safe.
patch -N src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java \
  < "$SRCROOT/patches/ElasticWriter.java.patch"
patch -N ivy/ivy.xml < "$SRCROOT/patches/ivy.xml.patch"

# build nutch
"$ANT_HOME/bin/ant" runtime || { echo "ant build failed" >&2; exit 1; }

# Install the custom config file and the seed URL list.
cd runtime/local || { echo "Build produced no runtime/local directory" >&2; exit 1; }
cp "$SRCROOT/conf/nutch-site.xml" conf/nutch-site.xml
mkdir -p urls && cp "$SRCROOT/conf/urls.txt" urls

echo "Build complete, now run the crawler (see the README for details)."
47 changes: 47 additions & 0 deletions bin/index-url.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
# index-url.bash - index a file of URLs: run the full nutch
# inject/generate/fetch/parse/updatedb cycle, then push into elasticsearch.
# Usage: index-url.bash <URL file>

# Maximum number of URLs selected per generate cycle.
TOPN="10"

if [ $# -ne 1 ]; then
  echo "Error: insufficient parameters" >&2
  echo "Usage: $0 <URL file>" >&2
  exit 1
fi

if [ -z "${NUTCH_HOME:-}" ]; then
  NUTCH_HOME="/opt/nutch"
  echo "!!! NUTCH_HOME not set, defaulting to $NUTCH_HOME"
fi

if [ -z "${JAVA_HOME:-}" ]; then
  export JAVA_HOME="/etc/alternatives/java_sdk"
fi

# Resolve the URL file to an absolute path BEFORE we cd away from the
# caller's working directory, and fail early if it does not exist.
urlfile=$(readlink -f -- "$1") || { echo "Cannot resolve $1" >&2; exit 1; }
[ -f "$urlfile" ] || { echo "Not a file: $urlfile" >&2; exit 1; }
urldir="urls"
ts=$(date +%Y%m%d%H%M%S)

cd "$NUTCH_HOME/runtime/local" || { echo "Cannot cd to $NUTCH_HOME/runtime/local" >&2; exit 1; }

# Preserve any previous seed directory under a timestamped name.
if [ -d "$urldir" ]; then
  mv "$urldir" "$urldir.bak-$ts"
fi

echo "*** Copying $urlfile to $urldir"
# $urldir must be a DIRECTORY: 'nutch inject urls/' takes a seed directory
# (and build-nutch.bash creates it as one). The previous 'cp file urls'
# left a flat file named 'urls' instead.
mkdir -p "$urldir" && cp "$urlfile" "$urldir/"

echo "*** Running nutch crawl processes..."
echo "*** inject"
bin/nutch inject urls/ || exit 1
echo "*** generate"
bin/nutch generate -topN "$TOPN" || exit 1
echo "*** fetch"
bin/nutch fetch -all || exit 1
echo "*** parse"
bin/nutch parse -all || exit 1
echo "*** updatedb"
bin/nutch updatedb || exit 1

echo "*** indexing into elasticsearch"
bin/nutch elasticindex elasticsearch -all || exit 1
12 changes: 12 additions & 0 deletions bin/setenv.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# setenv.bash - shared settings for the bin/ scripts (sourced, not executed).
# Defines component versions, the archive filenames derived from them, and
# the source/install root directories used by the Vagrant provisioning.

# Component versions.
HBASE_VER="0.90.4"
ANT_VER="1.9.4"
NUTCH_VER="2.2.1"

# Tarball names as published on the Apache mirrors.
HBASE_ARCHIVE="hbase-${HBASE_VER}.tar.gz"
ANT_ARCHIVE="apache-ant-${ANT_VER}-bin.tar.gz"
NUTCH_ARCHIVE="apache-nutch-${NUTCH_VER}-src.tar.gz"
ARCHIVES="${HBASE_ARCHIVE} ${ANT_ARCHIVE} ${NUTCH_ARCHIVE}"

# Where the project sources live (Vagrant shared folder) and where the
# extracted applications are installed.
SRCROOT="/vagrant"
APPROOT="/opt"
59 changes: 59 additions & 0 deletions bin/setup.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# setup.bash:
# - extract ant, nutch and hbase from $SRCROOT/downloads into $APPROOT
# - maintain stable version-independent symlinks (/opt/nutch, /opt/ant, /opt/hbase)

. "$(dirname "$0")/setenv.bash" || { echo "Cannot source setenv.bash" >&2; exit 1; }

DOWNLOADS="$SRCROOT/downloads"

# extract_if_missing <Label> <label> <target dir> <archive filename>
# Unpacks <archive> into $APPROOT unless <target dir> already exists.
# <Label>/<label> are the display name in original/lower-case form,
# matching the original per-component messages. Exits on failure.
extract_if_missing() {
  local label="$1" lc="$2" home="$3" archive="$4"
  if [ ! -d "$home" ]; then
    echo "$label missing"
    if [ -f "$DOWNLOADS/$archive" ]; then
      echo "Extracting $lc archive $archive..."
      cd "$APPROOT" || { echo "Cannot cd to $APPROOT" >&2; exit 1; }
      tar -xzf "$DOWNLOADS/$archive" || { echo "Failed to extract $archive" >&2; exit 1; }
      echo "done."
    else
      echo "Need to download $lc, run wget-deps.bash"
      exit 1
    fi
  fi
}

cd "$APPROOT" || { echo "Cannot cd to $APPROOT" >&2; exit 1; }

# Was "/$APPROOT/..." — APPROOT is already absolute, so that produced a
# double-slash path inconsistent with ANT_HOME/HBASE_HOME below.
NUTCH_HOME="$APPROOT/apache-nutch-$NUTCH_VER"
extract_if_missing "Nutch" "nutch" "$NUTCH_HOME" "$NUTCH_ARCHIVE"
ln -nsf "$NUTCH_HOME" "$APPROOT/nutch"

ANT_HOME="$APPROOT/apache-ant-$ANT_VER"
extract_if_missing "Ant" "ant" "$ANT_HOME" "$ANT_ARCHIVE"
ln -nsf "$ANT_HOME" "$APPROOT/ant"

HBASE_HOME="$APPROOT/hbase-$HBASE_VER"
extract_if_missing "hbase" "hbase" "$HBASE_HOME" "$HBASE_ARCHIVE"
ln -nsf "$HBASE_HOME" "$APPROOT/hbase"

echo "Now:"
echo " 1. Start hbase: /opt/hbase/bin/start-hbase.sh"
echo " 2. Build nutch: /vagrant/bin/build-nutch.bash"
echo " 3. Index a file of URLs: /vagrant/bin/index-url.bash /vagrant/conf/urls.txt"
10 changes: 10 additions & 0 deletions bin/test_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python
# test_environment.py - smoke-test the local elasticsearch instance by
# fetching its root endpoint and pretty-printing the JSON response.
# Exits with a requests exception if elasticsearch is not listening.
import json  # stdlib json replaces the third-party simplejson dependency

import requests

ES_BASE_URL = "http://localhost:9200"

response = requests.get(ES_BASE_URL).json()
# print(...) with a single argument is valid under both Python 2 and 3
# (the original bare 'print' statement was Python-2-only).
print(json.dumps(response, indent=2))

23 changes: 23 additions & 0 deletions bin/wget-deps.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# wget-deps.bash - download the hbase, ant and nutch archives into
# $ROOTDIR/downloads. Versions and archive names come from setenv.bash.
ROOTDIR="/vagrant"

. "$(dirname "$0")/setenv.bash" || { echo "Cannot source setenv.bash" >&2; exit 1; }

# Mirror URLs for each archive. The nutch path now uses $NUTCH_VER — it was
# hard-coded to "2.2.1", which silently broke whenever setenv.bash bumped
# the version while the archive filename moved on.
HBASE_URL="http://archive.apache.org/dist/hbase/hbase-$HBASE_VER/$HBASE_ARCHIVE"
ANT_URL="http://mirror.catn.com/pub/apache/ant/binaries/$ANT_ARCHIVE"
NUTCH_URL="http://mirror.gopotato.co.uk/apache/nutch/$NUTCH_VER/$NUTCH_ARCHIVE"

cd "$ROOTDIR/downloads" || { echo "Cannot cd to $ROOTDIR/downloads" >&2; exit 1; }

# Fetch each archive only if it is not already present.
if [ ! -f "$HBASE_ARCHIVE" ]; then
  wget "$HBASE_URL"
fi

if [ ! -f "$ANT_ARCHIVE" ]; then
  wget "$ANT_URL"
fi

if [ ! -f "$NUTCH_ARCHIVE" ]; then
  wget "$NUTCH_URL"
fi
7 changes: 7 additions & 0 deletions conf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
This is the `conf` directory, containing configuration files.

Files
-----

* `urls.txt` - a sample file containing URLs to be crawled by nutch.
* `nutch-site.xml` - XML configuration for nutch. Copied into nutch at build time.
37 changes: 37 additions & 0 deletions conf/nutch-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.hbase.store.HBaseStore</value>
<description>Default class for storing data</description>
</property>
<property>
<name>http.agent.name</name>
<value>Kusiri</value>
<description/>
</property>
<!-- Elasticsearch properties -->
<property>
<name>elastic.cluster</name>
<value>elasticsearch</value>
<description>The cluster name to discover. Either host and port must be defined
or cluster.</description>
</property>
<property>
<name>elastic.index</name>
<value>nutch</value>
<description>Default index to send documents to.</description>
</property>
<property>
<name>elastic.max.bulk.docs</name>
<value>250</value>
<description>Maximum size of the bulk in number of documents.</description>
</property>
<property>
<name>elastic.max.bulk.size</name>
<value>2500500</value>
<description>Maximum size of the bulk in bytes.</description>
</property>
</configuration>
1 change: 1 addition & 0 deletions conf/urls.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://www.example.com
4 changes: 4 additions & 0 deletions downloads/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*
!README.md
!.gitignore

1 change: 1 addition & 0 deletions downloads/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Directory for holding downloaded archives. Do not commit anything in here. See ../README.md for this project's README file.
11 changes: 11 additions & 0 deletions patches/ElasticWriter.java.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java.orig 2014-06-09 19:48:24.008385551 +0000
+++ src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java 2014-06-09 19:48:40.792385600 +0000
@@ -101,7 +101,7 @@
BulkResponse actionGet = execute.actionGet();
if (actionGet.hasFailures()) {
for (BulkItemResponse item : actionGet) {
- if (item.failed()) {
+ if (item.isFailed()) {
throw new RuntimeException("First failure in bulk: "
+ item.getFailureMessage());
}
21 changes: 21 additions & 0 deletions patches/ivy.xml.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
--- ivy/ivy.xml.orig 2014-06-09 19:49:30.280385707 +0000
+++ ivy/ivy.xml 2014-06-09 20:24:07.973383296 +0000
@@ -32,7 +32,7 @@
</publications>

<dependencies>
- <dependency org="org.elasticsearch" name="elasticsearch" rev="0.19.4"
+ <dependency org="org.elasticsearch" name="elasticsearch" rev="1.1.1"
conf="*->default"/>

<dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0"
@@ -111,9 +111,7 @@
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
-->
<!-- Uncomment this to use HBase as Gora backend. -->
- <!--
<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />
- -->
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
<dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" />

0 comments on commit 2f48885

Please sign in to comment.