diff --git a/README b/README
index 8697cde..54fad81 100644
--- a/README
+++ b/README
@@ -25,9 +25,10 @@ INSTRUCTIONS
 
   Ingest
   ------
-  1. Copy ingest/target/wikisearch-ingest-*.jar and ingest/target/dependency/protobuf-java-*.jar to $ACCUMULO_HOME/lib/ext
-  2. Run ingest/bin/ingest.sh with one argument (the name of the directory in HDFS where the wikipedia XML
-     files reside) and this will kick off a MapReduce job to ingest the data into Accumulo
+  1. Copy ingest/target/wikisearch-ingest-*.tar.gz to cluster and untar
+  2. Copy lib/wikisearch-ingest-*.jar and lib/protobuf-java-*.jar to $ACCUMULO_HOME/lib/ext
+  3. Run bin/ingest.sh with one argument: the name of the directory in HDFS where the wikipedia XML
+     files reside, this will start a MapReduce job to ingest the data into Accumulo
  (For parallel ingest, instead run ingest/bin/ingest_parallel.sh)
 
   Query
diff --git a/ingest/bin/ingest.sh b/ingest/bin/ingest.sh
index aff15d3..434e29e 100755
--- a/ingest/bin/ingest.sh
+++ b/ingest/bin/ingest.sh
@@ -38,7 +38,7 @@ LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'`
 #
 # Map/Reduce job
 #
-JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.4.4.jar
+JAR=$SCRIPT_DIR/../lib/${project.build.finalName}.jar
 CONF=$SCRIPT_DIR/../conf/wikipedia.xml
 HDFS_DATA_DIR=$1
 export HADOOP_CLASSPATH=$CLASSPATH
diff --git a/ingest/bin/ingest_parallel.sh b/ingest/bin/ingest_parallel.sh
index 2f77520..e214171 100755
--- a/ingest/bin/ingest_parallel.sh
+++ b/ingest/bin/ingest_parallel.sh
@@ -38,7 +38,7 @@ LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'`
 #
 # Map/Reduce job
 #
-JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.4.4.jar
+JAR=$SCRIPT_DIR/../lib/${project.build.finalName}.jar
 CONF=$SCRIPT_DIR/../conf/wikipedia.xml
 HDFS_DATA_DIR=$1
 export HADOOP_CLASSPATH=$CLASSPATH
diff --git a/ingest/src/assembly/dist.xml b/ingest/src/assembly/dist.xml
index b49ebb3..60535df 100644
--- a/ingest/src/assembly/dist.xml
+++ b/ingest/src/assembly/dist.xml
@@ -36,10 +36,12 @@
   <fileSets>
     <fileSet>
       <directory>${project.basedir}/bin</directory>
+      <filtered>true</filtered>
       <fileMode>0744</fileMode>
     </fileSet>
     <fileSet>
       <directory>${project.basedir}/conf</directory>
+      <filtered>true</filtered>
       <fileMode>0644</fileMode>
     </fileSet>
   </fileSets>