Tools for indexing LOC to SOLR server
author dennis <dennis@indexdata.dk>
Thu, 5 Aug 2010 12:33:42 +0000 (14:33 +0200)
committer dennis <dennis@indexdata.dk>
Thu, 5 Aug 2010 12:33:42 +0000 (14:33 +0200)
post.jar: Java client for POSTing documents
loc.sh: script that converts gz'ed MARC records to turbomarc, then to pazpar2 (internal) format, then to SOLR document format,
and then POSTs the result to a SOLR server

Ignore temporary files.

solr/.gitignore [new file with mode: 0644]
solr/loc.sh [new file with mode: 0755]
solr/post.jar [new file with mode: 0644]

diff --git a/solr/.gitignore b/solr/.gitignore
new file mode 100644 (file)
index 0000000..41134d6
--- /dev/null
@@ -0,0 +1,3 @@
+index.log
+part*.dat.gz.*
+data
\ No newline at end of file
diff --git a/solr/loc.sh b/solr/loc.sh
new file mode 100755 (executable)
index 0000000..e561896
--- /dev/null
@@ -0,0 +1,63 @@
+#!/bin/bash
+LOG=index.log
+MARCDUMP="yaz-marcdump"
+
+if [ "$SOLR_URL" == "" ] ; then 
+    DEF_HOST=-Durl="http://localhost:8983/solr/update" 
+else
+    DEF_HOST=-Durl="$SOLR_URL"
+fi
+
+if [ -d "./data" ] ; then
+        LOCDATA="./data"
+else
+        LOCDATA=/extra/heikki/locdata
+fi
+
+if [ ! -d "$LOCDATA" ] ; then
+       echo "$LOCDATA not a directory"
+       exit 1
+fi
+
+if [ "$1" == "" ] ; then 
+    FILES="$LOCDATA/part*"
+else 
+    FILES="$*"
+fi 
+#echo $FILES
+
+rm -f $LOG
+
+function convert()
+{
+    FILE=$2
+    echo "zcat $1 > $FILE.mrc" 
+    zcat $1 > $FILE.mrc
+    $MARCDUMP  -f marc8 -t utf-8 -o turbomarc $FILE.mrc > $FILE.xml
+    xsltproc ../test/tmarc.xsl $FILE.xml  > $FILE.pz 
+    xsltproc ../etc/pz2-solr.xsl $FILE.pz > $FILE.solr
+    ls -l $FILE.* >> $LOG
+}
+
+if [ "$TWO_PASS" == "1" ] ; then 
+    for d in ${FILES} ; do
+       date  "+%c converting $d" >>$LOG
+       FILE=`basename $1`
+       convert $d $FILE
+    done
+fi
+
+for d in ${FILES} ; do
+       date  "+%c converting $d" >>$LOG
+       BASE=`basename $d`
+       FILE=$BASE.solr
+       if [ ! -f "$FILE" ] ; then
+           convert $d $BASE
+       fi
+       date  "+%c indexing $d" >>$LOG
+       java $DEF_HOST -jar post.jar $FILE
+       date  "+%c indexing $d ended" >>$LOG
+       #rm tmp.*
+done
+date  "+%c All done" >>$LOG
+exit 0
diff --git a/solr/post.jar b/solr/post.jar
new file mode 100644 (file)
index 0000000..c1aff38
Binary files /dev/null and b/solr/post.jar differ