Protect utf8param() from undefined values.

[irspy-moved-to-github.git] / lib / ZOOM / IRSpy / Utils.pm
diff --git a/lib/ZOOM/IRSpy/Utils.pm b/lib/ZOOM/IRSpy/Utils.pm

index ad98b15..2dbbe13 100644 (file)
--- a/lib/ZOOM/IRSpy/Utils.pm
+++ b/lib/ZOOM/IRSpy/Utils.pm
@@ -1,4 +1,4 @@
-# $Id: Utils.pm,v 1.21 2006-12-18 15:34:54 mike Exp $
+# $Id: Utils.pm,v 1.33 2007-06-27 10:44:57 mike Exp $
  
  package ZOOM::IRSpy::Utils;
  
@@ -7,21 +7,41 @@ use strict;
  use warnings;
  
  use Exporter 'import';
-our @EXPORT_OK = qw(isodate
+our @EXPORT_OK = qw(utf8param
+                   isodate
                     xml_encode 
                     cql_quote
                     cql_target
                     irspy_xpath_context
+                   irspy_make_identifier
+                   irspy_record2identifier
+                   irspy_identifier2target
                     modify_xml_document
-                   bib1_access_point);
+                   bib1_access_point
+                   render_record);
  
  use XML::LibXML;
  use XML::LibXML::XPathContext;
+use Encode;
+use Encode qw(is_utf8);
+
  
  our $IRSPY_NS = 'http://indexdata.com/irspy/1.0';
  
  
  # Utility functions follow, exported for use of web UI
+sub utf8param {
+    my($r, $key, $value) = @_;
+    die "utf8param() called with value '$value'" if defined $value;
+
+    my $raw = $r->param($key);
+    return undef if !defined $raw;
+    my $cooked = decode_utf8($raw);
+    warn "converted '$raw' to '", $cooked, "'\n" if $cooked ne $raw;
+    return $cooked;
+}
+
+
  sub isodate {
      my($time) = @_;
  
@@ -66,19 +86,25 @@ sub xml_encode {
  sub cql_quote {
      my($term) = @_;
  
-    $term =~ s/([""\\])/\\$1/g;
-    $term = qq["$term"] if $term =~ /\s/;
+    $term =~ s/([""\\*?])/\\$1/g;
+    $term = qq["$term"] if $term =~ /[\s""\/]/;
      return $term;
  }
  
  
-# Makes a CQL query that finds a specified target
+# Makes a CQL query that finds a specified target.  Arguments may be
+# either an ID alone, or a (protocol, host, port, db) quadruple.
  sub cql_target {
-    my($host, $port, $db) = @_;
+    my($protocol, $host, $port, $db) = @_;
  
-    return ("host=" . cql_quote($host) . " and " .
-           "port=" . cql_quote($port) . " and " .
-           "path=" . cql_quote($db));
+    my $id;
+    if (defined $host) {
+       $id = irspy_make_identifier($protocol, $host, $port, $db);
+    } else {
+       $id = $protocol;
+    }
+
+    return "rec.id=" . cql_quote($id);
  }
  
  
@@ -105,10 +131,19 @@ sub irspy_namespace {
  sub irspy_xpath_context {
      my($record) = @_;
  
-    my $xml = ref $record ? $record->render() : $record;
-    my $parser = new XML::LibXML();
-    my $doc = $parser->parse_string($xml);
-    my $root = $doc->getDocumentElement();
+    if (ref $record && $record->isa("ZOOM::Record")) {
+       $record = $record->render();
+    }
+
+    my $root;
+    if (ref $record) {
+       $root = $record;
+    } else {
+       my $parser = new XML::LibXML();
+       my $doc = $parser->parse_string($record);
+       $root = $doc->getDocumentElement();
+    }
+
      my $xc = XML::LibXML::XPathContext->new($root);
      foreach my $prefix (keys %_namespaces) {
         $xc->registerNs($prefix, $_namespaces{$prefix});
@@ -117,6 +152,73 @@ sub irspy_xpath_context {
  }
  
  
+# Construct an opaque identifier from its components.  Although it's
+# trivial, this is needed in so many places that it really needs to be
+# factored out.
+#
+# This is the converse of _parse_target_string() in IRSpy.pm, which
+# should be renamed and moved into this package.
+#
+sub irspy_make_identifier {
+    my($protocol, $host, $port, $dbname) = @_;
+
+    die "irspy_make_identifier(" . join(", ", map { "'$_'" } @_).
+       "): wrong number of arguments" if @_ != 4;
+
+    die "irspy_make_identifier(): protocol undefined" if !defined $protocol;
+    die "irspy_make_identifier(): host undefined" if !defined $host;
+    die "irspy_make_identifier(): port undefined" if !defined $port;
+    die "irspy_make_identifier(): dbname undefined" if !defined $dbname;
+
+    return "$protocol:$host:$port/$dbname";
+}
+
+
+# Returns the opaque identifier of an IRSpy record based on the
+# XPathContext'ed DOM object, as returned by irspy_xpath_context().
+# This is doing the same thing as irspy_make_identifier() but from a
+# record rather than a set of parameters.
+#
+sub irspy_record2identifier {
+    my($xc) = @_;
+
+    ### Must be kept the same as is used in ../../../zebra/*.xsl
+    return $xc->find("concat(e:serverInfo/\@protocol, ':',
+                            e:serverInfo/e:host, ':',
+                            e:serverInfo/e:port, '/',
+                            e:serverInfo/e:database)");
+}
+
+
+# Transforms an IRSpy opaque identifier, as returned from
+# irspy_make_identifier() or irspy_record2identifier(), into a YAZ
+# target-string suitable for feeding to ZOOM.  Before we introduced
+# the protocol element at the start of the identifier string, this was
+# a null transform; now we have to be a bit cleverer.
+#
+sub irspy_identifier2target {
+    my $res = _irspy_identifier2target(@_);
+    #carp "converted ID '@_' to target '$res'";
+    return $res;
+}
+
+sub _irspy_identifier2target {
+    my($id) = @_;
+
+    my($protocol, $target) = ($id =~ /(.*?):(.*)/);
+    if (uc($protocol) eq "Z39.50") {
+       return "tcp:$target";
+    } elsif (uc($protocol) eq "SRU") {
+       return "sru=get,http:$target";
+    } elsif (uc($protocol) eq "SRW") {
+       return "sru=srw,http:$target";
+    }
+
+    warn "unrecognised protocol '$protocol' in ID $id";
+    return $target;
+}
+
+
  sub modify_xml_document {
      my($xc, $fieldsByKey, $data) = @_;
  
@@ -146,21 +248,23 @@ sub modify_xml_document {
                 # we'll check whether the element is already
                 # canonical, to determine whether our change is a
                 # no-op.
-               my $old = "???";
+               my $old = "";
                 my @children = $node->childNodes();
                 if (@children == 1) {
                     my $child = $node->firstChild();
                     if (ref $child && ref $child eq "XML::LibXML::Text") {
                         $old = $child->getData();
-                       next if $value eq $old;
+                       print STDERR "child='$child', old=", _renderchars($old), "\n"
+                           if $key eq "title";
                     }
                 }
+               next if $value eq $old;
  
                 $node->removeChildNodes();
                 my $child = new XML::LibXML::Text($value);
                 $node->appendChild($child);
                 push @changes, $ref;
-               #print "Elem $key: '$old' -> '$value' ($xpath)<br/>\n";
+               print STDERR "Elem $key ($xpath): ", _renderchars($old), " -> '", _renderchars($value), "\n";
             } else {
                 warn "unexpected node type $node";
             }
@@ -178,6 +282,13 @@ sub modify_xml_document {
  }
  
  
+sub _renderchars {
+    my($text) = @_;
+
+    return "'" . $text . "'", " (", join(" ", map {ord($_)} split //, $text), "), is_utf8=" , is_utf8($text);
+}
+
+
  sub dom_add_node {
      my($xc, $ppath, $selector, $value, @addAfter) = @_;
  
@@ -592,4 +703,18 @@ sub bib1_access_point {
  }
  
  
+sub render_record {
+    my($rs, $which, $elementSetName) = @_;
+
+    # There is a slight race condition here on the element-set name,
+    # but it shouldn't be a problem as this is (currently) only called
+    # from parts of the program that run single-threaded.
+    my $old = $rs->option(elementSetName => $elementSetName);
+    my $rec = $rs->record($which);
+    $rs->option(elementSetName => $old);
+
+    return $rec->render();
+}
+
+
  1;