X-Git-Url: http://git.indexdata.com/?p=irspy-moved-to-github.git;a=blobdiff_plain;f=lib%2FZOOM%2FIRSpy%2FUtils.pm;h=0fc656aed69792025273e3a51f6a9d1f83b3fabe;hp=371a7dedeb294f6301dd9be634aa063f75a01ca7;hb=3d8efa47efc70e2d04f5c8325b2b72f0612f0dc2;hpb=a60fab660bf81bc545945bf48598971200d14e1c

diff --git a/lib/ZOOM/IRSpy/Utils.pm b/lib/ZOOM/IRSpy/Utils.pm
index 371a7de..0fc656a 100644
--- a/lib/ZOOM/IRSpy/Utils.pm
+++ b/lib/ZOOM/IRSpy/Utils.pm
@@ -1,4 +1,3 @@
-# $Id: Utils.pm,v 1.20 2006-12-11 13:58:17 sondberg Exp $
 
 package ZOOM::IRSpy::Utils;
 
@@ -6,21 +5,98 @@ use 5.008;
 use strict;
 use warnings;
 
+use Scalar::Util;
+
 use Exporter 'import';
-our @EXPORT_OK = qw(isodate
+our @EXPORT_OK = qw(utf8param
+		    trimField
+		    isodate
 		    xml_encode 
 		    cql_quote
 		    cql_target
 		    irspy_xpath_context
-		    modify_xml_document);
+		    irspy_make_identifier
+		    irspy_record2identifier
+		    irspy_identifier2target
+		    modify_xml_document
+		    bib1_access_point
+		    render_record
+		    calc_reliability_string
+		    calc_reliability_stats);
 
 use XML::LibXML;
 use XML::LibXML::XPathContext;
+use Encode;
+use Encode qw(is_utf8);
+
 
 our $IRSPY_NS = 'http://indexdata.com/irspy/1.0';
 
+# Under Apache 2/mod_perl 2, the ubiquitous $r is no longer and
+# Apache::Request object, nor even an Apache2::Request, but an
+# Apache2::RequestReq ... which, astonishingly, doesn't have the
+# param() method.  So if we're given one of these things, we need to
+# make an Apache::Request out of, which at least isn't too hard.
+# However *sigh* this may not be a cheap operation, so we keep a cache
+# of already-made Request objects.
+#
+my %_apache2request;
+my %_paramsbyrequest;           # Used for Apache2 only
+sub utf8param {
+    my($r, $key, $value) = @_;
+
+    if ($r->isa('Apache2::RequestRec')) {
+        # Running under Apache2
+        if (defined $_apache2request{$r}) {
+            #warn "using existing Apache2::RequestReq for '$r'";
+            $r = $_apache2request{$r};
+        } else {
+            require Apache2::Request;
+            #warn "making new Apache2::RequestReq for '$r'";
+            $r = $_apache2request{$r} = new Apache2::Request($r);
+        }
+    }
+
+    if (!defined $key) {
+        return map { decode_utf8($_) } $r->param();
+    }
+
+    my $raw = undef;
+    $raw = $_paramsbyrequest{$r}->{$key} if $r->isa('Apache2::Request');
+    $raw = $r->param($key) if !defined $raw;
+
+    if (defined $value) {
+        # Argh!  Simply writing through to the underlying method
+        # param() won't work in Apache2, where param() is readonly.
+        # So we have to keep a hash of additional values, which we
+        # consult (above) before the actual parameters.  Ouch ouch.
+        if ($r->isa('Apache2::Request')) {
+            $_paramsbyrequest{$r}->{$key} = encode_utf8($value);
+        } else {
+            $r->param($key, encode_utf8($value));
+        }
+    }
+
+    return undef if !defined $raw;
+    my $cooked = decode_utf8($raw);
+    warn "converted '$raw' to '", $cooked, "'\n" if $cooked ne $raw;
+    return $cooked;
+}
+
 
 # Utility functions follow, exported for use of web UI
+sub utf8param_apache1 {
+    my($r, $key, $value) = @_;
+    die "utf8param() called with value '$value'" if defined $value;
+
+    my $raw = $r->param($key);
+    return undef if !defined $raw;
+    my $cooked = decode_utf8($raw);
+    warn "converted '$raw' to '", $cooked, "'\n" if $cooked ne $raw;
+    return $cooked;
+}
+
+
 sub isodate {
     my($time) = @_;
 
@@ -29,6 +105,15 @@ sub isodate {
 		   $year+1900, $mon+1, $mday, $hour, $min, $sec);
 }
 
+# strips whitespaces and start and ends of a field
+sub trimField {
+    my $field  = shift;
+
+    $field =~ s/^\s+//;
+    $field =~ s/\s+$//;
+
+    return $field;
+}
 
 # I can't -- just can't, can't, can't -- believe that this function
 # isn't provided by one of the core XML modules.  But the evidence all
@@ -65,19 +150,25 @@ sub xml_encode {
 sub cql_quote {
     my($term) = @_;
 
-    $term =~ s/([""\\])/\\$1/g;
-    $term = qq["$term"] if $term =~ /\s/;
+    $term =~ s/([""\\*?])/\\$1/g;
+    $term = qq["$term"] if $term =~ /[\s""\/]/;
     return $term;
 }
 
 
-# Makes a CQL query that finds a specified target
+# Makes a CQL query that finds a specified target.  Arguments may be
+# either an ID alone, or a (host, port, db) triple.
 sub cql_target {
-    my($host, $port, $db) = @_;
+    my($protocol, $host, $port, $db) = @_;
 
-    return ("host=" . cql_quote($host) . " and " .
-	    "port=" . cql_quote($port) . " and " .
-	    "path=" . cql_quote($db));
+    my $id;
+    if (defined $host) {
+	$id = irspy_make_identifier($protocol, $host, $port, $db);
+    } else {
+	$id = $protocol;
+    }
+
+    return "rec.id=" . cql_quote($id);
 }
 
 
@@ -104,10 +195,19 @@ sub irspy_namespace {
 sub irspy_xpath_context {
     my($record) = @_;
 
-    my $xml = ref $record ? $record->render() : $record;
-    my $parser = new XML::LibXML();
-    my $doc = $parser->parse_string($xml);
-    my $root = $doc->getDocumentElement();
+    if (ref $record && $record->isa("ZOOM::Record")) {
+	$record = $record->render();
+    }
+
+    my $root;
+    if (ref $record) {
+	$root = $record;
+    } else {
+	my $parser = new XML::LibXML();
+	my $doc = $parser->parse_string($record);
+	$root = $doc->getDocumentElement();
+    }
+
     my $xc = XML::LibXML::XPathContext->new($root);
     foreach my $prefix (keys %_namespaces) {
 	$xc->registerNs($prefix, $_namespaces{$prefix});
@@ -116,6 +216,81 @@ sub irspy_xpath_context {
 }
 
 
+# Construct an opaque identifier from its components.  Although it's
+# trivial, this is needed in so many places that it really needs to be
+# factored out.
+#
+# This is the converse of _parse_target_string() in IRSpy.pm, which
+# should be renamed and moved into this package.
+#
+sub irspy_make_identifier {
+    my($protocol, $host, $port, $dbname) = @_;
+
+    die "irspy_make_identifier(" . join(", ", map { "'$_'" } @_).
+	"): wrong number of arguments" if @_ != 4;
+
+    die "irspy_make_identifier(): protocol undefined" if !defined $protocol;
+    die "irspy_make_identifier(): host undefined" if !defined $host;
+    die "irspy_make_identifier(): port undefined" if !defined $port;
+    die "irspy_make_identifier(): dbname undefined" if !defined $dbname;
+
+    return "$protocol:$host:$port/$dbname";
+}
+
+
+# Returns the opaque identifier of an IRSpy record based on the
+# XPathContext'ed DOM object, as returned by irspy_xpath_context().
+# This is doing the same thing as irspy_make_identifier() but from a
+# record rather than a set of parameters.
+#
+sub irspy_record2identifier {
+    my($xc) = @_;
+
+    ### Must be kept the same as is used in ../../../zebra/*.xsl
+    return $xc->find("concat(e:serverInfo/\@protocol, ':',
+			     e:serverInfo/e:host, ':',
+			     e:serverInfo/e:port, '/',
+			     e:serverInfo/e:database)");
+}
+
+
+# Transforms an IRSpy opqaue identifier, as returned from
+# irspy_make_identifier() or irspy_record2identifier(), into a YAZ
+# target-string suitable for feeding to ZOOM.  Before we introduced
+# the protocol element at the start of the identifier string, this was
+# a null transform; now we have to be a bit cleverer.
+#
+sub irspy_identifier2target {
+    my $res = _irspy_identifier2target(@_);
+    #carp "converted ID '@_' to target '$res'";
+    return $res;
+}
+
+sub _irspy_identifier2target {
+    my($id) = @_;
+
+    confess "_irspy_identifier2target(): id is undefined"
+	if !defined $id;
+
+    my($protocol, $target) = ($id =~ /(.*?):(.*)/);
+    if (uc($protocol) eq "Z39.50" || uc($protocol) eq "TCP") {
+	return "tcp:$target";
+    } elsif (uc($protocol) eq "SRU") {
+	return "sru=get,http:$target";
+    } elsif (uc($protocol) eq "SRW") {
+	return "sru=srw,http:$target";
+    }
+
+    warn "_irspy_identifier2target($id): unrecognised protocol '$protocol'";
+    return $target;
+}
+
+
+# Modifies the XML document for which $xc is an XPath context by
+# inserting or replacing the values specified in the hash %$data.  The
+# keys are fieldnames, which are looked up in the register
+# $fieldsByKey to determine, among other things, what their XPath is.
+
 sub modify_xml_document {
     my($xc, $fieldsByKey, $data) = @_;
 
@@ -145,21 +320,22 @@ sub modify_xml_document {
 		# we'll check whether the element is already
 		# canonical, to determine whether our change is a
 		# no-op.
-		my $old = "???";
+		my $old = "";
 		my @children = $node->childNodes();
 		if (@children == 1) {
 		    my $child = $node->firstChild();
 		    if (ref $child && ref $child eq "XML::LibXML::Text") {
 			$old = $child->getData();
-			next if $value eq $old;
+			#print STDERR "child='$child', old=", _renderchars($old), "\n" if $key eq "title";
 		    }
 		}
+		next if $value eq $old;
 
 		$node->removeChildNodes();
 		my $child = new XML::LibXML::Text($value);
 		$node->appendChild($child);
 		push @changes, $ref;
-		#print "Elem $key: '$old' -> '$value' ($xpath)<br/>\n";
+		#print STDERR "Elem $key ($xpath): ", _renderchars($old), " -> '", _renderchars($value), "\n";
 	    } else {
 		warn "unexpected node type $node";
 	    }
@@ -177,6 +353,13 @@ sub modify_xml_document {
 }
 
 
+sub _renderchars {
+    my($text) = @_;
+
+    return "'" . $text . "'", " (", join(" ", map {ord($_)} split //, $text), "), is_utf8=" , is_utf8($text);
+}
+
+
 sub dom_add_node {
     my($xc, $ppath, $selector, $value, @addAfter) = @_;
 
@@ -288,7 +471,342 @@ sub inheritance_tree {
 # This function is made available in xslt using the register_function call
 sub xslt_strcmp {
     my ($arg1, $arg2) = @_;
-    return ($arg1->to_literal()) cmp ($arg2->to_literal());
+    return "$arg1" cmp "$arg2";
+}
+
+
+### It feels like this should be in YAZ, exported via ZOOM-Perl.
+my %_bib1_access_point = (
+	1 =>	"Personal name",
+	2 =>	"Corporate name",
+	3 =>	"Conference name",
+	4 =>	"Title",
+	5 =>	"Title series",
+	6 =>	"Title uniform",
+	7 =>	"ISBN",
+	8 =>	"ISSN",
+	9 =>	"LC card number",
+	10 =>	"BNB card no.",
+	11 =>	"BGF number",
+	12 =>	"Local number",
+	13 =>	"Dewey classification",
+	14 =>	"UDC classification",
+	15 =>	"Bliss classification",
+	16 =>	"LC call number",
+	17 =>	"NLM call number",
+	18 =>	"NAL call number",
+	19 =>	"MOS call number",
+	20 =>	"Local classification",
+	21 =>	"Subject heading",
+	22 =>	"Subject Rameau",
+	23 =>	"BDI index subject",
+	24 =>	"INSPEC subject",
+	25 =>	"MESH subject",
+	26 =>	"PA subject",
+	27 =>	"LC subject heading",
+	28 =>	"RVM subject heading",
+	29 =>	"Local subject index",
+	30 =>	"Date",
+	31 =>	"Date of publication",
+	32 =>	"Date of acquisition",
+	33 =>	"Title key",
+	34 =>	"Title collective",
+	35 =>	"Title parallel",
+	36 =>	"Title cover",
+	37 =>	"Title added title page",
+	38 =>	"Title caption",
+	39 =>	"Title running",
+	40 =>	"Title spine",
+	41 =>	"Title other variant",
+	42 =>	"Title former",
+	43 =>	"Title abbreviated",
+	44 =>	"Title expanded",
+	45 =>	"Subject precis",
+	46 =>	"Subject rswk",
+	47 =>	"Subject subdivision",
+	48 =>	"No. nat'l biblio.",
+	49 =>	"No. legal deposit",
+	50 =>	"No. govt pub.",
+	51 =>	"No. music publisher",
+	52 =>	"Number db",
+	53 =>	"Number local call",
+	54 =>	"Code--language",
+	55 =>	"Code--geographic area",
+	56 =>	"Code--institution",
+	57 =>	"Name and title *",
+	58 =>	"Name geographic",
+	59 =>	"Place publication",
+	60 =>	"CODEN",
+	61 =>	"Microform generation",
+	62 =>	"Abstract",
+	63 =>	"Note",
+	1000 =>	"Author-title",
+	1001 =>	"Record type",
+	1002 =>	"Name",
+	1003 =>	"Author",
+	1004 =>	"Author-name personal",
+	1005 =>	"Author-name corporate",
+	1006 =>	"Author-name conference",
+	1007 =>	"Identifier--standard",
+	1008 =>	"Subject--LC children's",
+	1009 =>	"Subject name -- personal",
+	1010 =>	"Body of text",
+	1011 =>	"Date/time added to db",
+	1012 =>	"Date/time last modified",
+	1013 =>	"Authority/format id",
+	1014 =>	"Concept-text",
+	1015 =>	"Concept-reference",
+	1016 =>	"Any",
+	1017 =>	"Server-choice",
+	1018 =>	"Publisher",
+	1019 =>	"Record-source",
+	1020 =>	"Editor",
+	1021 =>	"Bib-level",
+	1022 =>	"Geographic-class",
+	1023 =>	"Indexed-by",
+	1024 =>	"Map-scale",
+	1025 =>	"Music-key",
+	1026 =>	"Related-periodical",
+	1027 =>	"Report-number",
+	1028 =>	"Stock-number",
+	1030 =>	"Thematic-number",
+	1031 =>	"Material-type",
+	1032 =>	"Doc-id",
+	1033 =>	"Host-item",
+	1034 =>	"Content-type",
+	1035 =>	"Anywhere",
+	1036 =>	"Author-Title-Subject",
+	1032 =>	"Doc-id (semantic definition change)",
+	1037 =>	"SICI",
+	1038 =>	"Abstract-language",
+	1039 =>	"Application-kind",
+	1040 =>	"Classification",
+	1041 =>	"Classification-basic",
+	1042 =>	"Classification-local-record",
+	1043 =>	"Enzyme",
+	1044 =>	"Possessing-institution",
+	1045 =>	"Record-linking",
+	1046 =>	"Record-status",
+	1047 =>	"Treatment",
+	1048 =>	"Control-number-GKD",
+	1049 =>	"Control-number-linking",
+	1050 =>	"Control-number-PND",
+	1051 =>	"Control-number-SWD",
+	1052 =>	"Control-number-ZDB",
+	1053 =>	"Country-publication (country of Publication)",
+	1054 =>	"Date-conference (meeting date)",
+	1055 =>	"Date-record-status",
+	1056 =>	"Dissertation-information",
+	1057 =>	"Meeting-organizer",
+	1058 =>	"Note-availability",
+	1059 =>	"Number-CAS-registry (CAS registry number)",
+	1060 =>	"Number-document (document number)",
+	1061 =>	"Number-local-accounting",
+	1062 =>	"Number-local-acquisition",
+	1063 =>	"Number-local-call-copy-specific",
+	1064 =>	"Number-of-reference (reference count)",
+	1065 =>	"Number-norm",
+	1066 =>	"Number-volume",
+	1067 =>	"Place-conference (meeting location)",
+	1068 =>	"Reference (references and footnotes)",
+	1069 =>	"Referenced-journal (reference work)",
+	1070 =>	"Section-code",
+	1071 =>	"Section-heading",
+	1072 =>	"Subject-GOO",
+	1073 =>	"Subject-name-conference",
+	1074 =>	"Subject-name-corporate",
+	1075 =>	"Subject-genre/form",
+	1076 =>	"Subject-name-geographical",
+	1077 =>	"Subject--chronological",
+	1078 =>	"Subject--title",
+	1079 =>	"Subject--topical",
+	1080 =>	"Subject-uncontrolled",
+	1081 =>	"Terminology-chemical (chemical name)",
+	1082 =>	"Title-translated",
+	1083 =>	"Year-of-beginning",
+	1084 =>	"Year-of-ending",
+	1085 =>	"Subject-AGROVOC",
+	1086 =>	"Subject-COMPASS",
+	1087 =>	"Subject-EPT",
+	1088 =>	"Subject-NAL",
+	1089 =>	"Classification-BCM",
+	1090 =>	"Classification-DB",
+	1091 =>	"Identifier-ISRC",
+	1092 =>	"Identifier-ISMN",
+	1093 =>	"Identifier-ISRN",
+	1094 =>	"Identifier-DOI",
+	1095 =>	"Code-language-original",
+	1096 =>	"Title-later",
+	1097 =>	"DC-Title",
+	1098 =>	"DC-Creator",
+	1099 =>	"DC-Subject",
+	1100 =>	"DC-Description",
+	1101 =>	"DC-Publisher",
+	1102 =>	"DC-Date",
+	1103 =>	"DC-ResourceType",
+	1104 =>	"DC-ResourceIdentifier",
+	1105 =>	"DC-Language",
+	1106 =>	"DC-OtherContributor",
+	1107 =>	"DC-Format",
+	1108 =>	"DC-Source",
+	1109 =>	"DC-Relation",
+	1110 =>	"DC-Coverage",
+	1111 =>	"DC-RightsManagement",
+	1112 =>	"Controlled Subject Index",
+	1113 =>	"Subject Thesaurus",
+	1114 =>	"Index Terms -- Controlled",
+	1115 =>	"Controlled Term",
+	1116 =>	"Spatial Domain",
+	1117 =>	"Bounding Coordinates",
+	1118 =>	"West Bounding Coordinate",
+	1119 =>	"East Bounding Coordinate",
+	1120 =>	"North Bounding Coordinate",
+	1121 =>	"South Bounding Coordinate",
+	1122 =>	"Place",
+	1123 =>	"Place Keyword Thesaurus",
+	1124 =>	"Place Keyword",
+	1125 =>	"Time Period",
+	1126 =>	"Time Period Textual",
+	1127 =>	"Time Period Structured",
+	1128 =>	"Beginning Date",
+	1129 =>	"Ending Date",
+	1130 =>	"Availability",
+	1131 =>	"Distributor",
+	1132 =>	"Distributor Name",
+	1133 =>	"Distributor Organization",
+	1134 =>	"Distributor Street Address",
+	1135 =>	"Distributor City",
+	1136 =>	"Distributor State or Province",
+	1137 =>	"Distributor Zip or Postal Code",
+	1138 =>	"Distributor Country",
+	1139 =>	"Distributor Network Address",
+	1140 =>	"Distributor Hours of Service",
+	1141 =>	"Distributor Telephone",
+	1142 =>	"Distributor Fax",
+	1143 =>	"Resource Description",
+	1144 =>	"Order Process",
+	1145 =>	"Order Information",
+	1146 =>	"Cost",
+	1147 =>	"Cost Information",
+	1148 =>	"Technical Prerequisites",
+	1149 =>	"Available Time Period",
+	1150 =>	"Available Time Textual",
+	1151 =>	"Available Time Structured",
+	1152 =>	"Available Linkage",
+	1153 =>	"Linkage Type",
+	1154 =>	"Linkage",
+	1155 =>	"Sources of Data",
+	1156 =>	"Methodology",
+	1157 =>	"Access Constraints",
+	1158 =>	"General Access Constraints",
+	1159 =>	"Originator Dissemination Control",
+	1160 =>	"Security Classification Control",
+	1161 =>	"Use Constraints",
+	1162 =>	"Point of Contact",
+	1163 =>	"Contact Name",
+	1164 =>	"Contact Organization",
+	1165 =>	"Contact Street Address",
+	1166 =>	"Contact City",
+	1167 =>	"Contact State or Province",
+	1168 =>	"Contact Zip or Postal Code",
+	1169 =>	"Contact Country",
+	1170 =>	"Contact Network Address",
+	1171 =>	"Contact Hours of Service",
+	1172 =>	"Contact Telephone",
+	1173 =>	"Contact Fax",
+	1174 =>	"Supplemental Information",
+	1175 =>	"Purpose",
+	1176 =>	"Agency Program",
+	1177 =>	"Cross Reference",
+	1178 =>	"Cross Reference Title",
+	1179 =>	"Cross Reference Relationship",
+	1180 =>	"Cross Reference Linkage",
+	1181 =>	"Schedule Number",
+	1182 =>	"Original Control Identifier",
+	1183 =>	"Language of Record",
+	1184 =>	"Record Review Date",
+	1185 =>	"Performer",
+	1186 =>	"Performer-Individual",
+	1187 =>	"Performer-Group",
+	1188 =>	"Instrumentation",
+	1189 =>	"Instrumentation-Original",
+	1190 =>	"Instrumentation-Current",
+	1191 =>	"Arrangement",
+	1192 =>	"Arrangement-Original",
+	1193 =>	"Arrangement-Current",
+	1194 =>	"Musical Key-Original",
+	1195 =>	"Musical Key-Current",
+	1196 =>	"Date-Composition",
+	1197 =>	"Date-Recording",
+	1198 =>	"Place-Recording",
+	1199 =>	"Country-Recording",
+	1200 =>	"Number-ISWC",
+	1201 =>	"Number-Matrix",
+	1202 =>	"Number-Plate",
+	1203 =>	"Classification-McColvin",
+	1204 =>	"Duration",
+	1205 =>	"Number-Copies",
+	1206 =>	"Musical Theme",
+	1207 =>	"Instruments - total number",
+	1208 =>	"Instruments - distinct number",
+	1209 =>	"Identifier - URN",
+	1210 =>	"Sears Subject Heading",
+	1211 =>	"OCLC Number",
+	1212 =>	"Composition",
+	1213 =>	"Intellectual level",
+	1214 =>	"EAN",
+	1215 =>	"NLC",
+	1216 =>	"CRCS",
+	1217 =>	"Nationality",
+	1218 =>	"Equinox",
+	1219 =>	"Compression",
+	1220 =>	"Format",
+	1221 =>	"Subject - occupation",
+	1222 =>	"Subject - function",
+	1223 =>	"Edition",
+);
+
+sub bib1_access_point {
+    my($ap) = @_;
+
+    return $_bib1_access_point{$ap} ||
+	"unknown BIB-1 attribute '$ap'";
+}
+
+
+sub render_record {
+    my($rs, $which, $elementSetName) = @_;
+
+    # There is a slight race condition here on the element-set name,
+    # but it shouldn't be a problem as this is (currently) only called
+    # from parts of the program that run single-threaded.
+    my $old = $rs->option(elementSetName => $elementSetName);
+    my $rec = $rs->record($which);
+    $rs->option(elementSetName => $old);
+
+    return $rec->render();
+}
+
+
+sub calc_reliability_string {
+    my($xc) = @_;
+
+    my($nok, $nall, $percent) = calc_reliability_stats($xc);
+    return "[untested]" if $nall == 0;
+    return "$nok/$nall = " . $percent . "%";
+}
+
+
+sub calc_reliability_stats {
+    my($xc) = @_;
+
+    my @allpings = $xc->findnodes("i:status/i:probe");
+    my $nall = @allpings;
+    return (0, 0, 0) if $nall == 0;
+    my @okpings = $xc->findnodes('i:status/i:probe[@ok = "1"]');
+    my $nok = @okpings;
+    my $percent = int(100*$nok/$nall);
+    return ($nok, $nall, $percent);
 }