X-Git-Url: http://git.indexdata.com/?p=irspy-moved-to-github.git;a=blobdiff_plain;f=lib%2FZOOM%2FIRSpy%2FUtils.pm;h=0fc656aed69792025273e3a51f6a9d1f83b3fabe;hp=371a7dedeb294f6301dd9be634aa063f75a01ca7;hb=3d8efa47efc70e2d04f5c8325b2b72f0612f0dc2;hpb=a60fab660bf81bc545945bf48598971200d14e1c diff --git a/lib/ZOOM/IRSpy/Utils.pm b/lib/ZOOM/IRSpy/Utils.pm index 371a7de..0fc656a 100644 --- a/lib/ZOOM/IRSpy/Utils.pm +++ b/lib/ZOOM/IRSpy/Utils.pm @@ -1,4 +1,3 @@ -# $Id: Utils.pm,v 1.20 2006-12-11 13:58:17 sondberg Exp $ package ZOOM::IRSpy::Utils; @@ -6,21 +5,98 @@ use 5.008; use strict; use warnings; +use Scalar::Util; + use Exporter 'import'; -our @EXPORT_OK = qw(isodate +our @EXPORT_OK = qw(utf8param + trimField + isodate xml_encode cql_quote cql_target irspy_xpath_context - modify_xml_document); + irspy_make_identifier + irspy_record2identifier + irspy_identifier2target + modify_xml_document + bib1_access_point + render_record + calc_reliability_string + calc_reliability_stats); use XML::LibXML; use XML::LibXML::XPathContext; +use Encode; +use Encode qw(is_utf8); + our $IRSPY_NS = 'http://indexdata.com/irspy/1.0'; +# Under Apache 2/mod_perl 2, the ubiquitous $r is no longer and +# Apache::Request object, nor even an Apache2::Request, but an +# Apache2::RequestReq ... which, astonishingly, doesn't have the +# param() method. So if we're given one of these things, we need to +# make an Apache::Request out of, which at least isn't too hard. +# However *sigh* this may not be a cheap operation, so we keep a cache +# of already-made Request objects. +# +my %_apache2request; +my %_paramsbyrequest; # Used for Apache2 only +sub utf8param { + my($r, $key, $value) = @_; + + if ($r->isa('Apache2::RequestRec')) { + # Running under Apache2 + if (defined $_apache2request{$r}) { + #warn "using existing Apache2::RequestReq for '$r'"; + $r = $_apache2request{$r}; + } else { + require Apache2::Request; + #warn "making new Apache2::RequestReq for '$r'"; + $r = $_apache2request{$r} = new Apache2::Request($r); + } + } + + if (!defined $key) { + return map { decode_utf8($_) } $r->param(); + } + + my $raw = undef; + $raw = $_paramsbyrequest{$r}->{$key} if $r->isa('Apache2::Request'); + $raw = $r->param($key) if !defined $raw; + + if (defined $value) { + # Argh! Simply writing through to the underlying method + # param() won't work in Apache2, where param() is readonly. + # So we have to keep a hash of additional values, which we + # consult (above) before the actual parameters. Ouch ouch. + if ($r->isa('Apache2::Request')) { + $_paramsbyrequest{$r}->{$key} = encode_utf8($value); + } else { + $r->param($key, encode_utf8($value)); + } + } + + return undef if !defined $raw; + my $cooked = decode_utf8($raw); + warn "converted '$raw' to '", $cooked, "'\n" if $cooked ne $raw; + return $cooked; +} + # Utility functions follow, exported for use of web UI +sub utf8param_apache1 { + my($r, $key, $value) = @_; + die "utf8param() called with value '$value'" if defined $value; + + my $raw = $r->param($key); + return undef if !defined $raw; + my $cooked = decode_utf8($raw); + warn "converted '$raw' to '", $cooked, "'\n" if $cooked ne $raw; + return $cooked; +} + + sub isodate { my($time) = @_; @@ -29,6 +105,15 @@ sub isodate { $year+1900, $mon+1, $mday, $hour, $min, $sec); } +# strips whitespaces and start and ends of a field +sub trimField { + my $field = shift; + + $field =~ s/^\s+//; + $field =~ s/\s+$//; + + return $field; +} # I can't -- just can't, can't, can't -- believe that this function # isn't provided by one of the core XML modules. But the evidence all @@ -65,19 +150,25 @@ sub xml_encode { sub cql_quote { my($term) = @_; - $term =~ s/([""\\])/\\$1/g; - $term = qq["$term"] if $term =~ /\s/; + $term =~ s/([""\\*?])/\\$1/g; + $term = qq["$term"] if $term =~ /[\s""\/]/; return $term; } -# Makes a CQL query that finds a specified target +# Makes a CQL query that finds a specified target. Arguments may be +# either an ID alone, or a (host, port, db) triple. sub cql_target { - my($host, $port, $db) = @_; + my($protocol, $host, $port, $db) = @_; - return ("host=" . cql_quote($host) . " and " . - "port=" . cql_quote($port) . " and " . - "path=" . cql_quote($db)); + my $id; + if (defined $host) { + $id = irspy_make_identifier($protocol, $host, $port, $db); + } else { + $id = $protocol; + } + + return "rec.id=" . cql_quote($id); } @@ -104,10 +195,19 @@ sub irspy_namespace { sub irspy_xpath_context { my($record) = @_; - my $xml = ref $record ? $record->render() : $record; - my $parser = new XML::LibXML(); - my $doc = $parser->parse_string($xml); - my $root = $doc->getDocumentElement(); + if (ref $record && $record->isa("ZOOM::Record")) { + $record = $record->render(); + } + + my $root; + if (ref $record) { + $root = $record; + } else { + my $parser = new XML::LibXML(); + my $doc = $parser->parse_string($record); + $root = $doc->getDocumentElement(); + } + my $xc = XML::LibXML::XPathContext->new($root); foreach my $prefix (keys %_namespaces) { $xc->registerNs($prefix, $_namespaces{$prefix}); @@ -116,6 +216,81 @@ sub irspy_xpath_context { } +# Construct an opaque identifier from its components. Although it's +# trivial, this is needed in so many places that it really needs to be +# factored out. +# +# This is the converse of _parse_target_string() in IRSpy.pm, which +# should be renamed and moved into this package. +# +sub irspy_make_identifier { + my($protocol, $host, $port, $dbname) = @_; + + die "irspy_make_identifier(" . join(", ", map { "'$_'" } @_). + "): wrong number of arguments" if @_ != 4; + + die "irspy_make_identifier(): protocol undefined" if !defined $protocol; + die "irspy_make_identifier(): host undefined" if !defined $host; + die "irspy_make_identifier(): port undefined" if !defined $port; + die "irspy_make_identifier(): dbname undefined" if !defined $dbname; + + return "$protocol:$host:$port/$dbname"; +} + + +# Returns the opaque identifier of an IRSpy record based on the +# XPathContext'ed DOM object, as returned by irspy_xpath_context(). +# This is doing the same thing as irspy_make_identifier() but from a +# record rather than a set of parameters. +# +sub irspy_record2identifier { + my($xc) = @_; + + ### Must be kept the same as is used in ../../../zebra/*.xsl + return $xc->find("concat(e:serverInfo/\@protocol, ':', + e:serverInfo/e:host, ':', + e:serverInfo/e:port, '/', + e:serverInfo/e:database)"); +} + + +# Transforms an IRSpy opqaue identifier, as returned from +# irspy_make_identifier() or irspy_record2identifier(), into a YAZ +# target-string suitable for feeding to ZOOM. Before we introduced +# the protocol element at the start of the identifier string, this was +# a null transform; now we have to be a bit cleverer. +# +sub irspy_identifier2target { + my $res = _irspy_identifier2target(@_); + #carp "converted ID '@_' to target '$res'"; + return $res; +} + +sub _irspy_identifier2target { + my($id) = @_; + + confess "_irspy_identifier2target(): id is undefined" + if !defined $id; + + my($protocol, $target) = ($id =~ /(.*?):(.*)/); + if (uc($protocol) eq "Z39.50" || uc($protocol) eq "TCP") { + return "tcp:$target"; + } elsif (uc($protocol) eq "SRU") { + return "sru=get,http:$target"; + } elsif (uc($protocol) eq "SRW") { + return "sru=srw,http:$target"; + } + + warn "_irspy_identifier2target($id): unrecognised protocol '$protocol'"; + return $target; +} + + +# Modifies the XML document for which $xc is an XPath context by +# inserting or replacing the values specified in the hash %$data. The +# keys are fieldnames, which are looked up in the register +# $fieldsByKey to determine, among other things, what their XPath is. + sub modify_xml_document { my($xc, $fieldsByKey, $data) = @_; @@ -145,21 +320,22 @@ sub modify_xml_document { # we'll check whether the element is already # canonical, to determine whether our change is a # no-op. - my $old = "???"; + my $old = ""; my @children = $node->childNodes(); if (@children == 1) { my $child = $node->firstChild(); if (ref $child && ref $child eq "XML::LibXML::Text") { $old = $child->getData(); - next if $value eq $old; + #print STDERR "child='$child', old=", _renderchars($old), "\n" if $key eq "title"; } } + next if $value eq $old; $node->removeChildNodes(); my $child = new XML::LibXML::Text($value); $node->appendChild($child); push @changes, $ref; - #print "Elem $key: '$old' -> '$value' ($xpath)
\n"; + #print STDERR "Elem $key ($xpath): ", _renderchars($old), " -> '", _renderchars($value), "\n"; } else { warn "unexpected node type $node"; } @@ -177,6 +353,13 @@ sub modify_xml_document { } +sub _renderchars { + my($text) = @_; + + return "'" . $text . "'", " (", join(" ", map {ord($_)} split //, $text), "), is_utf8=" , is_utf8($text); +} + + sub dom_add_node { my($xc, $ppath, $selector, $value, @addAfter) = @_; @@ -288,7 +471,342 @@ sub inheritance_tree { # This function is made available in xslt using the register_function call sub xslt_strcmp { my ($arg1, $arg2) = @_; - return ($arg1->to_literal()) cmp ($arg2->to_literal()); + return "$arg1" cmp "$arg2"; +} + + +### It feels like this should be in YAZ, exported via ZOOM-Perl. +my %_bib1_access_point = ( + 1 => "Personal name", + 2 => "Corporate name", + 3 => "Conference name", + 4 => "Title", + 5 => "Title series", + 6 => "Title uniform", + 7 => "ISBN", + 8 => "ISSN", + 9 => "LC card number", + 10 => "BNB card no.", + 11 => "BGF number", + 12 => "Local number", + 13 => "Dewey classification", + 14 => "UDC classification", + 15 => "Bliss classification", + 16 => "LC call number", + 17 => "NLM call number", + 18 => "NAL call number", + 19 => "MOS call number", + 20 => "Local classification", + 21 => "Subject heading", + 22 => "Subject Rameau", + 23 => "BDI index subject", + 24 => "INSPEC subject", + 25 => "MESH subject", + 26 => "PA subject", + 27 => "LC subject heading", + 28 => "RVM subject heading", + 29 => "Local subject index", + 30 => "Date", + 31 => "Date of publication", + 32 => "Date of acquisition", + 33 => "Title key", + 34 => "Title collective", + 35 => "Title parallel", + 36 => "Title cover", + 37 => "Title added title page", + 38 => "Title caption", + 39 => "Title running", + 40 => "Title spine", + 41 => "Title other variant", + 42 => "Title former", + 43 => "Title abbreviated", + 44 => "Title expanded", + 45 => "Subject precis", + 46 => "Subject rswk", + 47 => "Subject subdivision", + 48 => "No. nat'l biblio.", + 49 => "No. legal deposit", + 50 => "No. govt pub.", + 51 => "No. music publisher", + 52 => "Number db", + 53 => "Number local call", + 54 => "Code--language", + 55 => "Code--geographic area", + 56 => "Code--institution", + 57 => "Name and title *", + 58 => "Name geographic", + 59 => "Place publication", + 60 => "CODEN", + 61 => "Microform generation", + 62 => "Abstract", + 63 => "Note", + 1000 => "Author-title", + 1001 => "Record type", + 1002 => "Name", + 1003 => "Author", + 1004 => "Author-name personal", + 1005 => "Author-name corporate", + 1006 => "Author-name conference", + 1007 => "Identifier--standard", + 1008 => "Subject--LC children's", + 1009 => "Subject name -- personal", + 1010 => "Body of text", + 1011 => "Date/time added to db", + 1012 => "Date/time last modified", + 1013 => "Authority/format id", + 1014 => "Concept-text", + 1015 => "Concept-reference", + 1016 => "Any", + 1017 => "Server-choice", + 1018 => "Publisher", + 1019 => "Record-source", + 1020 => "Editor", + 1021 => "Bib-level", + 1022 => "Geographic-class", + 1023 => "Indexed-by", + 1024 => "Map-scale", + 1025 => "Music-key", + 1026 => "Related-periodical", + 1027 => "Report-number", + 1028 => "Stock-number", + 1030 => "Thematic-number", + 1031 => "Material-type", + 1032 => "Doc-id", + 1033 => "Host-item", + 1034 => "Content-type", + 1035 => "Anywhere", + 1036 => "Author-Title-Subject", + 1032 => "Doc-id (semantic definition change)", + 1037 => "SICI", + 1038 => "Abstract-language", + 1039 => "Application-kind", + 1040 => "Classification", + 1041 => "Classification-basic", + 1042 => "Classification-local-record", + 1043 => "Enzyme", + 1044 => "Possessing-institution", + 1045 => "Record-linking", + 1046 => "Record-status", + 1047 => "Treatment", + 1048 => "Control-number-GKD", + 1049 => "Control-number-linking", + 1050 => "Control-number-PND", + 1051 => "Control-number-SWD", + 1052 => "Control-number-ZDB", + 1053 => "Country-publication (country of Publication)", + 1054 => "Date-conference (meeting date)", + 1055 => "Date-record-status", + 1056 => "Dissertation-information", + 1057 => "Meeting-organizer", + 1058 => "Note-availability", + 1059 => "Number-CAS-registry (CAS registry number)", + 1060 => "Number-document (document number)", + 1061 => "Number-local-accounting", + 1062 => "Number-local-acquisition", + 1063 => "Number-local-call-copy-specific", + 1064 => "Number-of-reference (reference count)", + 1065 => "Number-norm", + 1066 => "Number-volume", + 1067 => "Place-conference (meeting location)", + 1068 => "Reference (references and footnotes)", + 1069 => "Referenced-journal (reference work)", + 1070 => "Section-code", + 1071 => "Section-heading", + 1072 => "Subject-GOO", + 1073 => "Subject-name-conference", + 1074 => "Subject-name-corporate", + 1075 => "Subject-genre/form", + 1076 => "Subject-name-geographical", + 1077 => "Subject--chronological", + 1078 => "Subject--title", + 1079 => "Subject--topical", + 1080 => "Subject-uncontrolled", + 1081 => "Terminology-chemical (chemical name)", + 1082 => "Title-translated", + 1083 => "Year-of-beginning", + 1084 => "Year-of-ending", + 1085 => "Subject-AGROVOC", + 1086 => "Subject-COMPASS", + 1087 => "Subject-EPT", + 1088 => "Subject-NAL", + 1089 => "Classification-BCM", + 1090 => "Classification-DB", + 1091 => "Identifier-ISRC", + 1092 => "Identifier-ISMN", + 1093 => "Identifier-ISRN", + 1094 => "Identifier-DOI", + 1095 => "Code-language-original", + 1096 => "Title-later", + 1097 => "DC-Title", + 1098 => "DC-Creator", + 1099 => "DC-Subject", + 1100 => "DC-Description", + 1101 => "DC-Publisher", + 1102 => "DC-Date", + 1103 => "DC-ResourceType", + 1104 => "DC-ResourceIdentifier", + 1105 => "DC-Language", + 1106 => "DC-OtherContributor", + 1107 => "DC-Format", + 1108 => "DC-Source", + 1109 => "DC-Relation", + 1110 => "DC-Coverage", + 1111 => "DC-RightsManagement", + 1112 => "Controlled Subject Index", + 1113 => "Subject Thesaurus", + 1114 => "Index Terms -- Controlled", + 1115 => "Controlled Term", + 1116 => "Spatial Domain", + 1117 => "Bounding Coordinates", + 1118 => "West Bounding Coordinate", + 1119 => "East Bounding Coordinate", + 1120 => "North Bounding Coordinate", + 1121 => "South Bounding Coordinate", + 1122 => "Place", + 1123 => "Place Keyword Thesaurus", + 1124 => "Place Keyword", + 1125 => "Time Period", + 1126 => "Time Period Textual", + 1127 => "Time Period Structured", + 1128 => "Beginning Date", + 1129 => "Ending Date", + 1130 => "Availability", + 1131 => "Distributor", + 1132 => "Distributor Name", + 1133 => "Distributor Organization", + 1134 => "Distributor Street Address", + 1135 => "Distributor City", + 1136 => "Distributor State or Province", + 1137 => "Distributor Zip or Postal Code", + 1138 => "Distributor Country", + 1139 => "Distributor Network Address", + 1140 => "Distributor Hours of Service", + 1141 => "Distributor Telephone", + 1142 => "Distributor Fax", + 1143 => "Resource Description", + 1144 => "Order Process", + 1145 => "Order Information", + 1146 => "Cost", + 1147 => "Cost Information", + 1148 => "Technical Prerequisites", + 1149 => "Available Time Period", + 1150 => "Available Time Textual", + 1151 => "Available Time Structured", + 1152 => "Available Linkage", + 1153 => "Linkage Type", + 1154 => "Linkage", + 1155 => "Sources of Data", + 1156 => "Methodology", + 1157 => "Access Constraints", + 1158 => "General Access Constraints", + 1159 => "Originator Dissemination Control", + 1160 => "Security Classification Control", + 1161 => "Use Constraints", + 1162 => "Point of Contact", + 1163 => "Contact Name", + 1164 => "Contact Organization", + 1165 => "Contact Street Address", + 1166 => "Contact City", + 1167 => "Contact State or Province", + 1168 => "Contact Zip or Postal Code", + 1169 => "Contact Country", + 1170 => "Contact Network Address", + 1171 => "Contact Hours of Service", + 1172 => "Contact Telephone", + 1173 => "Contact Fax", + 1174 => "Supplemental Information", + 1175 => "Purpose", + 1176 => "Agency Program", + 1177 => "Cross Reference", + 1178 => "Cross Reference Title", + 1179 => "Cross Reference Relationship", + 1180 => "Cross Reference Linkage", + 1181 => "Schedule Number", + 1182 => "Original Control Identifier", + 1183 => "Language of Record", + 1184 => "Record Review Date", + 1185 => "Performer", + 1186 => "Performer-Individual", + 1187 => "Performer-Group", + 1188 => "Instrumentation", + 1189 => "Instrumentation-Original", + 1190 => "Instrumentation-Current", + 1191 => "Arrangement", + 1192 => "Arrangement-Original", + 1193 => "Arrangement-Current", + 1194 => "Musical Key-Original", + 1195 => "Musical Key-Current", + 1196 => "Date-Composition", + 1197 => "Date-Recording", + 1198 => "Place-Recording", + 1199 => "Country-Recording", + 1200 => "Number-ISWC", + 1201 => "Number-Matrix", + 1202 => "Number-Plate", + 1203 => "Classification-McColvin", + 1204 => "Duration", + 1205 => "Number-Copies", + 1206 => "Musical Theme", + 1207 => "Instruments - total number", + 1208 => "Instruments - distinct number", + 1209 => "Identifier - URN", + 1210 => "Sears Subject Heading", + 1211 => "OCLC Number", + 1212 => "Composition", + 1213 => "Intellectual level", + 1214 => "EAN", + 1215 => "NLC", + 1216 => "CRCS", + 1217 => "Nationality", + 1218 => "Equinox", + 1219 => "Compression", + 1220 => "Format", + 1221 => "Subject - occupation", + 1222 => "Subject - function", + 1223 => "Edition", +); + +sub bib1_access_point { + my($ap) = @_; + + return $_bib1_access_point{$ap} || + "unknown BIB-1 attribute '$ap'"; +} + + +sub render_record { + my($rs, $which, $elementSetName) = @_; + + # There is a slight race condition here on the element-set name, + # but it shouldn't be a problem as this is (currently) only called + # from parts of the program that run single-threaded. + my $old = $rs->option(elementSetName => $elementSetName); + my $rec = $rs->record($which); + $rs->option(elementSetName => $old); + + return $rec->render(); +} + + +sub calc_reliability_string { + my($xc) = @_; + + my($nok, $nall, $percent) = calc_reliability_stats($xc); + return "[untested]" if $nall == 0; + return "$nok/$nall = " . $percent . "%"; +} + + +sub calc_reliability_stats { + my($xc) = @_; + + my @allpings = $xc->findnodes("i:status/i:probe"); + my $nall = @allpings; + return (0, 0, 0) if $nall == 0; + my @okpings = $xc->findnodes('i:status/i:probe[@ok = "1"]'); + my $nok = @okpings; + my $percent = int(100*$nok/$nall); + return ($nok, $nall, $percent); }