Only enable ALVIS if Libxml2 2.6.15 or later is in use.

[idzebra-moved-to-github.git] / doc / examples.xml
diff --git a/doc/examples.xml b/doc/examples.xml

index f2af444..d37463f 100644 (file)
--- a/doc/examples.xml
+++ b/doc/examples.xml
@@ -1,16 +1,17 @@
  <chapter id="examples">
- <!-- $Id: examples.xml,v 1.8 2002-10-10 14:27:18 heikki Exp $ -->
+ <!-- $Id: examples.xml,v 1.24 2006-09-22 12:34:45 adam Exp $ -->
   <title>Example Configurations</title>
  
- <sect1>
+ <sect1 id="examples-overview">
    <title>Overview</title>
  
    <para>
-   <literal>zebraidx</literal> and <literal>zebrasrv</literal> are both
+   <command>zebraidx</command> and 
+   <command>zebrasrv</command> are both
     driven by a master configuration file, which may refer to other
     subsidiary configuration files.  By default, they try to use
     <filename>zebra.cfg</filename> in the working directory as the
-   master file; but this can be changed using the <literal>-t</literal>
+   master file; but this can be changed using the <literal>-c</literal>
     option to specify an alternative master configuration file.
    </para>
    <para>
@@ -19,23 +20,35 @@
  
      <listitem>
       <para>
-      Where to find subsidiary configuration files, including
-      <literal>default.idx</literal>
+      Where to find subsidiary configuration files, including both
+      those that are named explicitly and a few ``magic'' files such
+      as <literal>default.idx</literal>,
        which specifies the default indexing rules.
       </para>
      </listitem>
  
      <listitem>
       <para>
-      What attribute sets to recognise in searches.
+      What record schemas to support.  (Subsidiary files specify how
+      to index the contents of records in those schemas, and what
+      format to use when presenting records in those schemas to client
+      software.)
       </para>
      </listitem>
  
      <listitem>
       <para>
-      Policy details such as what record type to expect, what
-      low-level indexing algorithm to use, how to identify potential
-      duplicate records, etc.
+      What attribute sets to recognise in searches.  (Subsidiary files
+      specify how to interpret the attributes in terms
+      of the indexes that are created on the records.)
+     </para>
+    </listitem>
+
+    <listitem>
+     <para>
+      Policy details such as what type of input format to expect when
+      adding new records, what low-level indexing algorithm to use,
+      how to identify potential duplicate records, etc.
       </para>
      </listitem>
  
@@ -53,30 +66,38 @@
    <para>
     This example shows how Zebra can be used with absolutely minimal
     configuration to index a body of
-   <ulink url="http://www.w3.org/xml/###">XML</ulink>
+   <ulink url="&url.xml;">XML</ulink>
     documents, and search them using
-   <ulink url="http://www.w3.org/xpath/###">XPath</ulink>
+   <ulink url="&url.xpath;">XPath</ulink>
     expressions to specify access points.
    </para>
    <para>
-   Go to the <literal>examples/dinosauricon</literal> subdirectory
+   Go to the <literal>examples/zthes</literal> subdirectory
     of the distribution archive.
-   There you will find a <literal>records</literal> subdirectory,
-   which contains some raw XML data to be added to the database: in
-   this case, as single file, <literal>genera.xml</literal>,
-   which contain information about all the known dinosaur genera as of
-   August 2002.
+   There you will find a <literal>Makefile</literal> that will
+   populate the <literal>records</literal> subdirectory with a file of
+   <ulink url="http://zthes.z3950.org/">Zthes</ulink>
+   records representing a taxonomic hierarchy of dinosaurs.  (The
+   records are generated from the family tree in the file
+   <literal>dino.tree</literal>.)
+   Type <literal>make records/dino.xml</literal>
+   to make the XML data file.
+   (Or you could just type <literal>make dino</literal> to build the XML
+   data file, create the database and populate it with the taxonomic
+   records all in one shot - but then you wouldn't learn anything,
+   would you?  :-)
    </para>
    <para>
-   Now we need to create the Zebra database, which we do with the
-   Zebra indexer, <literal>zebraidx</literal>, which is
+   Now we need to create a Zebra database to hold and index the XML
+   records.  We do this with the
+   Zebra indexer, <command>zebraidx</command>, which is
     driven by the <literal>zebra.cfg</literal> configuration file.
     For our purposes, we don't need any
-   special behaviour - we can use the defaults - so we start with a
-   minimal file that just tells <literal>zebraidx</literal> where to
+   special behaviour - we can use the defaults - so we can start with a
+   minimal file that just tells <command>zebraidx</command> where to
     find the default indexing rules, and how to parse the records:
     <screen>
-    profilePath: .:../../tab:../../../yaz/tab
+    profilePath: .:../../tab
      recordType: grs.sgml
     </screen>
    </para>
@@ -104,98 +125,198 @@
     XPath-based boolean queries and fetch the XML records that satisfy
     them:
     <screen>
-    $ yaz-client tcp:@:9999
+    $ yaz-client @:9999
      Connecting...Ok.
-    Z&gt; find @attr 1=/GENUS/MEANING @and lizard earthquakes
+    Z&gt; find @attr 1=/Zthes/termName Sauroposeidon
      Number of hits: 1
      Z&gt; format xml
      Z&gt; show 1
-    &lt;GENUS name="Sauroposeidon" type="with"&gt;
-     &lt;MEANING&gt;lizard Poseidon &lt;LOW&gt;(Greek god of, among other things, earthquakes)&lt;/LOW&gt;&lt;/MEANING&gt;
-     &lt;SPECIES name="proteles"&gt;
-      &lt;AUTHOR type="vide" name="Franklin" year="2000"&gt;&lt;/AUTHOR&gt;
-      &lt;AUTHOR name="Wedel, Cifelli, Sanders"&gt;&lt;/AUTHOR&gt;
-     &lt;/SPECIES&gt;
-     &lt;PLACE name="Oklahoma"&gt;&lt;/PLACE&gt;
-     &lt;TIME value="Albian"&gt;&lt;/TIME&gt;
-     &lt;LENGTH value="30" q="1"&gt;&lt;/LENGTH&gt;
-     &lt;REMAINS content="rib, cervical vertebrae"&gt;&lt;/REMAINS&gt;
-     &lt;ESSAY&gt;
-      &lt;P&gt; This new &lt;NOMEN name="Brachiosaurus"&gt;&lt;/NOMEN&gt;-like &lt;LINK content="dinosaur"&gt;&lt;/LINK&gt;
-      was perhaps the tallest. With its head raised, it stood 60 feet (nearly
-      20 m) tall. &lt;/P&gt;
-     &lt;/ESSAY&gt;
+    &lt;Zthes&gt;
+     &lt;termId&gt;22&lt;/termId&gt;
+     &lt;termName&gt;Sauroposeidon&lt;/termName&gt;
+     &lt;termType&gt;PT&lt;/termType&gt;
+     &lt;termNote&gt;The tallest known dinosaur (18m)&lt;/termNote&gt;
+     &lt;relation&gt;
+      &lt;relationType&gt;BT&lt;/relationType&gt;
+      &lt;termId&gt;21&lt;/termId&gt;
+      &lt;termName&gt;Brachiosauridae&lt;/termName&gt;
+      &lt;termType&gt;PT&lt;/termType&gt;
+     &lt;/relation&gt;
  
        &lt;idzebra xmlns="http://www.indexdata.dk/zebra/"&gt;
-       &lt;size&gt;593&lt;/size&gt;
-       &lt;localnumber&gt;891&lt;/localnumber&gt;
-       &lt;filename&gt;records/genera.xml&lt;/filename&gt;
+       &lt;size&gt;300&lt;/size&gt;
+       &lt;localnumber&gt;23&lt;/localnumber&gt;
+       &lt;filename&gt;records/dino.xml&lt;/filename&gt;
        &lt;/idzebra&gt;
-    &lt;/GENUS&gt;
+    &lt;/Zthes&gt;
     </screen>
    </para>
    <para>
-   Now wasn't that easy?
+   Now wasn't that nice and easy?
    </para>
   </sect1>
  
+
   <sect1 id="example2">
-  <title>Example 2: Supporting Z39.50 Searches</title>
+  <title>Example 2: Supporting Interoperable Searches</title>
  
    <para>
-   You may have noticed as <literal>zebraidx</literal> was building
-   the database that it issued a warning, which we ignored at the
-   time:
-   <screen>
-    $ zebraidx update records
-    00:45:46-08/10: ../../index/zebraidx(5016) [warn] records/genera.xml:0 Couldn't open GENUS.abs [No such file or directory]
-   </screen>
-   <!-- FIXME ### This needs more text -->
+   The problem with the previous example is that you need to know the
+   structure of the documents in order to find them.  For example,
+   when we wanted to find the record for the taxon
+   <foreignphrase role="taxon">Sauroposeidon</foreignphrase>,
+   we had to formulate a complex XPath 
+   <literal>/Zthes/termName</literal>
+   which embodies the knowledge that taxon names are specified in a
+   <literal>&lt;termName&gt;</literal> element inside the top-level
+   <literal>&lt;Zthes&gt;</literal> element.
+  </para>
+  <para>
+   This is bad not just because it requires a lot of typing, but more
+   significantly because it ties searching semantics to the physical
+   structure of the searched records.  You can't use the same search
+   specification to search two databases if their internal
+   representations are different.  Consider a different taxonomy
+   database in which the records have taxon names specified
+   inside a <literal>&lt;name&gt;</literal> element nested within a
+   <literal>&lt;identification&gt;</literal> element
+   inside a top-level <literal>&lt;taxon&gt;</literal> element: then
+   you'd need to search for them using
+   <literal>1=/taxon/identification/name</literal>.
+  </para>
+  <para>
+   How, then, can we build broadcasting Information Retrieval
+   applications that look for records in many different databases?
+   The Z39.50 protocol offers a powerful and general solution to this:
+   abstract ``access points''.  In the Z39.50 model, an access point
+   is simply a point at which searches can be directed.  Nothing is
+   said about implementation: in a given database, an access point
+   might be implemented as an index, a path into physical records, an
+   algorithm for interrogating relational tables or whatever works.
+   The only important thing is that the semantics of an access
+   point is fixed and well defined.
+  </para>
+  <para>
+   For convenience, access points are gathered into <firstterm>attribute
+   sets</firstterm>.  For example, the BIB-1 attribute set is supposed to
+   contain bibliographic access points such as author, title, subject
+   and ISBN; the GEO attribute set contains access points pertaining
+   to geospatial information (bounding coordinates, stratum, latitude
+   resolution, etc.); the CIMI
+   attribute set contains access points to do with museum collections
+   (provenance, inscriptions, etc.).
+  </para>
+  <para>
+   In practice, the BIB-1 attribute set has tended to be a dumping
+   ground for all sorts of access points, so that, for example, it
+   includes some geospatial access points as well as strictly
+   bibliographic ones.  Nevertheless, this model
+   allows a layer of abstraction over the physical representation of
+   records in databases.
    </para>
+  <para>
+   In the BIB-1 attribute set, a taxon name is probably best
+   interpreted as a title - that is, a phrase that identifies the item
+   in question.  BIB-1 represents title searches by
+   access point 4.  (See 
+   <ulink url="&url.z39.50.bib1.semantics;">The BIB-1 Attribute
+    Set Semantics</ulink>.)
+   So we need to configure our dinosaur database so that searches for
+   BIB-1 access point 4 look in the 
+   <literal>&lt;termName&gt;</literal> element,
+   inside the top-level
+   <literal>&lt;Zthes&gt;</literal> element.
+  </para>
+  <para>
+   This is a two-step process.  First, we need to tell Zebra that we
+   want to support the BIB-1 attribute set.  Then we need to tell it
+   which elements of its record pertain to access point 4.
+   </para>
+   <para>
+   We need to create an <link linkend="abs-file">Abstract Syntax
+   file</link> named after the document element of the records we're
+    working with, plus a <literal>.abs</literal> suffix - in this case,
+    <literal>Zthes.abs</literal> - as follows:
+   </para>
+   <programlistingco>
+    <areaspec>
+     <area id="attset.zthes" coords="2"/>
+     <area id="attset.attset" coords="3"/>
+     <area id="termId" coords="7"/>
+     <area id="termName" coords="8"/>
+    </areaspec>
+    <programlisting>
+attset zthes.att
+attset bib1.att
+xpath enable
+systag sysno none
+
+xelm /Zthes/termId              termId:w
+xelm /Zthes/termName            termName:w,title:w
+xelm /Zthes/termQualifier       termQualifier:w
+xelm /Zthes/termType            termType:w
+xelm /Zthes/termLanguage        termLanguage:w
+xelm /Zthes/termNote            termNote:w
+xelm /Zthes/termCreatedDate     termCreatedDate:w
+xelm /Zthes/termCreatedBy       termCreatedBy:w
+xelm /Zthes/termModifiedDate    termModifiedDate:w
+xelm /Zthes/termModifiedBy      termModifiedBy:w
+    </programlisting>
+   <calloutlist>
+    <callout arearefs="attset.zthes">
+     <para>
+      Declare Thesaurus attribute set. See <filename>zthes.att</filename>.
+     </para>
+    </callout>
+    <callout arearefs="attset.attset">
+     <para>
+      Declare Bib-1 attribute set. See <filename>bib1.att</filename> in
+      Zebra's <filename>tab</filename> directory.
+     </para>
+    </callout>
+    <callout arearefs="termId">
+     <para>
+      This xelm directive selects contents of nodes by XPath expression
+      <literal>/Zthes/termId</literal>. The contents (CDATA) will be
+      word searchable by Zthes attribute termId (value 1001).
+     </para>
+    </callout>
+    <callout arearefs="termName">
+     <para>
+      Make <literal>termName</literal> word searchable by both
+      Zthes attribute termName (1002) and Bib-1 attribute title (4).
+     </para>
+    </callout>
+   </calloutlist>
+  </programlistingco>
+   <para>
+    After re-indexing, we can search the database using the Bib-1
+    title attribute, as follows:
+    <screen>
+Z> form xml
+Z> f @attr 1=4 Eoraptor
+Sent searchRequest.
+Received SearchResponse.
+Search was a success.
+Number of hits: 1, setno 1
+SearchResult-1: Eoraptor(1)
+records returned: 0
+Elapsed: 0.106896
+Z> s
+Sent presentRequest (1+1).
+Records: 1
+[Default]Record type: XML
+&lt;Zthes&gt;
+ &lt;termId&gt;2&lt;/termId&gt;
+ &lt;termName&gt;Eoraptor&lt;/termName&gt;
+ &lt;termType&gt;PT&lt;/termType&gt;
+ &lt;termNote&gt;The most basal known dinosaur&lt;/termNote&gt;
+ ...
+    </screen>
+   </para>
   </sect1>
  </chapter>
  
-<!--
-
-   <listitem>
-    <para>
-     The master configuration file, <literal>zebra.cfg</literal>,
-     which is as short and simple as it can be:
-     <screen>
-       # $Header: /home/cvsroot/idis/doc/examples.xml,v 1.8 2002-10-10 14:27:18 heikki Exp $
-       # Bare-bones master configuration file for Zebra
-       profilePath: .:../../tab:../../../yaz/tab
-     </screen>
-     Apart from the comments, which are ignored, all this specifies is
-     that the server should recognise the attribute set described in
-     the file called
-     <literal>bib1.att</literal>.
-     ### What is an attribute set?
-    </para>
-   </listitem>
-
-   <listitem>
-    <para>
-     The BIB-1 attribute set configuration file,
-     <literal>bib1.att</literal>, which is also as short as possible:
-     <screen>
-       # $Header: /home/cvsroot/idis/doc/examples.xml,v 1.8 2002-10-10 14:27:18 heikki Exp $
-       # Bare-bones BIB-1 attribute set file for Zebra
-       reference Bib-1
-     </screen>
-     Apart from the comments, all this specifies is that reference of
-     the attribute set described by this file is
-     <literal>Bib-1</literal>, a name recognised by the system as
-     referring to a well-known opaque identifier that is transmitted
-     by clients as part of their searches.
-     ### Yeuch!  Surely we can say that better!
-    </para>
-    <para>
-     ### Can't we somehow say this trivial thing in the main
-     configuration file?
-    </para>
-   </listitem>
--->
  
  <!--
         The simplest hello-world example could go like this:
@@ -248,7 +369,7 @@ How to include images:
           </caption>
         </mediaobject>
  
-Whene the three <*object> thingies inside the top-level <mediaobject>
+Where the three <*object> thingies inside the top-level <mediaobject>
  are decreasingly preferred version to include depending on what the
  rendering engine can handle.  I generated the EPS version of the image
  by exporting a line-drawing done in TGIF, then converted that to the