Rolling mods to Marc's new ranking prose. (Check in early, check in

[idzebra-moved-to-github.git] / doc / administration.xml
diff --git a/doc/administration.xml b/doc/administration.xml

index be92e8e..1dd6a22 100644 (file)
--- a/doc/administration.xml
+++ b/doc/administration.xml
@@ -1,9 +1,9 @@
  <chapter id="administration">
- <!-- $Id: administration.xml,v 1.13 2002-12-02 15:10:58 mike Exp $ -->
+ <!-- $Id: administration.xml,v 1.32 2006-05-02 12:23:02 mike Exp $ -->
   <title>Administrating Zebra</title>
   <!-- ### It's a bit daft that this chapter (which describes half of
            the configuration-file formats) is separated from
-          "recordmodel.xml" (which describes the other half) by the
+          "recordmodel-grs.xml" (which describes the other half) by the
            instructions on running zebraidx and zebrasrv.  Some careful
            re-ordering is required here.
   -->
@@ -305,6 +305,19 @@
         Specifies <replaceable>size</replaceable> of internal memory
         to use for the zebraidx program.
         The amount is given in megabytes - default is 4 (4 MB).
+       The more memory, the faster large updates happen, up to about
+       half the free memory available on the computer.
+      </para>
+     </listitem>
+    </varlistentry>
+    <varlistentry>
+     <term>tempfiles: <replaceable>Yes/Auto/No</replaceable></term>
+     <listitem>
+      <para>
+       Tells zebra if it should use temporary files when indexing. The
+       default is Auto, in which case zebra uses temporary files only
+       if it would need more than <replaceable>memMax</replaceable> 
+       megabytes of memory. This should be good for most uses.
        </para>
       </listitem>
      </varlistentry>
@@ -323,23 +336,61 @@
      </varlistentry>
  
      <varlistentry>
-     <term>tagsysno: 0|1</term>
+     <term>passwd: <replaceable>file</replaceable></term>
+     <listitem>
+      <para>
+       Specifies a file with description of user accounts for Zebra.
+       The format is similar to that known to Apache's htpasswd files
+       and UNIX' passwd files. Non-empty lines not beginning with
+       # are considered account lines. There is one account per line.
+       A line consists of fields separated by a single colon character.
+       First field is username, second is password.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term>passwd.c: <replaceable>file</replaceable></term>
+     <listitem>
+      <para>
+       Specifies a file with description of user accounts for Zebra.
+       File format is similar to that used by the passwd directive except
+       that the passwords are encrypted. Use Apache's htpasswd or similar
+       for maintenance.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term>perm.<replaceable>user</replaceable>:
+     <replaceable>permstring</replaceable></term>
       <listitem>
        <para>
-       Species whether Zebra should include system-number data in XML
-       and GRS-1 records returned to clients, represented by the
-       <literal>&lt;localControlNumber&gt;</literal> element in XML
-       and the <literal>(1,14)</literal> tag in GRS-1.
-       The content of these elements is an internally-generated
-       integer uniquely identifying the record within its database.
-       It is included by default but may be turned off, with
-       <literal>tagsysno: 0</literal> for databases in which a local
-       control number is explicitly specified in the input records
-       themselves.
+       Specifies permissions (privileges) for a user that is allowed
+       to access Zebra via the passwd system. There are two kinds
+       of permissions currently: read (r) and write (w). By default
+       users not listed in a permission directive are given the read
+       privilege. To specify permissions for a user with no
+       username, or Z39.50 anonymous style use
+       <literal>anonymous</literal>. The permstring consists of
+       a sequence of characters. Include character <literal>w</literal>
+       for write/update access, <literal>r</literal> for read access.
        </para>
       </listitem>
      </varlistentry>
  
+    <varlistentry>
+      <term>dbaccess <replaceable>accessfile</replaceable></term>
+      <listitem>
+        <para>
+         Names a file which lists database subscriptions for individual users.
+         The access file should consist of lines of the form <literal>username:
+         dbnames</literal>, where dbnames is a list of database names, separated by
+         '+'. No whitespace is allowed in the database list.
+       </para>
+      </listitem>
+    </varlistentry>
+
     </variablelist>
    </para>
    
@@ -402,7 +453,7 @@
    <para>
     
     <screen>
-    profilePath: /usr/local/yaz
+    profilePath: /usr/local/idzebra/tab
      attset: bib1.att
      simple.recordType: text
      simple.database: textbase
@@ -618,7 +669,7 @@
    </para>
    
    <para>
-   (see <xref linkend="data-model"/>
+   (see <xref linkend="record-model-grs"/>
      for details of how the mapping between elements of your records and
      searchable attributes is established).
    </para>
@@ -792,7 +843,6 @@
      
      <screen>
       register: /d1:500M
-     
       shadow: /scratch1:100M /scratch2:200M
      </screen>
      
@@ -870,8 +920,526 @@
    </sect2>
    
   </sect1>
+
+
+ <sect1 id="administration-ranking">
+  <title>Relevance Ranking and Sorting of Result Sets</title>
+
+  <sect2>
+   <title>Overview</title>
+   <para>
+    The default ordering of a result set is left up to the server,
+    which inside Zebra means sorting in ascending document ID order. 
+    This is not always the order humans want to browse the sometimes
+    quite large hit sets. Ranking and sorting come to the rescue.
+   </para>
+
+   <para> 
+    In cases where a good presentation ordering can be computed at
+    indexing time, we can use a fixed <literal>static ranking</literal>
+    scheme, which is provided for the <literal>alvis</literal>
+    indexing filter. This defines a fixed ordering of hit lists,
+    independently of the query issued. 
+   </para>
+
+   <para>
+    There are cases, however, where relevance of hit set documents is
+    highly dependent on the query processed.
+    Simply put, <literal>dynamic relevance ranking</literal> 
+    sorts a set of retrieved 
+    records such
+    that those most likely to be relevant to your request are
+    retrieved first. 
+    Internally, Zebra retrieves all documents that satisfy your
+    query, and re-orders the hit list to arrange them based on
+    a measurement of similarity between your query and the content of
+    each record. 
+   </para>
+
+   <para>
+    Finally, there are situations where hit sets of documents should be
+    <literal>sorted</literal> during query time according to the
+    lexicographical ordering of certain sort indexes created at
+    indexing time.
+   </para>
+  </sect2>
+
+
+ <sect2 id="administration-ranking-static">
+  <title>Static Ranking</title>
+  
+   <para>
+    Zebra internally uses inverted indexes to look up term occurrences
+    in documents. Multiple queries from different indexes can be
+    combined by the binary boolean operations <literal>AND</literal>, 
+    <literal>OR</literal> and/or <literal>NOT</literal> (which
+    is in fact a binary <literal>AND NOT</literal> operation). 
+    To ensure fast query execution
+    speed, all indexes have to be sorted in the same order.
+   </para>
+   <para>
+    The indexes are normally sorted according to document 
+    <literal>ID</literal> in
+    ascending order, and any query which does not invoke a special
+    re-ranking function will therefore retrieve the result set in
+    document 
+    <literal>ID</literal>
+    order.
+   </para>
+   <para>
+    If one defines the 
+    <screen>
+    staticrank: 1 
+    </screen> 
+    directive in the main core Zebra config file, the internal document
+    keys used for ordering are augmented by a preceding integer, which
+    contains the static rank of a given document, and the index lists
+    are ordered 
+    first by ascending static rank,
+    then by ascending document <literal>ID</literal>.
+    Zero
+    is the ``best'' rank, as it occurs at the
+    beginning of the list; higher numbers represent worse scores.
+   </para>
+   <para>
+    The experimental <literal>alvis</literal> filter provides a
+    directive to fetch static rank information out of the indexed XML
+    records, thus making <emphasis>all</emphasis> hit sets ordered
+    after <emphasis>ascending</emphasis> static
+    rank, and for those doc's which have the same static rank, ordered
+    after <emphasis>ascending</emphasis> doc <literal>ID</literal>.
+    See <xref linkend="record-model-alvisxslt"/> for the gory details.
+   </para>
+    </sect2>
+
+
+ <sect2 id="administration-ranking-dynamic">
+  <title>Dynamic Ranking</title>
+   <para>
+    In order to fiddle with the static rank order, it is necessary to
+    invoke additional re-ranking/re-ordering using dynamic
+    ranking or score functions. These functions return positive
+    integer scores, where <emphasis>highest</emphasis> score is 
+    ``best'';
+    hit sets are sorted according to
+    <emphasis>descending</emphasis> 
+    scores (contrary
+    to the index lists which are sorted according to
+    ascending rank number and document ID).
+   </para>
+   <para>
+    Dynamic ranking is enabled by a directive like one of the
+    following in the zebra config file (use only one of these at a time!):
+    <screen> 
+    rank: rank-1        # default TF-IDF like
+    rank: rank-static   # dummy do-nothing
+    rank: zvrank        # configurable, experimental TF-IDF like
+    </screen>
+    Notice that the <literal>rank-1</literal> and
+    <literal>zvrank</literal> do not use the static rank 
+    information in the list keys, and will produce the same ordering
+    with or without static ranking enabled.
+   </para>
+   <para>
+    The dummy <literal>rank-static</literal> reranking/scoring
+    function returns just 
+    <literal>score = max int - staticrank</literal>
+    in order to preserve the static ordering of hit sets that would
+    have been produced had it not been invoked.
+    Obviously, to combine static and dynamic ranking usefully,
+    it is necessary
+    to make a new ranking 
+    function; this is left
+    as an exercise for the reader. 
+   </para>
+
+
+   <para>
+    Dynamic ranking is done at query time rather than
+    indexing time (this is why we
+    call it ``dynamic ranking'' in the first place ...)
+    It is invoked by adding
+    the Bib-1 relation attribute with
+    value ``relevance'' to the PQF query (that is,
+    <literal>@attr&nbsp;2=102</literal>, see also  
+    <ulink url="ftp://ftp.loc.gov/pub/z3950/defs/bib1.txt">
+     The BIB-1 Attribute Set Semantics</ulink>). 
+    To find all articles with the word <literal>Eoraptor</literal> in
+    the title, and present them relevance ranked, issue the PQF query:
+    <screen>
+     @attr 2=102 @attr 1=4 Eoraptor
+    </screen>
+   </para>
+ 
+   <para>
+     The default <literal>rank-1</literal> ranking module implements a 
+     TF-IDF (Term Frequency over Inverse Document Frequency) like algorithm.
+   </para>
+
+   <warning>
+     <para>
+      Notice that <literal>dynamic ranking</literal> is not compatible
+      with <literal>estimated hit sizes</literal>, as all documents in
+      a hit set must be accessed to compute the correct placing in a
+      ranking sorted list. Therefore the use attribute setting
+      <literal>@attr&nbsp;2=102</literal> clashes with 
+      <literal>@attr&nbsp;9=integer</literal>. 
+     </para>
+   </warning>  
+
+   <para>
+     It is possible to apply dynamic ranking on parts of the PQF query
+     alone:
+     <screen>
+     Z> f @and @attr 2=102 @attr 1=1010 Utah @attr 1=1018 Springer
+     </screen>
+     searches for all documents which have the term 'Utah' on the
+     body of text, and which have the term 'Springer' in the publisher
+     field, and sorts them in the order of the relevance ranking made on
+     the body of text index only. 
+   </para>
+    <para>
+     Rank weight is a way to pass a value to a ranking algorithm - so that 
+     one APT has one value - while another has a different one. For
+     example, we can 
+     search for 'utah' in use attribute set 'title' with weight 30, as
+     well as in use attribute set 'any' with weight 20.
+     <screen>
+     Z> f @attr 2=102 @or @attr 9=30 @attr 1=4 utah @attr 9=20 utah
+     </screen>
+    </para>
+    <warning>
+     <para>
+      The rank weight feature is experimental. It may change in future
+      releases of zebra, and is not production mature. 
+     </para>
+    </warning>
+    
+   <para>
+     Notice that <literal>dynamic ranking</literal> can be enabled in
+     server-side CQL query expansion by adding <literal>@attr
+     2=102</literal> to the CQL config file. For example
+     <screen>
+      relationModifier.relevant                = 2=102
+     </screen>
+     invokes dynamic ranking each time a CQL query of the form 
+    <screen>
+     Z> querytype cql
+     Z> f alvis.text =/relevant house
+    </screen>
+     is issued. Dynamic ranking can be enabled on specific CQL indexes
+     by (for example) setting
+     <screen>
+      index.alvis.text                        = 1=text 2=102
+     </screen>
+     which then invokes dynamic ranking each time a CQL query of the form 
+    <screen>
+     Z> querytype cql
+     Z> f alvis.text = house
+    </screen>
+     is issued.
+   </para>
+
+    </sect2>
+
+
+ <sect2 id="administration-ranking-sorting">
+  <title>Sorting</title>
+   <para>
+     Sorting is enabled in the configuration of record indexing. For
+     example, to enable sorting according to the BIB-1
+     <literal>Date/time-added-to-db</literal> field, one could add the line
+     <screen>
+        xelm /*/@created               Date/time-added-to-db:s
+     </screen>
+     to any <literal>.abs</literal> record indexing config file, or
+     similarly, one could add an indexing element of the form
+     <screen><![CDATA[       
+      <z:index name="date-modified" type="s">
+       <xsl:value-of select="some/xpath"/>
+      </z:index>
+      ]]></screen>
+     to any <literal>alvis</literal> indexing rule.
+     </para>
+     <para>
+     To trigger a sorting on a pre-defined sorting index of type
+     <literal>s</literal>, we can issue a sort with BIB-1
+     embedded sort attribute set <literal>7</literal>. 
+     The embedded sort is a way to specify sort within a query - thus
+     removing the need to send a Z39.50 <literal>Sort
+     Request</literal> separately. 
+     </para>
+     <para>
+     The value after attribute type <literal>7</literal> is 
+     <literal>1</literal> (=ascending), or <literal>2</literal>
+     (=descending). 
+     The attributes+term (APT) node is separate from the rest of the
+     PQF query, and must be <literal>@or</literal>'ed. 
+     The term associated with this attribute is the sorting level,
+     where
+     <literal>0</literal> specifies the primary sort key,
+     <literal>1</literal> the secondary sort key, and so on. 
+     </para>
+    <para>For example, a search for water, sort by title (ascending),
+    is expressed by the PQF query
+     <screen>
+     Z> f @or @attr 1=1016 water @attr 7=1 @attr 1=4 0
+     </screen>
+      whereas a search for water, sort by title ascending, 
+     then date descending would be
+     <screen>
+     Z> f @or @or @attr 1=1016 water @attr 7=1 @attr 1=4 0 @attr 7=2 @attr 1=30 1
+     </screen>
+    </para>
+    <para>
+     Notice the fundamental differences between <literal>dynamic
+     ranking</literal> and <literal>sorting</literal>: there can only
+     be one ranking function defined and configured, but there can be
+     specified multiple sorting indexes dynamically at search
+     time. Ranking does not need to use specific indexes, which means,
+     dynamic ranking can be enabled and disabled without
+     re-indexing. On the other hand, sorting indexes need to be
+     defined before indexing.
+     </para>
+
+ </sect2>
+
+
+ </sect1>
+
+ <sect1 id="administration-extended-services">
+  <title>Extended Services: Remote Insert, Update and Delete</title>
+  
+  <para>
+    The extended services are not enabled by default in zebra - due to the
+    fact that they modify the system.
+    In order to allow anybody to update, use
+    <screen>
+    perm.anonymous: rw
+    </screen>
+    in the main zebra configuration file <filename>zebra.cfg</filename>.
+    Or, even better, allow only updates for a particular admin user. For
+    user <literal>admin</literal>, you could use:
+    <screen>
+     perm.admin: rw
+     passwd: passwordfile
+    </screen>
+    And in <filename>passwordfile</filename>, specify users and
+    passwords as colon-separated strings:
+    <screen> 
+     admin:secret
+    </screen> 
+   </para>
+   <para>
+    We can now start a yaz-client admin session and create a database:
+   <screen>
+    <![CDATA[
+     $ yaz-client localhost:9999 -u admin/secret
+     Z> adm-create
+     ]]>
+   </screen>
+    Now that the <literal>Default</literal> database has been created,
+    we can insert an XML file (esdd0006.grs
+    from example/gils/records) and index it:
+   <screen>  
+    <![CDATA[
+     Z> update insert 1 esdd0006.grs
+     ]]>
+   </screen>
+    The 3rd parameter - <literal>1</literal> here -
+      is the opaque record ID from <literal>Ext update</literal>.
+      It is a record ID that <emphasis>we</emphasis> assign to the record
+    in question. If we do not 
+    assign one, the usual rules for match apply (recordId: from zebra.cfg).
+   </para>
+   <para>
+    Actually, we should have a way to specify "no opaque record id" for
+    yaz-client's update command. We'll fix that.
+   </para>
+   <para>
+    The newly inserted record can be searched as usual:
+    <screen>
+    <![CDATA[
+     Z> f utah
+     Sent searchRequest.
+     Received SearchResponse.
+     Search was a success.
+     Number of hits: 1, setno 1
+     SearchResult-1: term=utah cnt=1
+     records returned: 0
+     Elapsed: 0.014179
+     ]]>
+    </screen>
+   </para>
+   <para>
+    Let's delete the beast:
+    <screen>
+    <![CDATA[
+     Z> update delete 1
+     No last record (update ignored)
+     Z> update delete 1 esdd0006.grs
+     Got extended services response
+     Status: done
+     Elapsed: 0.072441
+     Z> f utah
+     Sent searchRequest.
+     Received SearchResponse.
+     Search was a success.
+     Number of hits: 0, setno 2
+     SearchResult-1: term=utah cnt=0
+     records returned: 0
+     Elapsed: 0.013610
+     ]]>
+     </screen>
+    </para>
+    <para>
+    If shadow register is enabled in your
+    <filename>zebra.cfg</filename>,
+    you must run the adm-commit command
+    <screen>
+    <![CDATA[
+     Z> adm-commit
+     ]]>
+    </screen>
+     after each update session in order to write your changes from the
+     shadow to the live register space.
+   </para>
+   <para>
+    Extended services are also available from the YAZ client layer. An
+    example of a YAZ-PHP extended service transaction is given here:
+    <screen>
+    <![CDATA[
+     $record = '<record><title>A fine specimen of a record</title></record>';
+
+     $options = array('action' => 'recordInsert',
+                      'syntax' => 'xml',
+                      'record' => $record,
+                      'databaseName' => 'mydatabase'
+                     );
+
+     yaz_es($yaz, 'update', $options);
+     yaz_es($yaz, 'commit', array());
+     yaz_wait();
+
+     if ($error = yaz_error($yaz))
+       echo "$error";
+     ]]>
+    </screen>  
+    The <literal>action</literal> parameter can be any of 
+    <literal>recordInsert</literal> (will fail if the record already exists),
+    <literal>recordReplace</literal> (will fail if the record does not exist),
+    <literal>recordDelete</literal> (will fail if the record does not
+       exist), and
+    <literal>specialUpdate</literal> (will insert or update the record
+       as needed).
+   </para>
+   <para>
+    If a record is inserted
+    using the action  <literal>recordInsert</literal> 
+    one can specify the optional
+    <literal>recordIdOpaque</literal> parameter, which is a
+    client-supplied, opaque record identifier. This identifier will
+    replace zebra's own automagic identifier generation.  
+   </para>
+   <para>
+    When using the action <literal>recordReplace</literal> or
+    <literal>recordDelete</literal>, one must specify the additional 
+    <literal>recordIdNumber</literal> parameter, which must be an
+    existing Zebra internal system ID number. When retrieving existing
+    records, the ID number is returned in the field
+    <literal>/*/id:idzebra/localnumber</literal> in the namespace
+    <literal>xmlns:id="http://www.indexdata.dk/zebra/"</literal>,
+    where it can be picked up for later record updates or deletes. 
+   </para>
+ </sect1>
+
+
+  <sect1 id="gfs-config">
+   <title>YAZ Frontend Virtual Hosts</title>
+    <para>
+     <command>zebrasrv</command> uses the YAZ server frontend and does
+     support multiple virtual servers behind multiple listening sockets.
+    </para>
+    &zebrasrv-virtual;
+ 
+   <para>
+    See the section "Virtual Hosts" in the YAZ manual:
+    <filename>http://www.indexdata.dk/yaz/doc/server.vhosts.tkl</filename>
+   </para>
+ </sect1>
+
+
+  <sect1 id="administration-cql-to-pqf">
+   <title>Server Side CQL to PQF Query Translation</title>
+   <para>
+    Using the
+    <literal>&lt;cql2rpn&gt;l2rpn.txt&lt;/cql2rpn&gt;</literal>
+      YAZ Frontend Virtual
+    Hosts option, one can configure
+    the YAZ Frontend CQL-to-PQF
+    converter, specifying the interpretation of various 
+    <ulink url="http://www.loc.gov/standards/sru/cql/">CQL</ulink>
+    indexes, relations, etc. in terms of Type-1 query attributes.
+    <!-- The  yaz-client config file -->  
+   </para>
+   <para>
+    For example, using server-side CQL-to-PQF conversion, one might
+    query a zebra server like this:
+    <screen>
+    <![CDATA[
+     yaz-client localhost:9999
+     Z> querytype cql
+     Z> find text=(plant and soil)
+     ]]>
+    </screen>
+     and - if properly configured - even dynamic relevance ranking can
+     be performed using CQL query syntax:
+    <screen>
+    <![CDATA[
+     Z> find text = /relevant (plant and soil)
+     ]]>
+     </screen>
+   </para>
+
+   <para>
+    By the way, the same configuration can be used to 
+    search using client-side CQL-to-PQF conversion:
+    (the only difference is <literal>querytype cql2rpn</literal> 
+    instead of 
+    <literal>querytype cql</literal>, and the call specifying a local
+    conversion file)
+    <screen>
+    <![CDATA[
+     yaz-client -q local/cql2pqf.txt localhost:9999
+     Z> querytype cql2rpn
+     Z> find text=(plant and soil)
+     ]]>
+     </screen>
+   </para>
+
+   <para>
+    Exhaustive information can be found in the
+    section "Specification of CQL to RPN mappings" in the YAZ manual,
+    <ulink url="http://www.indexdata.dk/yaz/doc/tools.tkl#tools.cql.map">
+     http://www.indexdata.dk/yaz/doc/tools.tkl#tools.cql.map</ulink>,
+   and shall therefore not be repeated here.
+   </para> 
+  <!-- 
+  <para>
+    See 
+      <ulink url="http://www.loc.gov/z3950/agency/zing/cql/dc-indexes.html">
+      http://www.loc.gov/z3950/agency/zing/cql/dc-indexes.html</ulink>
+    for the Maintenance Agency's work-in-progress mapping of Dublin Core
+    indexes to Attribute Architecture (util, XD and BIB-2)
+    attributes.
+   </para>
+   -->
+ </sect1>
+
+
   
  </chapter>
+
   <!-- Keep this comment at the end of the file
   Local variables:
   mode: sgml