Update copyright year + FSF address

[idzebra-moved-to-github.git] / doc / querymodel.xml
diff --git a/doc/querymodel.xml b/doc/querymodel.xml

index ee17061..831eeff 100644 (file)
--- a/doc/querymodel.xml
+++ b/doc/querymodel.xml
@@ -1,10 +1,10 @@
   <chapter id="querymodel">
-  <!-- $Id: querymodel.xml,v 1.16 2006-06-25 21:54:03 marc Exp $ -->
+  <!-- $Id: querymodel.xml,v 1.23 2006-07-31 12:26:55 adam Exp $ -->
    <title>Query Model</title>
    
    <sect1 id="querymodel-overview">
     <title>Query Model Overview</title>  
-
+   
     <sect2 id="querymodel-query-languages">
      <title>Query Languages</title>
   
@@ -24,43 +24,42 @@
      <para>
       Since the <literal>type-1 (RPN)</literal> 
       query structure has no direct, useful string
-     representation, every origin application needs to provide some
+     representation, every client application needs to provide some
       form of mapping from a local query notation or representation to it.
-     </para>
-
-
-   <sect3 id="querymodel-query-languages-pqf">
-    <title>Prefix Query Format (PQF)</title>
-
-   <para>
-     Index Data has defined a textual representation in the 
-     <literal>Prefix Query Format</literal>, short
-     <literal>PQF</literal>, which maps 
-      <literal>one-to-one</literal> to binary encoded  
-      <literal>type-1 RPN</literal> query packages.
-      It has been adopted by other
-      parties developing Z39.50 software, and is often referred to as
-     <literal>Prefix Query Notation</literal>, or in short 
-     <literal>PQN</literal>. See       
-     <xref linkend="querymodel-pqf"/> for further explanations and
-     descriptions of Zebra's capabilities.  
      </para>
-   </sect3>    
-
-   <sect3 id="querymodel-query-languages-cql">
-    <title>Common Query Language (CQL)</title>
+    
+    
+    <sect3 id="querymodel-query-languages-pqf">
+     <title>Prefix Query Format (PQF)</title>
+     <para>
+      Index Data has defined a textual representation in the 
+      <ulink url="&url.yaz.pqf;">Prefix Query Format</ulink>, short
+      <emphasis>PQF</emphasis>, which maps 
+      one-to-one to binary encoded  
+      <emphasis>type-1 RPN</emphasis> queries.
+      PQF has been adopted by other
+      parties developing Z39.50 software, and is often referred to as
+      <literal>Prefix Query Notation</literal>, or in short 
+      <literal>PQN</literal>. See       
+      <xref linkend="querymodel-pqf"/> for further explanations and
+      descriptions of Zebra's capabilities.  
+     </para>
+    </sect3>    
+    
+    <sect3 id="querymodel-query-languages-cql">
+     <title>Common Query Language (CQL)</title>
       <para>
-      The query model of the   <literal>type-1 RPN</literal>,
-      expressed in <literal>PQF/PQN</literal> is natively supported. 
-      On the other hand, the default <literal>SRU</literal>
-      webservices <literal>Common Query Language</literal>
-     <ulink url="&url.cql;">CQL</ulink> is not natively supported.
+      The query model of the type-1 RPN,
+      expressed in PQF/PQN is natively supported. 
+      On the other hand, the default SRU
+      web services <emphasis>Common Query Language</emphasis>
+      <ulink url="&url.cql;">CQL</ulink> is not natively supported.
       </para>
       <para>
-     Zebra can be configured to understand and map CQL to PQF. See
-     <xref linkend="querymodel-cql-to-pqf"/>.
-    </para>
-   </sect3>    
+      Zebra can be configured to understand and map CQL to PQF. See
+      <xref linkend="querymodel-cql-to-pqf"/>.
+     </para>
+    </sect3>    
   
     </sect2>
  
@@ -85,7 +84,7 @@
        <literal>explain</literal> operation, which provides the means
        for learning which  
        <emphasis>fields</emphasis> (also called
-      <emphasis>indexes</emphasis> or <emphasis>access points</emphasis>
+      <emphasis>indexes</emphasis> or <emphasis>access points</emphasis>)
        are provided, which default parameter the server uses, which
        retrieve document formats are defined, and which specific parts
        of the general query model are supported.      
@@ -133,7 +132,7 @@
       <para>
        It provides
        the means to investigate the content of specific indexes.
-      Scanning an index returns a handful of terms actually fond in
+      Scanning an index returns a handful of terms actually found in
        the indexes, and in addition the <literal>scan</literal>
        operation returns the number of documents indexed by each term.
        A search client can use this information to propose proper
@@ -150,10 +149,11 @@
    <sect1 id="querymodel-pqf">
     <title>Prefix Query Format syntax and semantics</title>
     <para>
-    The <ulink url="&url.yaz.pqf;">PQF grammer</ulink>
+    The <ulink url="&url.yaz.pqf;">PQF grammar</ulink>
      is documented in the YAZ manual, and shall not be
      repeated here. This textual PQF representation
-    is always during search mapped to the equivalent Zebra internal
+    is not transmitted to Zebra during search, but it is in the
+    client mapped to the equivalent Z39.50 binary 
      query parse tree. 
     </para>
     
@@ -210,7 +210,7 @@
           <td><literal>bib-1</literal></td>
           <td>Standard PQF query language attribute set which defines the
            semantics of Z39.50 searching. In addition, all of the
-          non-use attributes (type 2-9) define the hard-wired 
+          non-use attributes (types 2-11) define the hard-wired 
            Zebra internal query
            processing.</td>
           <td>default</td>
@@ -227,7 +227,7 @@
           <td><literal>idxpath</literal></td>
           <td>Hardwired XPATH like attribute set, only available for
               indexing with the GRS record model</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
          -->
         </tbody>
@@ -251,7 +251,7 @@
      <sect3 id="querymodel-boolean-operators">
       <title>Boolean operators</title>
       <para>
-      A pair of subquery trees, or of atomic queries, is combined
+      A pair of sub query trees, or of atomic queries, is combined
        using the standard boolean operators into new query trees.
        Thus, boolean operators are always internal nodes in the query tree.
       </para>
@@ -281,7 +281,7 @@
           <td>Set complement of two atomic queries hit sets</td>
          </tr>
          <tr><td><literal>@prox</literal></td>
-         <td>binary <literal>PROXIMY</literal> operator</td>
+         <td>binary <literal>PROXIMITY</literal> operator</td>
           <td>Set intersection of two atomic queries hit sets. In 
            addition, the intersection set is purged for all 
            documents which do not satisfy the requested query 
@@ -331,7 +331,7 @@
        <emphasis>retrieval</emphasis>, in the same order and near each
        other as described in the term list.  
        The hit set is a subset of the corresponding
-      PROXIMY query.
+      PROXIMITY query.
        <screen>
         Z> find "information retrieval"
        </screen>
@@ -350,7 +350,7 @@
       </para>
       <para>
        Atomic (APT) queries are always leaf nodes in the PQF query tree. 
-      Unsupplied non-use attributes type 2-9 are either inherited from
+      Unsupplied non-use attributes of types 2-11 are either inherited from
        higher nodes in the query tree, or are set to Zebra's default values.
        See <xref linkend="querymodel-bib1"/> for details. 
       </para>
@@ -415,7 +415,7 @@
        </para>
       
       <para>
-      For example, we migh want to scan the title index, starting with
+      For example, we might want to scan the title index, starting with
        the term 
        <emphasis>debussy</emphasis>, and displaying this and the
        following terms in lexicographic order:
@@ -446,7 +446,9 @@
       
       <para>
        Defining a named result set and re-using it in the next query,
-      using <literal>yaz-client</literal>. 
+      using <literal>yaz-client</literal>. Notice that the client, not
+      the server, assigns the string <literal>'1'</literal> to the
+      named result set. 
        <screen>
         Z> f @attr 1=4 mozart
         ...
@@ -455,11 +457,6 @@
         Z> f @and @set 1 @attr 1=4 amadeus
         ...
         Number of hits: 14, setno 2
-       ...
-       Z> f @attr 1=1016 beethoven
-       ...
-       Number of hits: 26, setno 3
-       ...
        </screen>
       </para>
       
@@ -589,7 +586,7 @@
        Filter the addressing XPath by a predicate working on exact
        string values in
        attributes (in the XML sense) can be done: return all those docs which
-      have the term "english" contained in one of all text subnodes of
+      have the term "english" contained in one of all text sub nodes of
        the subtree defined by the XPath
        <literal>/record/title[@lang='en']</literal>. And similar
        predicate filtering.
@@ -610,7 +607,8 @@
       </para>
       <para>
        Escaping PQF keywords and other non-parseable XPath constructs
-      with <literal>'{ }'</literal> to prevent syntax errors:
+      with <literal>'{ }'</literal> to prevent client-side PQF parsing
+      syntax errors:
        <screen>
         Z> find @attr {1=/root/first[@attr='danish']} content
         Z> find @attr {1=/record/@set} oai
@@ -788,13 +786,35 @@
       <filename>tab/dan1.att</filename>,
       <filename>tab/explain.att</filename>, and
       <filename>tab/gils.att</filename>.
+     </para>
+    <para>
+      For example, a few <literal>Bib-1</literal> use
+      attributes from the <filename>tab/bib1.att</filename> file are:
+      <screen>
+       att 1               Personal-name
+       att 2               Corporate-name
+       att 3               Conference-name
+       att 4               Title
+       ...
+       att 1009            Subject-name-personal
+       att 1010            Body-of-text
+       att 1011            Date/time-added-to-db
+       ...
+       att 1016            Any
+       att 1017            Server-choice
+       att 1018            Publisher
+       ...
+       att 1035            Anywhere
+       att 1036            Author-Title-Subject
+      </screen>
+     </para>
+    <para>
       New attribute sets can be added by adding new 
       <filename>tab/*.att</filename> configuration files, which need to
-     be sourced in the main configuration <filename>zebra.cfg</filename>.
+     be sourced in the main configuration <filename>zebra.cfg</filename>. 
       </para>
-
      <para>
-     In addition, Zebra allows the access of 
+      In addition, Zebra allows the access of 
       <emphasis>internal index names</emphasis> and <emphasis>dynamic
       XPath</emphasis> as use attributes; see
        <xref linkend="querymodel-use-string"/> and 
@@ -997,7 +1017,7 @@
          <tr>
           <td>Any position in field</td>
           <td>3</td>
-         <td>default</td>
+         <td>supported</td>
          </tr>
         </tbody>
       </table>
@@ -1005,9 +1025,9 @@
      <para>
        The position attribute values <literal>first in field (1)</literal>,
        and <literal>first in subfield(2)</literal> are unsupported.
-      Using them does not trigger an error, but silent defaults to 
-      <literal>any position in field (3)</literal>.
-      <!-- It should -->
+      Using them silently maps to 
+      <literal>any position in field (3)</literal>. A proper diagnostic
+      should have been issued.
        </para>
      </sect3>
      
@@ -1352,7 +1372,7 @@
          <tr>
           <td>Complete subfield</td>
           <td>2</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
          <tr>
           <td>Complete field</td>
@@ -1538,9 +1558,21 @@
       </screen>
      </para>
      
+
+    <!--
+    Zebra Extension Term Set Attribute
+    From the manual text, I can not see what is the point with this feature.
+    I think it makes more sense when there are multiple terms in a query, or
+    something...
+    
+    We decided 2006-06-03 to disable this feature, as it is covered by
+    scan within a resultset. Better use resources to upgrade this
+    feature for good performance.
+    -->
+
+    <!--
      <sect3 id="querymodel-zebra-attr-estimation">
       <title>Zebra Extension Term Set Attribute (type 8)</title>
-    </sect3>
      <para>
       The Term Set feature is a facility that allows a search to store
       hitting terms in a "pseudo" resultset; thus a search (as usual) +
@@ -1560,6 +1592,9 @@
       The model has one serious flaw: we don't know the size of term
       set. Experimental. Do not use in production code.
      </warning>
+    </sect3>
+    -->
+
  
      <sect3 id="querymodel-zebra-attr-weight">
       <title>Zebra Extension Rank Weight Attribute (type 9)</title>
@@ -1578,31 +1613,46 @@
      </para>
  
      <sect3 id="querymodel-zebra-attr-limit">
-     <title>Zebra Extension Approximative Limit Attribute (type 9)</title>
+     <title>Zebra Extension Approximative Limit Attribute (type 11)</title>
      </sect3>
      <para>
-     Newer Zebra versions normally estimate hit count for every APT
+     Zebra  computes - unless otherwise configured -
+     the exact hit count for every APT
       (leaf) in the query tree. These hit counts are returned as part of
       the searchResult-1 facility in the binary encoded Z39.50 search
       response packages.
      </para>
      <para>
-     By setting a limit for the APT we can make Zebra turn into
-     approximate hit count when a certain hit count limit is
-     reached. A value of zero means exact hit count.
+     By setting an estimation limit size of the resultset of the APT
+     leaves, Zebra stops processing the result set when the limit
+     length is reached.
+     Hit counts under this limit are still precise, but hit counts over it
+     are estimated using the statistics gathered from the chopped
+     result set.
+    </para>
+    <para>
+     Specifying a limit of <literal>0</literal> results in exact hit counts.
      </para>
      <para>
       For example, we might be interested in exact hit count for a, but
       for b we allow hit count estimates for 1000 and higher. 
       <screen>
-      Z> find @and a @attr 9=1000 b
+      Z> find @and a @attr 11=1000 b
       </screen>
      </para>
      <note>
       The estimated hit count facility makes searches faster, as one
       only needs to process large hit lists partially.
+     It is mostly used in huge databases, where you want to trade
+     exactness of hit counts against speed of execution. 
      </note>
      <warning>
+     Do not use approximative hit count limits
+     in conjunction with relevance ranking, as re-sorting of the
+     result set obviously only works when the entire result set has
+     been processed. 
+    </warning>
+    <warning>
       This facility clashes with rank weight, because there all
       documents in the hit lists need to be examined for scoring and
       re-sorting.
@@ -1711,11 +1761,11 @@
      </warning>
  
      <sect3 id="querymodel-zebra-attr-approx">
-     <title>Zebra Extension Approximative Limit (type 9)</title>
+     <title>Zebra Extension Approximative Limit (type 11)</title>
      </sect3>
      <para>
       The <literal>Zebra Extension Approximative Limit (type
-      9)</literal> is a way to enable approximate
+      11)</literal> is a way to enable approximate
       hit counts for <literal>scan</literal> hit counts, in the same
       way as for <literal>search</literal> hit counts. 
      </para>
@@ -1748,7 +1798,7 @@
       main Zebra configuration file <filename>zebra.cfg</filename>
       directive <literal>attset: idxpath.att</literal> must be enabled.
      </para>
-    <warning>The <literal>idxpath</literal> is depreciated, may not be
+    <warning>The <literal>idxpath</literal> is deprecated, may not be
       supported in future Zebra versions, and should definitely
       not be used in production code.
      </warning>
@@ -1781,31 +1831,31 @@
           <td>XPATH Begin</td>
           <td>1</td>
           <td>_XPATH_BEGIN</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
          <tr>
           <td>XPATH End</td>
           <td>2</td>
           <td>_XPATH_END</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
          <tr>
           <td>XPATH CData</td>
           <td>1016</td>
           <td>_XPATH_CDATA</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
          <tr>
           <td>XPATH Attribute Name</td>
           <td>3</td>
           <td>_XPATH_ATTR_NAME</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
          <tr>
           <td>XPATH Attribute CData</td>
           <td>1015</td>
           <td>_XPATH_ATTR_CDATA</td>
-         <td>depreciated</td>
+         <td>deprecated</td>
          </tr>
         </tbody>
       </table>
@@ -2036,7 +2086,7 @@
       different types of registers or indexes, whose tokenization and
        character normalization rules differ. This reflects the fact that 
       searching fundamentally different tokens like dates, numbers,
-      bitfields and string based text needs different rulesets. 
+      bitfields and string based text needs different rule sets. 
       </para>
  
       <table id="querymodel-zebra-mapping-structure-types"
@@ -2078,7 +2128,7 @@
          <td>urx (@attr 4=104)</td>
          <td>ignored</td>
          <td>URX/URL ('u')</td>
-        <td>Special index for URL web adresses</td>
+        <td>Special index for URL web addresses</td>
         </tr>
         <tr>
          <td>numeric (@attr 4=109)</td>
@@ -2326,6 +2376,8 @@
       The next plus character marks the end of the section.
       Currently Zebra only supports one specifier, the error tolerance,
      which consists of one digit. 
+     <!-- TODO Nice thing, but what does
+     that error tolerance digit *mean*? Maybe an example would be nice? -->
      </para>
  
      <para>