Change IDs for the special retrieval stuff.

[idzebra-moved-to-github.git] / doc / architecture.xml
diff --git a/doc/architecture.xml b/doc/architecture.xml

index 60281e3..b6fe7cf 100644 (file)
--- a/doc/architecture.xml
+++ b/doc/architecture.xml
@@ -1,11 +1,10 @@
   <chapter id="architecture">
-  <!-- $Id: architecture.xml,v 1.8 2006-04-25 12:26:26 marc Exp $ -->
+  <!-- $Id: architecture.xml,v 1.16 2006-11-30 10:29:23 adam Exp $ -->
    <title>Overview of Zebra Architecture</title>
-  
  
-  <sect1 id="architecture-representation">
+  <section id="architecture-representation">
     <title>Local Representation</title>
-
+   
     <para>
      As mentioned earlier, Zebra places few restrictions on the type of
      data that you can index and manage. Generally, whatever the form of
@@ -30,9 +29,9 @@
      "grs" keyword, separated by "." characters.
      -->
     </para>
-  </sect1>
+  </section>
  
-  <sect1 id="architecture-maincomponents">
+  <section id="architecture-maincomponents">
     <title>Main Components</title>
     <para>
      The Zebra system is designed to support a wide range of data management
@@ -52,13 +51,13 @@
      same main components, which are presented here.
     </para>    
     <para>    
-    The virtual Debian package <literal>idzebra1.4</literal>
+    The virtual Debian package <literal>idzebra-2.0</literal>
      installs all the necessary packages to start
      working with Zebra - including utility programs, development libraries,
      documentation and modules. 
    </para>    
     
-   <sect2 id="componentcore">
+   <section id="componentcore">
      <title>Core Zebra Libraries Containing Common Functionality</title>
      <para>
       The core Zebra module is the meat of the <command>zebraidx</command>
@@ -122,17 +121,17 @@
       </variablelist>
       </para>
      <para> 
-     The Debian package <literal>libidzebra1.4</literal> 
+     The Debian package <literal>libidzebra-2.0</literal> 
       contains all run-time libraries for Zebra, the 
       documentation in PDF and HTML is found in 
-     <literal>idzebra1.4-doc</literal>, and
-     <literal>idzebra1.4-common</literal>
+     <literal>idzebra-2.0-doc</literal>, and
+     <literal>idzebra-2.0-common</literal>
       includes common essential Zebra configuration files.
      </para>
-   </sect2>
+   </section>
     
  
-   <sect2 id="componentindexer">
+   <section id="componentindexer">
      <title>Zebra Indexer</title>
      <para>
       The  <command>zebraidx</command>
@@ -142,12 +141,12 @@
       indexes according to the rules defined in the filter modules.
      </para>    
      <para>    
-     The Debian  package <literal>idzebra1.4-utils</literal> contains
+     The Debian  package <literal>idzebra-2.0-utils</literal> contains
       the  <command>zebraidx</command> utility.
      </para>
-   </sect2>
+   </section>
  
-   <sect2 id="componentsearcher">
+   <section id="componentsearcher">
      <title>Zebra Searcher/Retriever</title>
      <para>
       This is the executable which runs the Z39.50/SRU/SRW server and
@@ -155,12 +154,12 @@
       great Information Retrieval server application. 
      </para>    
      <para>    
-     The Debian  package <literal>idzebra1.4-utils</literal> contains
+     The Debian  package <literal>idzebra-2.0-utils</literal> contains
       the  <command>zebrasrv</command> utility.
      </para>
-   </sect2>
+   </section>
  
-   <sect2 id="componentyazserver">
+   <section id="componentyazserver">
      <title>YAZ Server Frontend</title>
      <para>
       The YAZ server frontend is 
@@ -171,28 +170,28 @@
      <para>
       In addition to Z39.50 requests, the YAZ server frontend acts
       as HTTP server, honoring
-      <ulink url="http://www.loc.gov/standards/sru/srw/">SRW</ulink> 
-     SOAP requests, and  
-     <ulink url="http://www.loc.gov/standards/sru/">SRU</ulink> 
-     REST requests. Moreover, it can
+      <ulink url="&url.srw;">SRU SOAP</ulink> 
+     requests, and  
+     <ulink url="&url.sru;">SRU REST</ulink> 
+     requests. Moreover, it can
       translate incoming 
-     <ulink url="http://www.loc.gov/standards/sru/cql/">CQL</ulink>
+     <ulink url="&url.cql;">CQL</ulink>
       queries to
-     <ulink url="http://indexdata.com/yaz/doc/tools.tkl#PQF">PQF</ulink>
+     <ulink url="&url.yaz.pqf;">PQF</ulink>
        queries, if
       correctly configured. 
      </para>
      <para>
-     <ulink url="http://www.indexdata.com/yaz">YAZ</ulink>
+     <ulink url="&url.yaz;">YAZ</ulink>
       is an Open Source  
       toolkit that allows you to develop software using the
       ANSI Z39.50/ISO23950 standard for information retrieval.
       It is packaged in the Debian packages     
       <literal>yaz</literal> and <literal>libyaz</literal>.
      </para>
-   </sect2>
+   </section>
     
-   <sect2 id="componentmodules">
+   <section id="componentmodules">
      <title>Record Models and Filter Modules</title>
      <para>
       The hard work of knowing <emphasis>what</emphasis> to index, 
@@ -204,25 +203,23 @@
       </para>
       <para>
       The virtual Debian package
-     <literal>libidzebra1.4-modules</literal> installs all base filter
+     <literal>libidzebra-2.0-modules</literal> installs all base filter
       modules. 
      </para>
  
-   <sect3 id="componentmodulestext">
+   
+   <section id="componentmodulestext">
      <title>TEXT Record Model and Filter Module</title>
      <para>
        Plain ASCII text filter. TODO: add information here.
-     <!--
-     <literal>text module missing as deb file<literal>
-     -->
      </para>
-   </sect3>
+   </section>
  
-   <sect3 id="componentmodulesgrs">
+   <section id="componentmodulesgrs">
      <title>GRS Record Model and Filter Modules</title>
      <para>
      The GRS filter modules described in 
-    <xref linkend="record-model-grs"/>
+    <xref linkend="grs"/>
      are all based on the Z39.50 specifications, and it is absolutely
      mandatory to have the reference pages on BIB-1 attribute sets on
      you hand when configuring GRS filters. The GRS filters come in
@@ -231,20 +228,12 @@
      to the <filename>*.abs</filename> configuration file suffix.
      </para>
      <para>
-     The <emphasis>grs.danbib</emphasis> filter is developed for 
-      DBC DanBib records.
-      DanBib is the Danish Union Catalogue hosted by DBC
-      (Danish Bibliographic Center). This filter is found in the
-      Debian package
-     <literal>libidzebra1.4-mod-grs-danbib</literal>.
-    </para>
-    <para>
        The <emphasis>grs.marc</emphasis> and 
        <emphasis>grs.marcxml</emphasis> filters are suited to parse and
        index binary and XML versions of traditional library MARC records 
        based on the ISO2709 standard. The Debian package for both
        filters is 
-     <literal>libidzebra1.4-mod-grs-marc</literal>.
+     <literal>libidzebra-2.0-mod-grs-marc</literal>.
      </para>
      <para>
        GRS TCL scriptable filters for extensive user configuration come
@@ -253,26 +242,26 @@
       a general scriptable TCL filter called 
       <emphasis>grs.tcl</emphasis>        
       are both included in the 
-     <literal>libidzebra1.4-mod-grs-regx</literal> Debian package.
+     <literal>libidzebra-2.0-mod-grs-regx</literal> Debian package.
      </para>
      <para>
        A general purpose SGML filter is called
       <emphasis>grs.sgml</emphasis>. This filter is not yet packaged,
       but planned to be in the  
-     <literal>libidzebra1.4-mod-grs-sgml</literal> Debian package.
+     <literal>libidzebra-2.0-mod-grs-sgml</literal> Debian package.
      </para>
      <para>
        The Debian  package 
-      <literal>libidzebra1.4-mod-grs-xml</literal> includes the 
+      <literal>libidzebra-2.0-mod-grs-xml</literal> includes the 
        <emphasis>grs.xml</emphasis> filter which uses <ulink
-      url="http://expat.sourceforge.net/">Expat</ulink> to 
+      url="&url.expat;">Expat</ulink> to 
        parse records in XML and turn them into IDZebra's internal GRS node
        trees. Have also a look at the Alvis XML/XSLT filter described in
       the next section.
      </para>
-   </sect3>
+   </section>
  
-   <sect3 id="componentmodulesalvis">
+   <section id="componentmodulesalvis">
      <title>ALVIS Record Model and Filter Module</title>
       <para>
        The Alvis filter for XML files is an XSLT based input
@@ -309,27 +298,26 @@
        <xref linkend="record-model-alvisxslt"/>.
        </para>
       <para>
-      The Debian package <literal>libidzebra1.4-mod-alvis</literal>
+      The Debian package <literal>libidzebra-2.0-mod-alvis</literal>
        contains the Alvis filter module.
       </para>
-    </sect3>
+    </section>
  
-   <sect3 id="componentmodulessafari">
+    <!--
+   <section id="componentmodulessafari">
      <title>SAFARI Record Model and Filter Module</title>
      <para>
       SAFARI filter module TODO: add information here.
-     <!--
-     <literal>safari module missing as deb file<literal>
-     -->
      </para>
-   </sect3>
+   </section>
+    -->
  
-   </sect2>
+   </section>
  
-  </sect1>
+  </section>
  
  
-  <sect1 id="architecture-workflow">
+  <section id="architecture-workflow">
     <title>Indexing and Retrieval Workflow</title>
  
    <para>
@@ -379,111 +367,160 @@
     </itemizedlist>
  
    </para>
-  </sect1>
+  </section>
  
-
-<!--
-  <sect1 id="architecture-querylanguage">
-   <title>Query Languages</title>
-   
+  <section id="special-retrieval">
+   <title>Retrieval of Zebra internal record data</title>
     <para>
-
-http://www.loc.gov/z3950/agency/document.html
-
-    PQF and BIB-1 stuff to be explained
-    <ulink url="http://www.loc.gov/z3950/agency/defns/bib1.html">
-     http://www.loc.gov/z3950/agency/defns/bib1.html</ulink> 
-
-     <ulink url="http://www.loc.gov/z3950/agency/bib1.html">
-     http://www.loc.gov/z3950/agency/bib1.html</ulink> 
-
-     http://www.loc.gov/z3950/agency/markup/13.html
-    
-  </para>
-  </sect1>
-
-
-These attribute types are recognized regardless of attribute set. Some are recognized for search, others for scan.
-
-Search
-
-Type   Name    Version
-7      Embedded Sort   1.1
-8      Term Set        1.1
-9      Rank weight     1.1
-9      Approx Limit    1.4
-10     Term Ref        1.4
-
-Embedded Sort
-
-The embedded sort is a way to specify sort within a query - thus removing the need to send a Sort Request separately. It is both faster and does not require clients that deal with the Sort Facility.
-
-The value after attribute type 7 is 1=ascending, 2=descending.. The attributes+term (APT) node is separate from the rest and must be @or'ed. The term associated with APT is the level .. 0=primary sort, 1=secondary sort etc.. Example:
-
-Search for water, sort by title (ascending):
-
-  @or @attr 1=1016 water @attr 7=1 @attr 1=4 0
-
-Search for water, sort by title ascending, then date descending:
-
-  @or @or @attr 1=1016 water @attr 7=1 @attr 1=4 0 @attr 7=2 @attr 1=30 1
-
-Term Set
-
-The Term Set feature is a facility that allows a search to store hitting terms in a "pseudo" resultset; thus a search (as usual) + a scan-like facility. Requires a client that can do named result sets since the search generates two result sets. The value for attribute 8 is the name of a result set (string). The terms in term set are returned as SUTRS records.
-
-Seach for u in title, right truncated.. Store result in result set named uset.
-
-  @attr 5=1 @attr 1=4 @attr 8=uset u
-
-The model as one serious flaw.. We don't know the size of term set.
-
-Rank weight
-
-Rank weight is a way to pass a value to a ranking algorithm - so that one APT has one value - while another as a different one.
-
-Search for utah in title with weight 30 as well as any with weight 20.
-
-  @attr 2=102 @or @attr 9=30 @attr 1=4 utah @attr 9=20 utah
-
-Approx Limit
-
-Newer Zebra versions normally estemiates hit count for every APT (leaf) in the query tree. These hit counts are returned as part of the searchResult-1 facility.
-
-By setting a limit for the APT we can make Zebra turn into approximate hit count when a certain hit count limit is reached. A value of zero means exact hit count.
-
-We are intersted in exact hit count for a, but for b we allow estimates for 1000 and higher..
-
-  @and a @attr 9=1000 b
-
-This facility clashes with rank weight! Fortunately this is a Zebra 1.4 thing so we can change this without upsetting anybody!
-
-Term Ref
-
-Zebra supports the searchResult-1 facility.
-
-If attribute 10 is given, that specifies a subqueryId value returned as part of the search result. It is a way for a client to name an APT part of a query.
-
-Scan
-
-Type   Name    Version
-8      Result set narrow       1.3
-9      Approx Limit    1.4
-
-Result set narrow
-
-If attribute 8 is given for scan, the value is the name of a result set. Each hit count in scan is @and'ed with the result set given.
-
-Approx limit
-
-The approx (as for search) is a way to enable approx hit counts for scan hit counts. However, it does NOT appear to work at the moment.
-
-
- AdamDickmeiss - 19 Dec 2005
-
-
--->
-
+    Starting with <literal>Zebra</literal> version 2.0.5, it is
+    possible to use a special element set which has the prefix
+    <literal>zebra::</literal>.
+   </para>
+   <para>
+    Using this element set will, regardless of record type, return
+    Zebra's internal index structure/data for a record.
+    In particular, the regular record filters are not invoked when
+    these are in use.
+    This can in some cases make the retrieval faster than regular
+    retrieval operations (for MARC, XML etc).
+   </para>
+   <table id="special-retrieval-types">
+    <title>Special Retrieval Elements</title>
+    <tgroup cols="3">
+     <thead>
+      <row>
+       <entry>Element Set</entry>
+       <entry>Description</entry>
+       <entry>Syntax</entry>
+      </row>
+     </thead>
+     <tbody>
+      <row>
+       <entry><literal>zebra::meta::sysno</literal></entry>
+       <entry>Get Zebra record system ID</entry>
+       <entry>XML and SUTRS</entry>
+      </row>
+      <row>
+       <entry><literal>zebra::data</literal></entry>
+       <entry>Get raw record</entry>
+       <entry>all</entry>
+      </row>
+      <row>
+       <entry><literal>zebra::meta</literal></entry>
+       <entry>Get Zebra record internal metadata</entry>
+       <entry>XML and SUTRS</entry>
+      </row>
+      <row>
+       <entry><literal>zebra::index</literal></entry>
+       <entry>Get all indexed keys for record</entry>
+       <entry>XML and SUTRS</entry>
+      </row>
+      <row>
+       <entry>
+       <literal>zebra::index::</literal><replaceable>f</replaceable>
+       </entry>
+       <entry>
+       Get indexed keys for field <replaceable>f</replaceable> for record
+       </entry>
+       <entry>XML and SUTRS</entry>
+      </row>
+      <row>
+       <entry>
+       <literal>zebra::index::</literal><replaceable>f</replaceable>:<replaceable>t</replaceable>
+       </entry>
+       <entry>
+       Get indexed keys for field <replaceable>f</replaceable>
+         and type <replaceable>t</replaceable> for record
+       </entry>
+       <entry>XML and SUTRS</entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+   <para>
+    For example, to fetch the raw binary record data stored in the
+    zebra internal storage, or on the filesystem, the following
+    commands can be issued:
+    <screen>
+      Z> f @attr 1=title my
+      Z> format xml
+      Z> elements zebra::data
+      Z> s 1+1
+      Z> format sutrs
+      Z> s 1+1
+      Z> format usmarc
+      Z> s 1+1
+    </screen>
+    </para>
+   <para>
+    The special 
+    <literal>zebra::data</literal> element set name is 
+    defined for any record syntax, but will always fetch  
+    the raw record data in exactly the original form. No record syntax
+    specific transformations will be applied to the raw record data. 
+   </para>
+   <para>
+    Also, Zebra internal metadata about the record can be accessed: 
+    <screen>
+      Z> f @attr 1=title my
+      Z> format xml
+      Z> elements zebra::meta::sysno
+      Z> s 1+1
+    </screen> 
+    displays only the internal record system number, in
+    <literal>XML</literal> record syntax, whereas 
+    <screen>
+      Z> f @attr 1=title my
+      Z> format xml
+      Z> elements zebra::meta
+      Z> s 1+1
+    </screen> 
+    displays all available metadata on the record. These include system
+    number, database name,  indexed filename,  filter used for indexing,
+    score and static ranking information and finally bytesize of record.
+   </para>
+   <para>
+    Sometimes, it is very hard to figure out exactly what has been
+    indexed, how, and in which indexes. Using the indexing stylesheet of
+    the Alvis filter, one can at least see which portion of the record
+    went into which index, but a similar aid does not exist for all
+    other indexing filters.  
+   </para>
+   <para>
+    The special
+    <literal>zebra::index</literal> element set names are provided to
+    access per-record information on indexed fields. For example, the
+    queries 
+    <screen>
+      Z> f @attr 1=title my
+      Z> format sutrs
+      Z> elements zebra::index
+      Z> s 1+1
+    </screen>
+    will display all indexed tokens from all indexed fields of the
+    first record, and it will display in <literal>SUTRS</literal>
+    record syntax, whereas 
+    <screen>
+      Z> f @attr 1=title my
+      Z> format xml
+      Z> elements zebra::index::title
+      Z> s 1+1
+      Z> elements zebra::index::title:p
+      Z> s 1+1
+    </screen> 
+    displays in <literal>XML</literal> record syntax only the content
+      of the zebra string index <literal>title</literal>, or
+      even only the type <literal>p</literal> phrase indexed part of it.
+   </para>
+   <note>
+    <para>
+     Trying to access numeric <literal>Bib-1</literal> use
+     attributes or trying to access non-existent Zebra internal string
+     access points will result in a Diagnostic 25: Specified element set
+     name not valid for specified database.
+    </para>
+   </note>
+  </section>
  
   </chapter>