perl/lib/IDZebra/Filter.pm

   1 #!/usr/bin/perl
   2 # ============================================================================
   3 # Zebra perl API header
   4 # =============================================================================
   5 use strict;
   6 use Carp;
   7 # ============================================================================
   8 package IDZebra::Filter;
   9 use IDZebra;
  10 use IDZebra::Data1;
  11
  12 BEGIN {
  13     IDZebra::init(); # ??? Do we need that at all (this is jus nmem init...)
  14 }
  15
  16 1;
  17 # -----------------------------------------------------------------------------
  18 # Class constructor
  19 # -----------------------------------------------------------------------------
  20 sub new {
  21     my ($proto,$context) = @_;
  22     my $class = ref($proto) || $proto;
  23     my $self = {};
  24     $self->{context} = $context;
  25     bless ($self, $class);
  26     return ($self);
  27 }
  28
  29 # -----------------------------------------------------------------------------
  30 # Callbacks
  31 # -----------------------------------------------------------------------------
  32 sub _process {
  33     my ($self) = @_;
  34     # This is ugly... could be passed as parameters... but didn't work.
  35     # I don't know why...
  36     my $dh  = IDZebra::grs_perl_get_dh($self->{context});
  37     my $mem = IDZebra::grs_perl_get_mem($self->{context});
  38     my $d1  = IDZebra::Data1->get($dh,$mem);
  39
  40     my $rootnode = $self->process($d1);
  41     IDZebra::grs_perl_set_res($self->{context},$rootnode);
  42     return (0);
  43 }
  44
  45 sub _store_buff {
  46     my ($self, $buff) = @_;
  47     $self->{_buff} = $buff;
  48 }
  49
  50 # -----------------------------------------------------------------------------
  51 # API Template - These methods should be overriden by the implementing class.
  52 # -----------------------------------------------------------------------------
  53 sub init {
  54     # This one is called once, when the module is loaded. Not in
  55     # object context yet!!!
  56 }
  57
  58 sub process {
  59     my ($self, $d1) = @_;
  60     # Just going to return a root node.
  61     return ($d1->mk_root('empty'));
  62 }
  63
  64 # -----------------------------------------------------------------------------
  65 # Testing
  66 # -----------------------------------------------------------------------------
  67 sub test {
  68     my ($proto, $file, %args) = @_;
  69
  70 #    print "Proto:$proto\n";
  71
  72     my $class = ref($proto) || $proto;
  73     my $self = {};
  74     bless ($self, $class);
  75
  76     my $th;
  77     open ($th, $file) || croak ("Cannot open $file");
  78
  79     $self->{testh} = $th;
  80
  81     my $m = IDZebra::nmem_create();
  82     my $d1=IDZebra::Data1->new($m,$IDZebra::DATA1_FLAG_XML);
  83     if ($args{tabPath}) { $d1->tabpath($args{tabPath}); }
  84     if ($args{tabRoot}) { $d1->tabroot($args{tabRoot}); }
  85
  86     my $rootnode = $self->process($d1);
  87     $d1->pr_tree($rootnode);
  88     $d1->free_tree($rootnode);
  89     $d1 = undef;
  90
  91     close ($th);
  92     $self->{testh} = undef;
  93
  94 }
  95
  96 # -----------------------------------------------------------------------------
  97 # Utility calls
  98 # -----------------------------------------------------------------------------
  99 sub readf {
 100     my ($self, $buff, $len, $offset) = @_;
 101     $buff = "";
 102     if ($self->{testh}) {
 103         return (read($self->{testh},$_[1],$len,$offset));
 104     } else {
 105         my $r = IDZebra::grs_perl_readf($self->{context},$len);
 106         if ($r > 0) {
 107             $buff = $self->{_buff};
 108             $self->{_buff} = undef;
 109         }
 110         return ($r);
 111     }
 112 }
 113
 114 sub readall {
 115     my ($self, $buffsize) = @_;
 116     my $r;
 117     my $res = "";
 118
 119     do {
 120         if ($self->{testh}) {
 121             $r = read($self->{testh}, $self->{_buff}, $buffsize);
 122         } else {
 123             $r = IDZebra::grs_perl_readf($self->{context},$buffsize);
 124         }
 125         if ($r > 0) {
 126             $res .= $self->{_buff};
 127             $self->{_buff} = undef;
 128         }
 129     } until ($r <= 0);
 130
 131     return ($res);
 132 }
 133
 134 sub seekf {
 135     my ($self, $offset) = @_;
 136     if ($self->{testh}) {
 137         # I'm not sure if offset is absolute or relative here...
 138         return (seek ($self->{testh}, $offset, $0));
 139     } else {
 140         return (IDZebra::grs_perl_seekf($self->{context},$offset)) ;
 141     }
 142 }
 143
 144 sub tellf {
 145     my ($self) = @_;
 146     if ($self->{testh}) {
 147         # Not implemented
 148     } else {
 149         return (IDZebra::grs_perl_seekf($self->{context}));
 150     }
 151 }
 152
 153 sub endf {
 154     my ($self, $offset) = @_;
 155     if ($self->{testh}) {
 156         # Not implemented
 157     } else {
 158         IDZebra::grs_perl_endf($self->{context},$offset);
 159     }
 160 }
 161
 162 __END__
 163
 164 =head1 NAME
 165
 166 IDZebra::Filter - A superclass of perl filters for Zebra
 167
 168 =head1 SYNOPSIS
 169
 170    package MyFilter;
 171
 172    use IDZebra::Filter;
 173    our @ISA=qw(IDZebra::Filter);
 174
 175    ...
 176
 177    sub init {
 178
 179    }
 180
 181    sub process {
 182        my ($self,$d1) = @_;
 183        my $rootnode=$d1->mk_root('meta');
 184        ...
 185        return ($rootnode)
 186    }
 187
 188 =head1 DESCRIPTION
 189
 190 When Zebra is trying to index/present a record, needs to extract information from it's source. For some types of input, "hardcoded" procedures are defined, but you have the opportunity to write your own filter code in Tcl or in perl.
 191
 192 The perl implementation is nothing but a package, deployed in some available location for Zebra (in I<profilePath>, or in PERL_INCLUDE (@INC)). This package is interpreted once needed, a filter object is created, armored with knowledge enough, to read data from the source, and to generate data1 structures as result. For each individual source "files" the  process method is called.
 193
 194 This class is supposed to be inherited in all perl filter classes, as it is providing a way of communication between the filter code and the indexing/retrieval process.
 195
 196 =head1 IMPLEMENTING FILTERS IN PERL
 197
 198 All you have to do is to create a perl package, using and inheriting this one (IDZebra::Filter), and implement the "process" method. The parameter of this call is an IDZebra::Data1 object, representing a data1 handle. You can use this, to reach the configuration information and to build your data structure. What you have to return is a data1 root node. To create it:
 199
 200    my $rootnode=$d1->mk_root('meta');
 201
 202 where 'meta' is the abstract syntax identifier (in this case Zebra will try to locate meta.abs, and apply it). Then just continue to build the structure. See i<IDZebra::Data1> for details.
 203
 204 In order to get the input stream, you can use "virtual" file operators (as the source is not necessairly a file):
 205
 206 =item readf($buf,$len,$offset)
 207
 208 Going to read $len bytes of data from offset $offset into $buff
 209
 210 =item readall($bufflen)
 211
 212 Read the entire stream, by reading $bufflen bytes at once
 213
 214 =item seekf($offset)
 215
 216 Position to $offset
 217
 218 =item tellf
 219
 220 Tells the current offset (?)
 221
 222 =item endf($offset)
 223
 224 ???
 225
 226 Optionally, you can implement an init call for your class. This call is not going to be called in object, but in class context. Stupid, eh?
 227
 228 =head1 TEST YOUR PERL FILTER
 229
 230 You can check the functionality of your filter code, by writing a small test program like
 231
 232
 233    use pod;
 234    $res =pod->test($ARGV[0],
 235                    (tabPath=>'.:../../tab:../../../yaz/tab'));
 236
 237 This will try to apply the filter on the file provided as argument, and display the generated data1 structure. However, there are some differences, when running a filter in test mode:
 238  - The include path is not applied from tabPath
 239  - the tellf, and endf functions are not implemented (just ignored)
 240
 241 =head1 CONFIGURE ZEBRA TO USE A PERL FILTER
 242
 243 This is quite simple. Read the Zebra manual, and follow the instructions to create your zebra.cfg. For your I<recordType> choose 'grs.perl.<YourFilterClass>'.
 244 Copy your filter module (YourFilterClass.pm) to a directory listed in I<profilePath>. i<profilePath> is added to @INC, when interpreting your package: so if you need to load modules from different locations than the default perl include path, just add these directories.
 245
 246 =head1 COPYRIGHT
 247
 248 Fill in
 249
 250 =head1 AUTHOR
 251
 252 Peter Popovics, pop@technomat.hu
 253
 254 =head1 SEE ALSO
 255
 256 IDZebra, IDZebra::Data1, Zebra documentation
 257
 258 =cut