perl/lib/IDZebra/Filter.pm

   1 #!/usr/bin/perl
   2 # ============================================================================
   3 # Zebra perl API header
   4 # =============================================================================
   5 use strict;
   6 use Carp;
   7 # ============================================================================
   8 package IDZebra::Filter;
   9 use IDZebra;
  10 use IDZebra::Data1;
  11 use IDZebra::Logger qw(:flags :calls);
  12 use Devel::Leak;
  13
  14 our $SAFE_MODE = 1;
  15
  16 BEGIN {
  17     IDZebra::init(); # ??? Do we need that at all (this is jus nmem init...)
  18 }
  19
  20 1;
  21 # -----------------------------------------------------------------------------
  22 # Class constructor
  23 # -----------------------------------------------------------------------------
  24 sub new {
  25     my ($proto,$context) = @_;
  26     my $class = ref($proto) || $proto;
  27     my $self = {};
  28     $self->{context} = $context;
  29     bless ($self, $class);
  30     return ($self);
  31 }
  32
  33 # -----------------------------------------------------------------------------
  34 # Callbacks
  35 # -----------------------------------------------------------------------------
  36 sub _process {
  37     my ($self) = @_;
  38
  39 #    if ($self->{dl}) {
  40 #       print STDERR "LEAK",Devel::Leak::CheckSV($self->{dl}),"\n";
  41 #    }
  42
  43 #    print STDERR "LEAK",Devel::Leak::NoteSV($self->{dl}),"\n";
  44
  45     # This is ugly... could be passed as parameters... but didn't work.
  46     # I don't know why...
  47     my $dh  = IDZebra::grs_perl_get_dh($self->{context});
  48     my $mem = IDZebra::grs_perl_get_mem($self->{context});
  49     my $d1  = IDZebra::Data1->get($dh,$mem);
  50
  51     my $rootnode;
  52     if ($SAFE_MODE) {
  53         eval {$rootnode = $self->process($d1)};
  54         if ($@) {
  55             logf(LOG_WARN,"Error processing perl filter:%s\n",$@);
  56         }
  57     } else {
  58         $rootnode = $self->process($d1);
  59     }
  60     IDZebra::grs_perl_set_res($self->{context},$rootnode);
  61     return (0);
  62 }
  63
  64 sub _store_buff {
  65     my ($self, $buff) = @_;
  66     $self->{_buff} = $buff;
  67 }
  68
  69 # -----------------------------------------------------------------------------
  70 # API Template - These methods should be overriden by the implementing class.
  71 # -----------------------------------------------------------------------------
  72 sub init {
  73     # This one is called once, when the module is loaded. Not in
  74     # object context yet!!!
  75 }
  76
  77 sub process {
  78     my ($self, $d1) = @_;
  79     # Just going to return a root node.
  80     return ($d1->mk_root('empty'));
  81 }
  82
  83 # -----------------------------------------------------------------------------
  84 # Testing
  85 # -----------------------------------------------------------------------------
  86 sub test {
  87     my ($proto, $file, %args) = @_;
  88
  89 #    print "Proto:$proto\n";
  90
  91     my $class = ref($proto) || $proto;
  92     my $self = {};
  93     bless ($self, $class);
  94
  95     my $th;
  96     open ($th, $file) || croak ("Cannot open $file");
  97
  98     $self->{testh} = $th;
  99
 100     my $m = IDZebra::nmem_create();
 101     my $d1=IDZebra::Data1->new($m,$IDZebra::DATA1_FLAG_XML);
 102     if ($args{tabPath}) { $d1->tabpath($args{tabPath}); }
 103     if ($args{tabRoot}) { $d1->tabroot($args{tabRoot}); }
 104
 105     my $rootnode = $self->process($d1);
 106     $d1->pr_tree($rootnode);
 107     $d1->free_tree($rootnode);
 108     $d1 = undef;
 109
 110     close ($th);
 111     $self->{testh} = undef;
 112
 113 }
 114
 115 # -----------------------------------------------------------------------------
 116 # Utility calls
 117 # -----------------------------------------------------------------------------
 118 sub readf {
 119     my ($self, $buff, $len, $offset) = @_;
 120     $buff = "";
 121     if ($self->{testh}) {
 122         return (read($self->{testh},$_[1],$len,$offset));
 123     } else {
 124         my $r = IDZebra::grs_perl_readf($self->{context},$len);
 125         if ($r > 0) {
 126             $buff = $self->{_buff};
 127             $self->{_buff} = undef;
 128         }
 129         return ($r);
 130     }
 131 }
 132
 133 sub readall {
 134     my ($self, $buffsize) = @_;
 135     my $r;
 136     my $res = "";
 137
 138     do {
 139         if ($self->{testh}) {
 140             $r = read($self->{testh}, $self->{_buff}, $buffsize);
 141         } else {
 142             $r = IDZebra::grs_perl_readf($self->{context},$buffsize);
 143         }
 144         if ($r > 0) {
 145             $res .= $self->{_buff};
 146             $self->{_buff} = undef;
 147         }
 148     } until ($r <= 0);
 149
 150     return ($res);
 151 }
 152
 153 sub seekf {
 154     my ($self, $offset) = @_;
 155     if ($self->{testh}) {
 156         # I'm not sure if offset is absolute or relative here...
 157         return (seek ($self->{testh}, $offset, $0));
 158     } else {
 159         return (IDZebra::grs_perl_seekf($self->{context},$offset)) ;
 160     }
 161 }
 162
 163 sub tellf {
 164     my ($self) = @_;
 165     if ($self->{testh}) {
 166         # Not implemented
 167     } else {
 168         return (IDZebra::grs_perl_seekf($self->{context}));
 169     }
 170 }
 171
 172 sub endf {
 173     my ($self, $offset) = @_;
 174     if ($self->{testh}) {
 175         # Not implemented
 176     } else {
 177         IDZebra::grs_perl_endf($self->{context},$offset);
 178     }
 179 }
 180
 181 __END__
 182
 183 =head1 NAME
 184
 185 IDZebra::Filter - A superclass of perl filters for Zebra
 186
 187 =head1 SYNOPSIS
 188
 189    package MyFilter;
 190
 191    use IDZebra::Filter;
 192    our @ISA=qw(IDZebra::Filter);
 193
 194    ...
 195
 196    sub init {
 197
 198    }
 199
 200    sub process {
 201        my ($self,$d1) = @_;
 202        my $rootnode=$d1->mk_root('meta');
 203        ...
 204        return ($rootnode)
 205    }
 206
 207 =head1 DESCRIPTION
 208
 209 When Zebra is trying to index/present a record, needs to extract information from it's source. For some types of input, "hardcoded" procedures are defined, but you have the opportunity to write your own filter code in Tcl or in perl.
 210
 211 The perl implementation is nothing but a package, deployed in some available location for Zebra (in I<profilePath>, or in PERL_INCLUDE (@INC)). This package is interpreted once needed, a filter object is created, armored with knowledge enough, to read data from the source, and to generate data1 structures as result. For each individual source "files" the  process method is called.
 212
 213 This class is supposed to be inherited in all perl filter classes, as it is providing a way of communication between the filter code and the indexing/retrieval process.
 214
 215 =head1 IMPLEMENTING FILTERS IN PERL
 216
 217 All you have to do is to create a perl package, using and inheriting this one (IDZebra::Filter), and implement the "process" method. The parameter of this call is an IDZebra::Data1 object, representing a data1 handle. You can use this, to reach the configuration information and to build your data structure. What you have to return is a data1 root node. To create it:
 218
 219    my $rootnode=$d1->mk_root('meta');
 220
 221 where 'meta' is the abstract syntax identifier (in this case Zebra will try to locate meta.abs, and apply it). Then just continue to build the structure. See i<IDZebra::Data1> for details.
 222
 223 In order to get the input stream, you can use "virtual" file operators (as the source is not necessairly a file):
 224
 225 =item readf($buf,$len,$offset)
 226
 227 Going to read $len bytes of data from offset $offset into $buff
 228
 229 =item readall($bufflen)
 230
 231 Read the entire stream, by reading $bufflen bytes at once
 232
 233 =item seekf($offset)
 234
 235 Position to $offset
 236
 237 =item tellf
 238
 239 Tells the current offset (?)
 240
 241 =item endf($offset)
 242
 243 ???
 244
 245 Optionally, you can implement an init call for your class. This call is not going to be called in object, but in class context. Stupid, eh?
 246
 247 =head1 TEST YOUR PERL FILTER
 248
 249 You can check the functionality of your filter code, by writing a small test program like
 250
 251
 252    use pod;
 253    $res =pod->test($ARGV[0],
 254                    (tabPath=>'.:../../tab:../../../yaz/tab'));
 255
 256 This will try to apply the filter on the file provided as argument, and display the generated data1 structure. However, there are some differences, when running a filter in test mode:
 257  - The include path is not applied from tabPath
 258  - the tellf, and endf functions are not implemented (just ignored)
 259
 260 =head1 CONFIGURE ZEBRA TO USE A PERL FILTER
 261
 262 This is quite simple. Read the Zebra manual, and follow the instructions to create your zebra.cfg. For your I<recordType> choose 'grs.perl.<YourFilterClass>'.
 263 Copy your filter module (YourFilterClass.pm) to a directory listed in I<profilePath>. i<profilePath> is added to @INC, when interpreting your package: so if you need to load modules from different locations than the default perl include path, just add these directories.
 264
 265 =head1 COPYRIGHT
 266
 267 Fill in
 268
 269 =head1 AUTHOR
 270
 271 Peter Popovics, pop@technomat.hu
 272
 273 =head1 SEE ALSO
 274
 275 IDZebra, IDZebra::Data1, Zebra documentation
 276
 277 =cut