perl/lib/IDZebra/Filter.pm

   1 #!/usr/bin/perl
   2 # ============================================================================
   3 # Zebra perl API header
   4 # =============================================================================
   5 use strict;
   6 use Carp;
   7 # ============================================================================
   8 package IDZebra::Filter;
   9 use IDZebra;
  10 use IDZebra::Data1;
  11 use IDZebra::Logger qw(:flags :calls);
  12 use Symbol qw(gensym);
  13 #use Devel::Leak;
  14
  15 our $SAFE_MODE = 1;
  16
  17 BEGIN {
  18     IDZebra::init(); # ??? Do we need that at all (this is jus nmem init...)
  19 }
  20
  21 1;
  22 # -----------------------------------------------------------------------------
  23 # Class constructor
  24 # -----------------------------------------------------------------------------
  25 sub new {
  26     my ($proto,$context) = @_;
  27     my $class = ref($proto) || $proto;
  28     my $self = {};
  29     $self->{context} = $context;
  30     bless ($self, $class);
  31     return ($self);
  32 }
  33
  34 # -----------------------------------------------------------------------------
  35 # Callbacks
  36 # -----------------------------------------------------------------------------
  37 sub _process {
  38     my ($self) = @_;
  39
  40 #    if ($self->{dl}) {
  41 #       print STDERR "LEAK",Devel::Leak::CheckSV($self->{dl}),"\n";
  42 #    }
  43
  44 #    print STDERR "LEAK",Devel::Leak::NoteSV($self->{dl}),"\n";
  45
  46     # This is ugly... could be passed as parameters... but didn't work.
  47     # I don't know why...
  48     my $dh  = IDZebra::grs_perl_get_dh($self->{context});
  49     my $mem = IDZebra::grs_perl_get_mem($self->{context});
  50     my $d1  = IDZebra::Data1->get($dh,$mem);
  51
  52     my $rootnode;
  53     if ($SAFE_MODE) {
  54         eval {$rootnode = $self->process($d1)};
  55         if ($@) {
  56             logf(LOG_WARN,"Error processing perl filter:%s\n",$@);
  57         }
  58     } else {
  59         $rootnode = $self->process($d1);
  60     }
  61     IDZebra::grs_perl_set_res($self->{context},$rootnode);
  62     return (0);
  63 }
  64
  65 sub _store_buff {
  66     my ($self, $buff) = @_;
  67     $self->{_buff} = $buff;
  68 }
  69
  70 # -----------------------------------------------------------------------------
  71 # API Template - These methods should be overriden by the implementing class.
  72 # -----------------------------------------------------------------------------
  73 sub init {
  74     # This one is called once, when the module is loaded. Not in
  75     # object context yet!!!
  76 }
  77
  78 sub process {
  79     my ($self, $d1) = @_;
  80     # Just going to return a root node.
  81     return ($d1->mk_root('empty'));
  82 }
  83
  84 # -----------------------------------------------------------------------------
  85 # Testing
  86 # -----------------------------------------------------------------------------
  87 sub test {
  88     my ($proto, $file, %args) = @_;
  89
  90     my $class = ref($proto) || $proto;
  91     my $self = {};
  92     bless ($self, $class);
  93
  94     my $th;
  95     open ($th, $file) || croak ("Cannot open $file");
  96
  97     $self->{testh} = $th;
  98
  99     my $m = IDZebra::nmem_create();
 100     my $d1=IDZebra::Data1->new($m,$IDZebra::DATA1_FLAG_XML);
 101     if ($args{tabPath}) { $d1->tabpath($args{tabPath}); }
 102     if ($args{tabRoot}) { $d1->tabroot($args{tabRoot}); }
 103
 104     my $rootnode = $self->process($d1);
 105     $d1->pr_tree($rootnode);
 106     $d1->free_tree($rootnode);
 107     $d1 = undef;
 108
 109     close ($th);
 110     $self->{testh} = undef;
 111
 112 }
 113
 114 # -----------------------------------------------------------------------------
 115 # Utility calls
 116 # -----------------------------------------------------------------------------
 117 sub readf {
 118     my ($self, $buff, $len, $offset) = @_;
 119     $buff = "";
 120     if ($self->{testh}) {
 121         return (read($self->{testh},$_[1],$len,$offset));
 122     } else {
 123         my $r = IDZebra::grs_perl_readf($self->{context},$len);
 124         if ($r > 0) {
 125             $buff = $self->{_buff};
 126             $self->{_buff} = undef;
 127         }
 128         return ($r);
 129     }
 130 }
 131
 132 sub readline {
 133     my ($self) = @_;
 134
 135     my $r = IDZebra::grs_perl_readline($self->{context});
 136     if ($r > 0) {
 137         my $buff = $self->{_buff};
 138         $self->{_buff} = undef;
 139         return ($buff);
 140     }
 141     return (undef);
 142 }
 143
 144 sub getc {
 145     my ($self) = @_;
 146     return(IDZebra::grs_perl_getc($self->{context}));
 147 }
 148
 149 sub get_fh {
 150     my ($self) = @_;
 151     my $fh = gensym;
 152     tie (*$fh,'IDZebra::FilterFile', $self);
 153     return ($fh);
 154 }
 155
 156 sub readall {
 157     my ($self, $buffsize) = @_;
 158     my $r;
 159     my $res = "";
 160
 161     do {
 162         if ($self->{testh}) {
 163             $r = read($self->{testh}, $self->{_buff}, $buffsize);
 164         } else {
 165             $r = IDZebra::grs_perl_readf($self->{context},$buffsize);
 166         }
 167         if ($r > 0) {
 168             $res .= $self->{_buff};
 169             $self->{_buff} = undef;
 170         }
 171     } until ($r <= 0);
 172
 173     return ($res);
 174 }
 175
 176 sub seekf {
 177     my ($self, $offset) = @_;
 178     if ($self->{testh}) {
 179         # I'm not sure if offset is absolute or relative here...
 180         return (seek ($self->{testh}, $offset, $0));
 181     } else {
 182         return (IDZebra::grs_perl_seekf($self->{context},$offset)) ;
 183     }
 184 }
 185
 186 sub tellf {
 187     my ($self) = @_;
 188     if ($self->{testh}) {
 189         # Not implemented
 190     } else {
 191         return (IDZebra::grs_perl_seekf($self->{context}));
 192     }
 193 }
 194
 195 sub endf {
 196     my ($self, $offset) = @_;
 197     if ($self->{testh}) {
 198         # Not implemented
 199     } else {
 200         IDZebra::grs_perl_endf($self->{context},$offset);
 201     }
 202 }
 203 # ----------------------------------------------------------------------------
 204 # The 'virtual' filehandle for zebra extract calls
 205 # ----------------------------------------------------------------------------
 206 package IDZebra::FilterFile;
 207 require Tie::Handle;
 208
 209 our @ISA = qw(Tie::Handle);
 210
 211 sub TIEHANDLE {
 212     my $class = shift;
 213     my $self = {};
 214     bless ($self, $class);
 215     $self->{filter} = shift;
 216     return ($self);
 217 }
 218
 219 sub READ {
 220     my $self = shift;
 221     return ($self->{filter}->readf(@_));
 222 }
 223
 224 sub READLINE {
 225     my $self = shift;
 226     return ($self->{filter}->readline());
 227 }
 228
 229 sub GETC {
 230     my $self = shift;
 231     return ($self->{filter}->getc());
 232 }
 233
 234 sub EOF {
 235     croak ("EOF not implemented");
 236 }
 237
 238 sub TELL {
 239     croak ("TELL not implemented");
 240 }
 241
 242 sub SEEK {
 243     croak ("SEEK not implemented");
 244 }
 245
 246 sub CLOSE {
 247     my $self = shift;
 248 }
 249
 250
 251 __END__
 252
 253 =head1 NAME
 254
 255 IDZebra::Filter - A superclass of perl filters for Zebra
 256
 257 =head1 SYNOPSIS
 258
 259    package MyFilter;
 260
 261    use IDZebra::Filter;
 262    our @ISA=qw(IDZebra::Filter);
 263
 264    ...
 265
 266    sub init {
 267
 268    }
 269
 270    sub process {
 271        my ($self,$d1) = @_;
 272        my $rootnode=$d1->mk_root('meta');
 273        ...
 274        return ($rootnode)
 275    }
 276
 277 =head1 DESCRIPTION
 278
 279 When Zebra is trying to index/present a record, needs to extract information from it's source. For some types of input, "hardcoded" procedures are defined, but you have the opportunity to write your own filter code in Tcl or in perl.
 280
 281 The perl implementation is nothing but a package, deployed in some available location for Zebra (in I<profilePath>, or in PERL_INCLUDE (@INC)). This package is interpreted once needed, a filter object is created, armored with knowledge enough, to read data from the source, and to generate data1 structures as result. For each individual source "files" the  process method is called.
 282
 283 This class is supposed to be inherited in all perl filter classes, as it is providing a way of communication between the filter code and the indexing/retrieval process.
 284
 285 =head1 IMPLEMENTING FILTERS IN PERL
 286
 287 All you have to do is to create a perl package, using and inheriting this one (IDZebra::Filter), and implement the "process" method. The parameter of this call is an IDZebra::Data1 object, representing a data1 handle. You can use this, to reach the configuration information and to build your data structure. What you have to return is a data1 root node. To create it:
 288
 289    my $rootnode=$d1->mk_root('meta');
 290
 291 where 'meta' is the abstract syntax identifier (in this case Zebra will try to locate meta.abs, and apply it). Then just continue to build the structure. See i<IDZebra::Data1> for details.
 292
 293 In order to get the input stream, you can use "virtual" file operators (as the source is not necessairly a file):
 294
 295 =item readf($buf,$len,$offset)
 296
 297 Going to read $len bytes of data from offset $offset into $buff
 298
 299 =item readall($bufflen)
 300
 301 Read the entire stream, by reading $bufflen bytes at once
 302
 303 =item seekf($offset)
 304
 305 Position to $offset
 306
 307 =item tellf
 308
 309 Tells the current offset (?)
 310
 311 =item endf($offset)
 312
 313 ???
 314
 315 Optionally, you can implement an init call for your class. This call is not going to be called in object, but in class context. Stupid, eh?
 316
 317 =head1 TEST YOUR PERL FILTER
 318
 319 You can check the functionality of your filter code, by writing a small test program like
 320
 321
 322    use pod;
 323    $res =pod->test($ARGV[0],
 324                    (tabPath=>'.:../../tab:../../../yaz/tab'));
 325
 326 This will try to apply the filter on the file provided as argument, and display the generated data1 structure. However, there are some differences, when running a filter in test mode:
 327  - The include path is not applied from tabPath
 328  - the tellf, and endf functions are not implemented (just ignored)
 329
 330 =head1 CONFIGURE ZEBRA TO USE A PERL FILTER
 331
 332 This is quite simple. Read the Zebra manual, and follow the instructions to create your zebra.cfg. For your I<recordType> choose 'grs.perl.<YourFilterClass>'.
 333 Copy your filter module (YourFilterClass.pm) to a directory listed in I<profilePath>. i<profilePath> is added to @INC, when interpreting your package: so if you need to load modules from different locations than the default perl include path, just add these directories.
 334
 335 =head1 COPYRIGHT
 336
 337 Fill in
 338
 339 =head1 AUTHOR
 340
 341 Peter Popovics, pop@technomat.hu
 342
 343 =head1 SEE ALSO
 344
 345 IDZebra, IDZebra::Data1, Zebra documentation
 346
 347 =cut