perl/lib/IDZebra/Filter.pm

   1 #!/usr/bin/perl
   2 # ============================================================================
   3 # Zebra perl API header
   4 # =============================================================================
   5 use strict;
   6 use Carp;
   7 # ============================================================================
   8 package IDZebra::Filter;
   9 use IDZebra;
  10 use IDZebra::Data1;
  11 use IDZebra::Logger qw(:flags :calls);
  12 use Symbol qw(gensym);
  13 #use Devel::Leak;
  14
  15 our $SAFE_MODE = 1;
  16
  17 BEGIN {
  18     IDZebra::init(); # ??? Do we need that at all (this is jus nmem init...)
  19 }
  20
  21 1;
  22 # -----------------------------------------------------------------------------
  23 # Class constructor
  24 # -----------------------------------------------------------------------------
  25 sub new {
  26     my ($proto,$context) = @_;
  27     my $class = ref($proto) || $proto;
  28     my $self = {};
  29     $self->{context} = $context;
  30     bless ($self, $class);
  31     return ($self);
  32 }
  33
  34 # -----------------------------------------------------------------------------
  35 # Callbacks
  36 # -----------------------------------------------------------------------------
  37 sub _process {
  38     my ($self) = @_;
  39
  40 #    if ($self->{dl}) {
  41 #       print STDERR "LEAK",Devel::Leak::CheckSV($self->{dl}),"\n";
  42 #    }
  43
  44 #    print STDERR "LEAK",Devel::Leak::NoteSV($self->{dl}),"\n";
  45
  46     # This is ugly... could be passed as parameters... but didn't work.
  47     # I don't know why...
  48     my $dh  = IDZebra::grs_perl_get_dh($self->{context});
  49     my $mem = IDZebra::grs_perl_get_mem($self->{context});
  50     my $d1  = IDZebra::Data1->get($dh,$mem);
  51
  52     my $rootnode;
  53     if ($SAFE_MODE) {
  54         eval {$rootnode = $self->process($d1)};
  55         if ($@) {
  56             logf(LOG_WARN,"Error processing perl filter:%s\n",$@);
  57         }
  58     } else {
  59         $rootnode = $self->process($d1);
  60     }
  61     IDZebra::grs_perl_set_res($self->{context},$rootnode);
  62     return (0);
  63 }
  64
  65 sub _store_buff {
  66     my ($self, $buff) = @_;
  67     $self->{_buff} = $buff;
  68 }
  69
  70 # -----------------------------------------------------------------------------
  71 # API Template - These methods should be overriden by the implementing class.
  72 # -----------------------------------------------------------------------------
  73 sub init {
  74     # This one is called once, when the module is loaded. Not in
  75     # object context yet!!!
  76 }
  77
  78 sub process {
  79     my ($self, $d1) = @_;
  80     # Just going to return a root node.
  81     return ($d1->mk_root('empty'));
  82 }
  83
  84 # -----------------------------------------------------------------------------
  85 # Testing
  86 # -----------------------------------------------------------------------------
  87 sub test {
  88     my ($proto, $file, %args) = @_;
  89
  90     my $class = ref($proto) || $proto;
  91     my $self = {};
  92     bless ($self, $class);
  93
  94     my $th;
  95     open ($th, $file) || croak ("Cannot open $file");
  96
  97     $self->{testh} = $th;
  98
  99     my $m = IDZebra::nmem_create();
 100     my $d1=IDZebra::Data1->new($m,$IDZebra::DATA1_FLAG_XML);
 101     if ($args{tabPath}) { $d1->tabpath($args{tabPath}); }
 102     if ($args{tabRoot}) { $d1->tabroot($args{tabRoot}); }
 103
 104     my $rootnode = $self->process($d1);
 105     $d1->pr_tree($rootnode);
 106     $d1->free_tree($rootnode);
 107     $d1 = undef;
 108
 109     close ($th);
 110     $self->{testh} = undef;
 111
 112 }
 113
 114 # -----------------------------------------------------------------------------
 115 # Utility calls
 116 # -----------------------------------------------------------------------------
 117 sub readf {
 118     my ($self, $buff, $len, $offset) = @_;
 119     $buff = "";
 120     if ($self->{testh}) {
 121         return (read($self->{testh},$_[1],$len,$offset));
 122     } else {
 123         my $r = IDZebra::grs_perl_readf($self->{context},$len);
 124         if ($r > 0) {
 125             $buff = $self->{_buff};
 126             $self->{_buff} = undef;
 127         }
 128         return ($r);
 129     }
 130 }
 131
 132 sub readline {
 133     my ($self) = @_;
 134
 135     my $r = IDZebra::grs_perl_readline($self->{context});
 136     if ($r > 0) {
 137         my $buff = $self->{_buff};
 138         $self->{_buff} = undef;
 139         return ($buff);
 140     }
 141     return (undef);
 142 }
 143
 144 sub getc {
 145     my ($self) = @_;
 146     return(IDZebra::grs_perl_getc($self->{context}));
 147 }
 148
 149 sub get_fh {
 150     my ($self) = @_;
 151     if ($self->{testh}) {
 152         return ($self->{testh});
 153     } else {
 154         my $fh = gensym;
 155         tie (*$fh,'IDZebra::FilterFile', $self);
 156         return ($fh);
 157     }
 158 }
 159
 160 sub readall {
 161     my ($self, $buffsize) = @_;
 162     my $r;
 163     my $res = "";
 164
 165     do {
 166         if ($self->{testh}) {
 167             $r = read($self->{testh}, $self->{_buff}, $buffsize);
 168         } else {
 169             $r = IDZebra::grs_perl_readf($self->{context},$buffsize);
 170         }
 171         if ($r > 0) {
 172             $res .= $self->{_buff};
 173             $self->{_buff} = undef;
 174         }
 175     } until ($r <= 0);
 176
 177     return ($res);
 178 }
 179
 180 sub seekf {
 181     my ($self, $offset) = @_;
 182     if ($self->{testh}) {
 183         # I'm not sure if offset is absolute or relative here...
 184         return (seek ($self->{testh}, $offset, $0));
 185     } else {
 186         return (IDZebra::grs_perl_seekf($self->{context},$offset)) ;
 187     }
 188 }
 189
 190 sub tellf {
 191     my ($self) = @_;
 192     if ($self->{testh}) {
 193         # Not implemented
 194     } else {
 195         return (IDZebra::grs_perl_seekf($self->{context}));
 196     }
 197 }
 198
 199 sub endf {
 200     my ($self, $offset) = @_;
 201     if ($self->{testh}) {
 202         # Not implemented
 203     } else {
 204         IDZebra::grs_perl_endf($self->{context},$offset);
 205     }
 206 }
 207 # ----------------------------------------------------------------------------
 208 # The 'virtual' filehandle for zebra extract calls
 209 # ----------------------------------------------------------------------------
 210 package IDZebra::FilterFile;
 211 require Tie::Handle;
 212
 213 our @ISA = qw(Tie::Handle);
 214
 215 sub TIEHANDLE {
 216     my $class = shift;
 217     my $self = {};
 218     bless ($self, $class);
 219     $self->{filter} = shift;
 220     return ($self);
 221 }
 222
 223 sub READ {
 224     my $self = shift;
 225     return ($self->{filter}->readf(@_));
 226 }
 227
 228 sub READLINE {
 229     my $self = shift;
 230     return ($self->{filter}->readline());
 231 }
 232
 233 sub GETC {
 234     my $self = shift;
 235     return ($self->{filter}->getc());
 236 }
 237
 238 sub EOF {
 239     croak ("EOF not implemented");
 240 }
 241
 242 sub TELL {
 243     croak ("TELL not implemented");
 244 }
 245
 246 sub SEEK {
 247     croak ("SEEK not implemented");
 248 }
 249
 250 sub CLOSE {
 251     my $self = shift;
 252 }
 253
 254
 255 __END__
 256
 257 =head1 NAME
 258
 259 IDZebra::Filter - A superclass of perl filters for Zebra
 260
 261 =head1 SYNOPSIS
 262
 263    package MyFilter;
 264
 265    use IDZebra::Filter;
 266    our @ISA=qw(IDZebra::Filter);
 267
 268    ...
 269
 270    sub init {
 271
 272    }
 273
 274    sub process {
 275        my ($self,$d1) = @_;
 276        my $rootnode=$d1->mk_root('meta');
 277        ...
 278        return ($rootnode)
 279    }
 280
 281 =head1 DESCRIPTION
 282
 283 When Zebra is trying to index/present a record, needs to extract information from it's source. For some types of input, "hardcoded" procedures are defined, but you have the opportunity to write your own filter code in Tcl or in perl.
 284
 285 The perl implementation is nothing but a package, deployed in some available location for Zebra (in I<profilePath>, or in PERL_INCLUDE (@INC)). This package is interpreted once needed, a filter object is created, armored with knowledge enough, to read data from the source, and to generate data1 structures as result. For each individual source "files" the  process method is called.
 286
 287 This class is supposed to be inherited in all perl filter classes, as it is providing a way of communication between the filter code and the indexing/retrieval process.
 288
 289 =head1 IMPLEMENTING FILTERS IN PERL
 290
 291 All you have to do is to create a perl package, using and inheriting this one (IDZebra::Filter), and implement the "process" method. The parameter of this call is an IDZebra::Data1 object, representing a data1 handle. You can use this, to reach the configuration information and to build your data structure. What you have to return is a data1 root node. To create it:
 292
 293    my $rootnode=$d1->mk_root('meta');
 294
 295 where 'meta' is the abstract syntax identifier (in this case Zebra will try to locate meta.abs, and apply it). Then just continue to build the structure. See i<IDZebra::Data1> for details.
 296
 297 In order to get the input stream, you can use "virtual" file operators (as the source is not necessairly a file):
 298
 299 =over 4
 300
 301 =item B<readf($buf,$len,$offset)>
 302
 303 Going to read $len bytes of data from offset $offset into $buff
 304
 305 =item B<readline()>
 306
 307 Read one line
 308
 309 =item B<getc()>
 310
 311 Get one character (byte)
 312
 313 =item B<readall($bufflen)>
 314
 315 Read the entire stream, by reading $bufflen bytes at once
 316
 317 =item B<seekf($offset)>
 318
 319 Position to $offset
 320
 321 =item B<tellf()>
 322
 323 Tells the current offset (?)
 324
 325 =item B<endf($offset)>
 326
 327 ???
 328
 329 =back
 330
 331 You can optionally get a virtual perl filehandle as well:
 332
 333   my $fh = $self->get_fh();
 334   while (<$fh>) {
 335     # ...
 336   }
 337
 338 Note, that the virtual filehandle implementation is not finished yet, so some applications may have problems using that. See TODO.
 339
 340 You can implement an init call for your class. This call is not going to be called in object, but in class context. Stupid, eh?
 341
 342 =head1 TEST YOUR PERL FILTER
 343
 344 You can check the functionality of your filter code, by writing a small test program like
 345
 346
 347    use pod;
 348    $res =pod->test($ARGV[0],
 349                    (tabPath=>'.:../../tab:../../../yaz/tab'));
 350
 351 This will try to apply the filter on the file provided as argument, and display the generated data1 structure. However, there are some differences, when running a filter in test mode:
 352  - The include path is not applied from tabPath
 353  - the tellf, and endf functions are not implemented (just ignored)
 354
 355 =head1 CONFIGURE ZEBRA TO USE A PERL FILTER
 356
 357 This is quite simple. Read the Zebra manual, and follow the instructions to create your zebra.cfg. For your I<recordType> choose 'grs.perl.<YourFilterClass>'.
 358 Copy your filter module (YourFilterClass.pm) to a directory listed in I<profilePath>. i<profilePath> is added to @INC, when interpreting your package: so if you need to load modules from different locations than the default perl include path, just add these directories.
 359
 360 =head1 MISC OPTIONS
 361
 362 By default, filter code (process method) is executed within an eval {} block, and only a warning is sent to the log, if there is an error. To turn this option off, set B<$IDZebra::Filter::SAFE_MODE> to B<0>;
 363
 364 =head1 TODO
 365
 366 Finish virtual (tied) filehandle methods (SEEK, EOF, TELL);
 367
 368 =head1 COPYRIGHT
 369
 370 Fill in
 371
 372 =head1 AUTHOR
 373
 374 Peter Popovics, pop@technomat.hu
 375
 376 =head1 SEE ALSO
 377
 378 IDZebra, IDZebra::Data1, Zebra documentation
 379
 380 =cut